diff --git a/.clang-format b/.clang-format index d34381a554072a46620c8f2e8264661388236b98..8c8ff47073a5abfa9fde8b5d8a7331e4c012a8fa 100644 --- a/.clang-format +++ b/.clang-format @@ -93,7 +93,7 @@ PenaltyBreakTemplateDeclaration: 10 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Right -ReflowComments: false +ReflowComments: true SortIncludes: true SortUsingDeclarations: true SpaceAfterCStyleCast: false diff --git a/CHANGELOG.md b/CHANGELOG.md index 925bc9d28762a9628e3c08f012f8435abfcd4d0c..85c1650d1261804644858c11312532f88ea3e3b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,13 +15,53 @@ documented in this file. ### Fixed -- LDPC decoding (`armral_ldpc_decode_block`) now achieves the expected error - correction performance in the presence of channel noise. The function now uses - `int16_t` internally rather than `int8_t`, which can be slower for certain - input sizes. - ### Security +## [25.01] - 2025-01-23 + +### Added + +- Added the functions `armral_turbo_decode_batch` and + `armral_turbo_decode_batch_noalloc`. These functions implement a maximum a + posteriori (MAP) algorithm to decode the output of the LTE Turbo encoding + scheme on a batch of encoded data. + +- Added the function `armral_turbo_decode_batch_noalloc_buffer_size` which + returns the size of the buffer required for `armral_turbo_decode_batch_noalloc`. + +### Changed + +- Updated all copyright headers, and the text in + [LICENSE.md](https://gitlab.arm.com/networking/ral/-/blob/main/LICENSE.md), + to include the `BSD-3-Clause` SPDX License Identifier. + +- Improved Neon and SVE performance of `armral_fft_execute_cf32` and + `armral_fft_execute_cs16`. + +- The LTE Turbo coding Additive White Gaussian Noise (AWGN) simulation now + supports the decoding of batches of data, using `armral_turbo_decode_batch`. + The number of batches is specified using the flag "`-b <n>`". 
+ +- FFT lengths up to 42012 are now supported, although lengths greater + than 4096 are mostly untested. + +### Removed + +- Unused FFT kernels have been removed. + +### Fixed + +- Improved error correction of LDPC decoding (`armral_ldpc_decode_block`) in + the presence of channel noise. The function now uses 16-bit signed integers + internally rather than 8-bit signed integers. This may result in decreased + performance. + +- The arguments to the function `armral_turbo_decode_block_noalloc_buffer_size` + have been changed to remove the unused second argument, `max_iter`. + +- When planning FFTs with an unsupported length, `armral_fft_create_plan_cf32` + and `armral_fft_create_plan_cs16` now return `ARMRAL_ARGUMENT_ERROR`. + ## [24.10] - 2024-10-17 ### Added diff --git a/CMakeLists.txt b/CMakeLists.txt index 1124e6031e4c8fde73fa72202a4a62168ecdef15..095548a00bf2664336fd4b0cc23f3df02154a86c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.3) -project(armral VERSION 24.10) +project(armral VERSION 25.01) if(CMAKE_VERSION VERSION_GREATER 3.4) # Stop CMake from automatically adding -rdynamic to linker flags because it @@ -71,13 +71,14 @@ set(ARMRAL_LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uun.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c - 
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp @@ -99,10 +100,10 @@ set(ARMRAL_LIB_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_encoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_rate_matching.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/arm_ldpc_rate_recovery.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp @@ -561,11 +562,15 @@ if(BUILD_TESTING) test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) add_armral_test(polar_subchannel_interleave test/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_test(turbo_decoding_batch + test/UpperPHY/Turbo/Batch/Decoding/main.cpp) + add_armral_test(turbo_decoding test/UpperPHY/Turbo/Single/Decoding/main.cpp) + add_armral_test(turbo_encoding test/UpperPHY/Turbo/Single/Encoding/main.cpp) 
add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) - add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) - add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) + add_armral_test(turbo_rate_matching + test/UpperPHY/Turbo/Single/RateMatching/main.cpp) + add_armral_test(turbo_rate_recovery + test/UpperPHY/Turbo/Single/RateRecovery/main.cpp) add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) add_armral_bench( @@ -763,12 +768,14 @@ if(BUILD_TESTING) bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) add_armral_bench(polar_subchannel_interleave bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) - add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) - add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) + add_armral_bench(turbo_decoding_batch + bench/UpperPHY/Turbo/Batch/Decoding/main.cpp) + add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Single/Decoding/main.cpp) + add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Single/Encoding/main.cpp) add_armral_bench(turbo_rate_matching - bench/UpperPHY/Turbo/RateMatching/main.cpp) + bench/UpperPHY/Turbo/Single/RateMatching/main.cpp) add_armral_bench(turbo_rate_recovery - bench/UpperPHY/Turbo/RateRecovery/main.cpp) + bench/UpperPHY/Turbo/Single/RateRecovery/main.cpp) add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index c5c9cbc1e7e3295d54fd24aa4ed50ab1d88ae24b..646faa55a375c68820c4ba319d50cc9556c0ccdc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,9 +12,9 @@ Acceleration Library (ArmRAL): ## Licensing information -Use of ArmRAL is subject to a BSD-3-Clause license, the text of which can be -found in the `LICENSE.md` file in your product installation. We will receive -inbound contributions under the same license. +Use of ArmRAL is subject to a BSD-3-Clause license. 
See the `LICENSE.md` file +in your product installation for the license text. We will receive inbound +contributions under the same license. ## Writing and submitting patches @@ -258,7 +258,7 @@ C/C++ code style is maintained through the use of `clang-format` and patch; instructions on how to run these tools are given below. `clang-format` and `clang-tidy` are part of the [LLVM -Project](https://llvm.org/). ArmRAL is tested with version 17.0.0 of +Project](https://llvm.org/). ArmRAL is tested with version 17.0.4 of the tools. Matching your coding style as close as possible to the `clang-tidy` @@ -451,7 +451,9 @@ The following code block provides a template for the `bench.py` script. ```py #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path @@ -517,7 +519,9 @@ The following code block provides a basic template. ```cpp /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/Doxyfile.in b/Doxyfile.in index 0f95a40e8d54db53b3e80d33654497e75cb07e72..aace5314a8c80356dadf535167bba74d50351797 100644 --- a/Doxyfile.in +++ b/Doxyfile.in @@ -38,7 +38,7 @@ PROJECT_NAME = "Arm RAN Acceleration Library Reference Guide" # could be handy for archiving the generated documentation or if some version # control system is used. 
-PROJECT_NUMBER = "24.10" +PROJECT_NUMBER = "25.01" # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a diff --git a/LICENSE.md b/LICENSE.md index e511299cc09fd0ca2dc106e40a455371dabe087a..046ac981b08bada7f2aa036f862b569f38afb27f 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,4 +1,6 @@ -SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +affiliates +SPDX-License-Identifier: BSD-3-Clause Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/README.md b/README.md index 6c334d56f04970a084ed291ca77f3957ed106aa4..bf005ff37ba5f2f3e1e239b0ca17a2137a53de98 100644 --- a/README.md +++ b/README.md @@ -31,9 +31,10 @@ and the `examples` directory contains the examples. * Ensure you have installed all the tools listed in the **Tools** section of the `RELEASE_NOTES.md` file. -* To use the Cyclic Redundancy Check (CRC) functions, you must run the library - on a core that supports the AArch64 PMULL extension. If your machine supports - the PMULL extension, pmull is listed under the **Features** list given in the +* To use the Cyclic Redundancy Check (CRC) functions, the Gold sequence + generator, and the convolutional encoder, you must run the library on a core + that supports the AArch64 PMULL extension. If your machine supports the PMULL + extension, pmull is listed under the **Features** list given in the `/proc/cpuinfo` file. ## Build ArmRAL @@ -396,7 +397,7 @@ file. The Arm RAN Acceleration Library Reference Guide is available online at: - https://developer.arm.com/documentation/102249/2410 + https://developer.arm.com/documentation/102249/2501 If you have Doxygen installed on your system, you can build a local HTML version of the ArmRAL documentation using CMake. 
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 1c73f12ebcb28e10dfe3d2d4a84c5de59e7ebd4e..27b7cfe260b92cb898420543a1ed2b2d0728ce38 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,7 +1,7 @@ -# Arm RAN Acceleration Library 24.10 Release Notes +# Arm RAN Acceleration Library 25.01 Release Notes Non-Confidential -Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. +Copyright © 2020-2025 Arm Limited (or its affiliates). All rights reserved. Arm conventions and proprietary notices, including confidentiality status, terminology statement, and product release status, can be found at the end of @@ -14,14 +14,14 @@ These Release Notes contain the following sections: - Release overview - Release contents - Support -- Release History +- Release history - Conventions - Proprietary notices ## Release overview -The following sections describe the product to which these release notes relate -and its quality status at time of release. +This section describes the product to which these release notes relate and +provides information about its license. ### Product description @@ -43,13 +43,13 @@ ArmRAL includes functions that operate on 16-bit signed integers and 16-bit and ### Release status -This is the 24.10 release of ArmRAL. +This is the 25.01 release of ArmRAL. ### Licensing information -Use of ArmRAL is subject to a BSD-3-Clause license, the text of which can be -found in the `LICENSE.md` file in your product installation. We will receive -inbound contributions under the same license. +Use of ArmRAL is subject to a BSD-3-Clause license. See the `LICENSE.md` file +in your product installation for the license text. We will receive inbound +contributions under the same license. If you require a different license than BSD-3-Clause for compatibility with your end product, please get in contact via including @@ -59,7 +59,7 @@ end product, please get in contact via including ArmRAL releases contain documentation and source files. 
-The following subsections describe: +This section describes: - Cloning the product's git repository from Arm's GitLab - The contents of this release @@ -73,18 +73,18 @@ ArmRAL is available on **To access this release, clone the following repository using HTTPS:** - git clone -b armral-24.10 https://git.gitlab.arm.com/networking/ral + git clone -b armral-25.01 https://git.gitlab.arm.com/networking/ral ### Deliverables The downloaded product includes the following deliverables: -- ArmRAL 24.10 +- ArmRAL 25.01 - Release Notes (this document) - Documentation Product documentation is available on the -[Arm Developer website](https://developer.arm.com/documentation/102249/2410). +[Arm Developer website](https://developer.arm.com/documentation/102249/2501). **Note:** Documentation, errata and release notes might change between product releases. For the latest documentation bundle, check the product download @@ -92,7 +92,7 @@ page. ### Differences from previous release -The following subsections describe differences from the previous release of +The following sections describe differences from the previous release of ArmRAL. #### Additions and functionality changes @@ -100,50 +100,51 @@ ArmRAL. This section describes new features or any technical changes to features or components in this release. -- Added the function `armral_turbo_perm_idx_init` which generates all - permutation indices used in the permutation step of LTE Turbo decoding. +- The functions `armral_turbo_decode_batch` and + `armral_turbo_decode_batch_noalloc` have been added. These functions implement + a maximum a posteriori (MAP) algorithm to decode the output of the LTE Turbo + encoding scheme on a batch of encoded data. -- The interfaces for `armral_turbo_decode_block` and - `armral_turbo_decode_block_noalloc` now have an additional argument. 
They now - include the option to supply a user-allocated buffer which, if used, must be - initialized with permutation indices by calling - `armral_turbo_perm_idx_init`. This buffer can then be reused in subsequent - calls to the Turbo decoding functions and will improve their performance by - removing the need to compute the indices on each call. If the buffer is not - initialized and a null pointer is passed instead, the functions will recompute - the permutation indices on every call. +- The function `armral_turbo_decode_batch_noalloc_buffer_size` has been added, + which returns the size of the buffer required for + `armral_turbo_decode_batch_noalloc`. -- Added the function `armral_cmplx_matmul_i16_noalloc` which multiplies two - matrices of complex Q15 values using a 64-bit Q32.31 accumulator. This - function does not call any system memory allocators, unlike the existing - `armral_cmplx_matmul_i16` function. +- FFT lengths up to 42012 are now supported, although lengths greater + than 4096 are mostly untested. + +- Unused FFT kernels have been removed. #### Performance improvements This section describes any features or components with improved performance. -- Performance improvements for the following routines: +- Neon and SVE performance improvements for the following routines: - - `armral_fft_execute_cf32` and `armral_fft_execute_cs16`. Cases which were - calculated using recursive calls to Rader's algorithm are now calculated - using Bluestein's algorithm. + - `armral_fft_execute_cf32` and `armral_fft_execute_cs16`. #### Changes to simulation programs This section describes any changes, new features or components added to the channel simulation programs in this release. -- There are no changes to the channel simulation programs in this - release. +- The LTE Turbo coding Additive White Gaussian Noise (AWGN) simulation now + supports the decoding of batches of data, using `armral_turbo_decode_batch`. + The number of batches is specified using the flag "`-b <n>`". 
#### Resolved issues This section describes any known issues resolved in the current release. -- Fixed performance regressions in the SVE versions of the following routines: +- Improved error correction of LDPC decoding (`armral_ldpc_decode_block`) in + the presence of channel noise. The function now uses 16-bit signed integers + internally rather than 8-bit signed integers. This may result in decreased + performance. + +- The arguments to the function `armral_turbo_decode_block_noalloc_buffer_size` + have been changed to remove the unused second argument, `max_iter`. - - `armral_cmplx_vecdot_f32` - - `armral_cmplx_vecmul_f32_2` +- When planning FFTs with an unsupported length, `armral_fft_create_plan_cf32` + and `armral_fft_create_plan_cs16` now return `ARMRAL_ARGUMENT_ERROR`. ### Known limitations @@ -174,7 +175,7 @@ To build or run ArmRAL you will need: Additionally: - To run the benchmarks, you must have the Linux utility tool `perf` installed - and a recent version of Python 3. ArmRAL has been tested with Python 3.8.5. + and a recent version of Python 3. ArmRAL has been tested with Python 3.11.11. - To build a local version of the documentation, you must have Doxygen installed. ArmRAL has been tested with Doxygen version 1.8.13. @@ -194,7 +195,7 @@ ArmRAL's release history is available on the [Arm Developer website](https://dev ## Conventions -The following subsections describe conventions used in Arm documents. +The following sections describe conventions used in Arm documents. ### Glossary @@ -255,7 +256,7 @@ rights reserved. Other brands and names mentioned in this document may be the trademarks of their respective owners. Please follow Arm’s trademark usage guidelines at . -Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. +Copyright © 2020-2025 Arm Limited (or its affiliates). All rights reserved. Arm Limited. Company 02557590 registered in England. 110 Fulbourn Road, Cambridge, England CB1 9NJ. 
diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..c50c8ec2a62a5ee238f7d4396d8b234b873f2b5e --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,80 @@ +# Reporting vulnerabilities + +Arm takes security issues seriously and welcomes feedback from researchers and +the security community in order to improve the security of its products and +services. We operate a coordinated disclosure policy for disclosing +vulnerabilities and other security issues. + +Security issues can be complex and one single timescale doesn't fit all +circumstances. We will make best endeavours to inform you when we expect +security notifications and fixes to be available and facilitate coordinated +disclosure when notifications and patches/mitigations are available. + +## Report + +For all security issues, contact Arm by email at [psirt@arm.com](mailto:psirt@arm.com). + +## Secure submission using PGP + +We support and encourage secure submission of vulnerability reports using PGP, +using the key below. If you would like replies to be encrypted, please provide +your own public key through a secure mechanism. 
+ +``` +-----BEGIN PGP PUBLIC KEY BLOCK----- +Comment: CB33 9CE6 994B C71F 2430 9E59 6223 980B 410D E67E +Comment: Arm PSIRT + +xsFNBGXKO/4BEADR0bGtT8vZwLM+8b5bgCI7lf4hMctfVNFR4SuMd792GVUb/iwc +PXAbyuHkY1Qi8PzSmrkTEop+p+lXloI9G6jT2+OpVJ7d38b6LpMVZ4rhaQQEAp6p +sChTZfv4f1I/10kGpYQGOQB71GORwbPKr7QUv3XWCNNjkBFFTrXO1XdqkkVrlGo6 +27RP3r5LjYfEgd9eO7rfgYiuRfozTNrxyJPM96Miq1N/PISPmxk4v/5/dkUX43dW +UeUajOxrYqakABImiP4RNWicKBR8wJpFxKtxG/29KB1SHOaYxmNaPtED9uAWyHUL +WYYgh0p1XhfRX48tnGS41RZEvnXvZdbakfvbK78TVJdiLzPstjeTV664zH4EOHvM +mi8YlIH16+uk4RqQ163eVvA52bCVU1YUDg1AhwrSDBse+brJ6qIEa7W8oqsM+dc3 +tw8eZn3K/ClAvYjEcfXpa8rQYa09VaIgCSMi61VGl2oPmO1AuMx7Osqe8Pb6fV3P ++lWgLRR10GciIlsYm9m0+D8Hotr9JEsmzXfJhw02LLzvRQUDMWY7/1X+arSboL+R +z95uEf8EUzh330NveTHwYA4KCvJvO5vSPj/0R+7bs/4dfXieWWMwQqEZC/WBOhyR +756rzN/JdQpFRl5Z5hwWByRndSbqCuSFyTw2rXK2ju34eCwveUo94CDQ8wARAQAB +zRlBcm0gUFNJUlQgPHBzaXJ0QGFybS5jb20+wsGUBBMBCAA+FiEEyzOc5plLxx8k +MJ5ZYiOYC0EN5n4FAmXKO/4CGwMFCQeGH1wFCwkIBwIGFQoJCAsCBBYCAwECHgEC +F4AACgkQYiOYC0EN5n5VUA//QU8Gs+AywtWV5T0iin9uN5crs/MvEDkDSBOEcc9A +g9tevjw13Ix8eyiG7dfgbzNOhhAiFiYxzyi7o6guPl1K1o1Mcfs9r/f8e3w56XDP +7C9zqE+eid0cEDOvYzclMaNxwKnRntMKsmF2SG9p4OQxSRxO2RzXNFP7yVCotkDS +TYS3ib9mdCrZCTcwEmYEWoJemS2AzV3+8XUR14m0WS66Pry3FICl6y7p8RslStd8 +BQJW7PQJTdAkZTcs5zPyjWq9+FBM/NGNExAqYwNzscKj+p1SHgaLT/xKSbetIKgN +6zRbfiFA+nZazr3OQNeecYImbUQUkPfWj2qwkFgyOG9pmM8UQ0CksvDHlKeHlMh2 +KPVkSrkavADrz1/PpqF10a5Kcip8ye0puv///rZfbysW3h2WGb6ZuiY5+uevpuox +ngCDojHfx3QeOUn5YTRs2M9LSQ6wfG0BtQjEP+cEo9r0+pY50spq1FouyKCFe2hT +gHZZ3QBP3m5Cywyz0Oj+C1XRuzI/lxTNPTFz8VjBQkotQZpSSPjyN3dvksbHdxpa +rR0KWVLZdvTFKBJpDr5weC5MUyC7mGvE/NNcxGIbGFfbDVhNyE9g6mUgmWIyV142 +SQzOARf/V1CK3La2ECFMOe5j3hMlrvRgCihqYFqrtKKOhVL71ipWLqfvGMXVn+uv +/G3OwU0EZco7/gEQAL7DbxhUWcosRQ0e9D03PJx38ybkRlP3dqxYZzB+MBANrHX6 +1V4tcDOXqTuqt3DPA0VDybQBuG0G8qoim8Ibs0Hu4kjI3sNl83v45jv4kGNRPoeo +26kOucgi21Mki5pe12vuAl4PfT5WfPvW5ADwPNKnMuEH6SaeDYcojMB2h6wIBZKi +GkCGPNZsN1+bv7QUJfEAJvbWaqMAm0/AEacqEq3UBbq/JX+i30Yi0bBHzn1rKaeH 
+5rzjSqmlv4NQcdlOaUkc79B0U/FhuF9gQdxtS6FRgfvljP9RgWQan0qW+4mL0MQz ++kwIlbYVb00bNxXEs7xrkAvbA3+y1Kxn35vWTdD8N8uRYLdCzEJ0KtMSKWHzgOiM +7Q1G0eTfyZ1LYg2MlEMwffkSbxnTgKy2QQ9MUH9FrKcICoGK+2x0t9h5lAZ/6Mu/ +BLORIn/wvkL4alNJ0xCzWgX8VaAzzbscaOBMlLjTzEpYpzpAlZpNBwU7SfQ5Go+n +lVkeWlgD4nrB9nSQxK8uxsOigEz7cDLTS3zHXLwvFmIdl6AvbUsohhorN5JgKVjm +2O/TsPtB2EVZF1m0Ep6uRuuCNmUk2+062KIrWiezOFzds/CUpi1yi+zj4SmbGwVK +xCoVqsinKXhi4QzGWyWdgE77qCFLeJWeohxGO16WJWE3t2+I/Uz+Pdsu2DQ3ABEB +AAHCwXwEGAEIACYWIQTLM5zmmUvHHyQwnlliI5gLQQ3mfgUCZco7/gIbDAUJB4Yf +XAAKCRBiI5gLQQ3mflnMEADIKeO5EjDPjAN7Gp4TFtoRz3NMe/37HrVIVdS+llBi +n5x2s7wQ3EIBTEuANFpXDDuRP+EwerIlBShI6lr/52X7DCtJy7tb1KcwWXsmuYG/ +wncSUjNSmI+E0SFd4WJWyBJ6nqsACgvHw8aOC4mYa82LBbr7fd5a9zXLvELwwJQW +pvsevCXA3P+Jo8Rb7Xx5mNW55Y1dzXAnGm5GhVvsRHyv2vyDrI4sC3a2xxGvypqq +D2J6ysZkzk8+sRLr0Ur1JFBFb1zl3TLB5QbTev+GRg/0tCM6IrS9CH0UHgA+AXSg +XBKt9IkIKhv23EPTh4tAq3ENAM56Wenyi8bs0NsM8aT7c0WlHCpj6az6SNpi3X5t +5r1hgGOFH5R9Sw/ICNATSRo7ME59v0mDMn2ph7W9qHA/S8OrsVW44Xf3MiqeiYtf +5BESTOyr48xKOCAbEABt40IJxsZkhREtf+6wywQ5gRQGK6LphsPNS0qsHG9nFaFE +kvl2L7ER+qJMjzrvkYH98FJVeClzmZun+6pgbpk5sBSHqzi3WTd/Pb37oS/SRSNJ +thstK7gKyYdzHEHFVt/OAWkiPlysZZcfRawivSpoIfHcA04ZvT87tDstXPWfG252 +HieKA5hpCmpUd2NuSNhcYXjpMNOUEVUq27wcdXc6JWFRJ8oTEfGxcKrT3s0WcPmJ +9g== +=tP9E +-----END PGP PUBLIC KEY BLOCK----- +``` + +For more information visit diff --git a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py index 1770d5a640093f4d66f7802517dfdf9c2c2b3aaa..34a1546259bf1fd76618451a35c67fcc0279dd94 100755 --- a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import 
json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp index 761df3c09621227f3e4f42722f1668efc4dd3330..843ed4f00c386eb6b0a23e04d6ed5f5225cf1dbf 100644 --- a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py index 51d5ad798490dda222bc2acd764ea3016e757753..75ec94d7f3bbff91624aef07c3da2bb82394c0fb 100755 --- a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp index d2cf4519b1ac25ed01f09ffd0b34dea212175ae7..d281038e23e5422c349e3bab7b4aca45db2e1904 100644 --- a/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: 
BSD-3-Clause */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py index 2cb80568cc31ca4621b8a4f54e660b4f15453a56..456abae0a453398938f48731950be3625d1ace07 100755 --- a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp index 25f0857698847b6532061525d03b6ad34667ac46..fa76b845726321428c167db47070fcbd5bfa3867 100644 --- a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py index 426c4940caa413317e69e5cb8ce4bd7eafa4ee0e..49ed2300c2e87cac37bf0247b8447a9c0b445f13 100755 --- a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# 
SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp index f6bb3a65493ac43921ffc7864c391ebe359e04da..34eec99e4d27ecae6c4ce64478efccf09757967b 100644 --- a/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py index 369ee50ac1d9f8ad9a2a641c51eb923b9502e9d3..f365ee29b531c9caaa23f92cd243d4361ab62b62 100755 --- a/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py +++ b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp index dda075a43412d1623168a7fe46e3cd7dcbaca6ad..38781e2ef022d0d1517b96790fc327a8e3930414 100644 --- a/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py index af79e2eab00a6cb7ae085fd5188f12ab8758dd97..afb6dc4a0e725f623cb4d7061afd724ccd6bb461 100755 --- a/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py +++ b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp index 0082492d7321d6852a98043bc7ab0c06ea077eb3..227c8a4682ddc535a255249a7171a92d172e1173 100644 --- a/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp +++ b/bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "matrix_utils.hpp" diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py index 5fdd983a12fb6ef178cf34a663142a8371c526cb..80f9f416a4e3601f48cf55bb6b84d1b6597d7457 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm 
Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp index 96216c64d5974f469062e7cc489eeb0b1614367d..5ad4e7bb8a08a206ee59adfdee23532925ff6bea 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py index 51e0638c6017ea44bc1954690963c2c7d29d4cee..e5df330a6ec9cd9a62f7e54d2b0787ebe1b4571d 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp index 6c83b1aca123483c36f931cb60b0c2e1641cbdde..58d3c0b5a668883c577730cd651a1f9e98dc5d60 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 
2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py index 4d8f7fbc081addb2370bf9b2038a6475b817f057..16b8ef0a0468f01f94f4e7e73988b74dc3d7ac4a 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp index 2fd1c770b758874b10cc71b83c391b7840864490..49247e148a6c480a4805937f10a519319248dab7 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py index 6c0e4a84ff1ef519627f04d9b7adc234dd891954..8bbbf62604765d7decfa430103474efdbec20fbb 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates 
+# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp index 24ab935cf63f79c428a1cc49ad13ce0667aac432..5214585472350c391eff525a0d94076e74a665b9 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py index 1a73cc6a5fc870e58ec9157d3e80320c66a90872..fe9ffd14f45223c1ef914619822b6befd3be0d7a 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp index 20522897e17c983e11d3f761863b88d5b8d38b6e..c8cafd4a6c99449a6a06c2218e8eeaec1c7eeeba 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" 
diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py index ae47493aa3905932b7db05280d38fd3a6ac1e581..c65279b7659da4cc2c6744b7d754b2d00e6da23d 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import itertools import json diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp index f29d40bd5645aff4b01cd4f6fc0d10d2afb12821..f5dd708d67d2e951b3e4e53d8931a2e6f27281ba 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -46,10 +48,13 @@ void run_matvecmul_batch_i16_32b_perf(uint32_t num_mats, uint32_t vecs_per_mat, int main(int argc, char **argv) { if (argc != 6) { // num_mats - The number of input matrices - // vecs_per_mat - The number of input and output vectors for each input matrix - // m - The number of rows in each matrix and the length of each output vector - // n - The number of columns in each matrixa and the length of each input vector - // nreps - The number of times to repeat the function + // vecs_per_mat - The number of input and output vectors for each + // input matrix + 
// m - The number of rows in each matrix and the length of + // each output vector + // n - The number of columns in each matrix and the length of + // each input vector + // nreps - The number of times to repeat the function fprintf(stderr, "usage: %s num_mats vecs_per_mat m n nreps\n", argv[0]); exit(EXIT_FAILURE); } diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py index 726a7682447fd3abf0fd5c59f9b2ed69d7eb3ecf..3a8566ab30c56107da88778c541cbbf58ba1b5b9 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import itertools import json diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp index 19d7cedcfa76bb73303cc48685b7e8a703bed481..2a931a1d953bf3da23f60c58a51314af25e773ea 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -59,9 +61,12 @@ void run_matvecmul_batch_i16_32b_pa_perf(uint32_t num_mats, int main(int argc, char **argv) { if (argc != 6) { // num_mats - The number of input matrices - // vecs_per_mat - The number of input and output vectors for each input matrix - // m - The number of rows in 
each matrix and the length of each output vector - // n - The number of columns in each matrixa and the length of each input vector + // vecs_per_mat - The number of input and output vectors for each + // input matrix + // m - The number of rows in each matrix and the length of + // each output vector + // n - The number of columns in each matrix and the length of + // each input vector // nreps - The number of times to repeat the function fprintf(stderr, "usage: %s num_mats vecs_per_mat m n nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py index d42117e8d35f6606b6d3ce12cfea8914e8323acb..c7833fff5efad09f18bb47a99993e473f0065092 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import itertools import json diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp index 0c987fc41f6cf9c28f0b2e08ea06dd22fcf5a6f5..49d7fc25b6e961ef187a9946780958b7b1c1bdf4 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -46,9 +48,12 @@ void 
run_matvecmul_batch_i16_64b_perf(uint32_t num_mats, uint32_t vecs_per_mat, int main(int argc, char **argv) { if (argc != 6) { // num_mats - The number of input matrices - // vecs_per_mat - The number of input and output vectors for each input matrix - // m - The number of rows in each matrix and the length of each output vector - // n - The number of columns in each matrixa and the length of each input vector + // vecs_per_mat - The number of input and output vectors for each + // input matrix + // m - The number of rows in each matrix and the length of + // each output vector + // n - The number of columns in each matrix and the length of + // each input vector // nreps - The number of times to repeat the function fprintf(stderr, "usage: %s num_mats vecs_per_mat m n nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py index d1dee616ea9fbc9575bc6d89aba5722c5efdfc70..11b8605504918f930241081ee979679c7fe9bb0c 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import itertools import json diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp index 1df10870111a00429c0b9ef07b11182b692b2e99..e9831e8989a795db45f605a8d8b7bc085d52d437 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - 
SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -59,9 +61,12 @@ void run_matvecmul_batch_i16_64b_pa_perf(uint32_t num_mats, int main(int argc, char **argv) { if (argc != 6) { // num_mats - The number of input matrices - // vecs_per_mat - The number of input and output vectors for each input matrix - // m - The number of rows in each matrix and the length of each output vector - // n - The number of columns in each matrixa and the length of each input vector + // vecs_per_mat - The number of input and output vectors for each + // input matrix + // m - The number of rows in each matrix and the length of + // each output vector + // n - The number of columns in each matrix and the length of + // each input vector // nreps - The number of times to repeat the function fprintf(stderr, "usage: %s num_mats vecs_per_mat m n nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py index ca14a8682a2f0b06616001c806ee3742da8af545..20dd8685dd98ea2a06a123d919793d6bab1edf58 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import itertools import json diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp index 3ea7a1cb4ab402e73fd5815c71b9a5c829f9940f..e8d7e41fcbd2d2448badf594abd0a4cd13b0541c 
100644 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -45,9 +47,12 @@ void run_matvecmul_batch_f32_perf(uint32_t num_mats, uint32_t vecs_per_mat, int main(int argc, char **argv) { if (argc != 6) { // num_mats - The number of input matrices - // vecs_per_mat - The number of input and output vectors for each input matrix - // m - The number of rows in each matrix and the length of each output vector - // n - The number of columns in each matrix and the length of each input vector + // vecs_per_mat - The number of input and output vectors for each + // input matrix + // m - The number of rows in each matrix and the length of + // each output vector + // n - The number of columns in each matrix and the length of + // each input vector // nreps - The number of times to repeat the function fprintf(stderr, "usage: %s num_mats vecs_per_mat m n nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py index f10ae624629acb527344f1dc1bd2fbacfcc7031e..06354b44c5cb1ad3694fac6888e5fe88607e2c66 100755 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import itertools import json diff --git 
a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp index dee1b72b8703bf0e402975fc6e66c1177bd8c31c..2572dae323cd296c038cbeddfea6890a52eb25ea 100644 --- a/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -59,9 +61,12 @@ void run_general_matvecmul_batch_f32_pa_perf(uint32_t num_mats, int main(int argc, char **argv) { if (argc != 6) { // num_mats - The number of input matrices - // vecs_per_mat - The number of input and output vectors for each input matrix - // m - The number of rows in each matrix and the length of each output vector - // n - The number of columns in each matrix and the length of each input vector + // vecs_per_mat - The number of input and output vectors for each + // input matrix + // m - The number of rows in each matrix and the length of + // each output vector + // n - The number of columns in each matrix and the length of + // each input vector // nreps - The number of times to repeat the function fprintf(stderr, "usage: %s num_mats vecs_per_mat m n nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py index 243427188696b3cd0658104eba56d4dbd9ef732b..f9eae0884358d368c94440fc14e53226f97fcc7c 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its 
affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp index 48edaf721de81a74f61b851a2f45def5ede1d5dd..1687c6533c4f3334924b3b0d38aa61db11b85646 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py index 3250872080c705265a7dee04c9477947dc277196..12bbe63bcc87e8211b3f91f415da3a31e3a2d8a2 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp index 57faa8ea75b159cbdc2b7ec7c2b521f2cd5b7ff4..3d6d746681a3b1539c224d7e10ffee99327ae4d6 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or 
its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py index 1ee075561c90cac508128595f631b06611ae17b8..d8e61392bb27a3ee21a2e4e0c646814e3b14615d 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp index fab257f79e5e27759f9fc3efe439ce63d6a5268b..6a69393ab7e55984d85ef8c28d1aadfd15a43b0f 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py index 3cb7d5d19a1f3fe20e3705e8a4c0e9635106e109..eb7e6bff5e23cbb91b64b8f86477ea7755873868 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# 
SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp index 060fe7e460b9ad34ade6ab607f43fbaaa0eb8952..d33dcfce3341f902ce62b6fa6b843401a709bf80 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py index e79186feef95840bd3a3b08c18a5266bd60cfc8e..3366423a5c964295918132619e166828e952996d 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp index f0af1d848d72dee5a0ace816161650e3d41aeafd..4f2666db656289d58eb63faaf035ee5ba5e8fce5 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp 
@@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py index 9c51504424a879e488a88a5be685fa505f1eb299..0327fc4a2745cb7afb0c81d094892ee196069717 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp index a73a0742ad0ba161f53e02f07f4cb42274bf16e6..bf8e2d03ca2c3d55e832393a20ecd3d26b561520 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py index 87b90583ca5461c68ad01433d9baf5cf1debc81d..ed45660410056a9faa933ded0e823ebb744fe410 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py +++ 
b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp index 695fbc20245994125d7718ea42d2b0406afac5cc..bab4a86bfb1776780ed7b998cff93f43cd3d5f30 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py index 1dca6b0446f4c44d0f08839166a94ab87f092ea1..221112d7928009e8184f5e684065f5964e79ebb8 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp index 55e290a56f890dbda33a453cf99c6cf81e98c1c7..d0e98598b4b7f69967caddef3d80d73dddd454bb 100644 --- 
a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py index c7f14afac9173da4b63e4f3a6fc5860e44198aa0..c1a81e720d9d3f9d656a106cc62f315945458a32 100755 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp index 041939c02699dd520eee444ee2d71293c824be2c..747c238afbfc8be0b8fd9e4305244cfe3e77186a 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py index d2d7e5834af879e2b4f50b7e864a2cafb4ca7c17..7485779503fb496c47da9711d713aee6affe6b18 100755 --- 
a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp index 3d21c2371c63d23055b7c1ef0ac607fe3ccf1dd2..f0efedd56a870b1ee67b74039f27204a898b3978 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -33,8 +35,10 @@ void run_mat_vec_mult_i16_32b_perf(const uint32_t m, const uint32_t n, int main(int argc, char **argv) { if (argc != 4) { - // m - The number of rows in the matrix, and the length of the output vector - // n - The number of columns in the matrix, and the length of the input vector + // m - The number of rows in the matrix, and the length of the output + // vector + // n - The number of columns in the matrix, and the length of + // the input vector // num_reps - The number of times to repeat the function fprintf(stderr, "usage: %s m n nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py index be7d58d22c71acdba031145bd387284b55a63367..29349da2777d493b5b233aae60b9d2f16eff92b1 100755 --- 
a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp index 5481fac80d367a7365ff771482ba20c3de4c70e6..c5ac5504a827e5a9f144d01ed16b67404a1ea30c 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -33,8 +35,10 @@ void run_mat_vec_mult_i16_64b_perf(const uint32_t m, const uint32_t n, int main(int argc, char **argv) { if (argc != 4) { - // m - The number of rows in the matrix, and the length of the output vector - // n - The number of columns in the matrix, and the length of the input vector + // m - The number of rows in the matrix, and the length of the output + // vector + // n - The number of columns in the matrix, and the length of + // the input vector // num_reps - The number of times to repeat the function fprintf(stderr, "usage: %s m n nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py index 3a6ee40797a4d5e4607e91a48294cda602c3f927..79348fd3234ea7dedef368c34738717d16e595ff 100755 --- 
a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp index 07a22a33449230fd0ee2e95df390605e8719bee2..a3752b16ca25deeacc45023418b364dd5363eb26 100644 --- a/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp +++ b/bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py b/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py index 56c99b43f65664a2cd6b2d3e750a80955e872c13..33b498e87deb02e62acf93a454bb34ec2a8184f4 100755 --- a/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py +++ b/bench/BasicMathFun/MatrixPseudoInv/Direct/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp b/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp index 1e559691ec5909f78a836e0cc9679734b2482c08..69653e0ac7e86761ba939d3d6632ebedd9a6cbd5 100644 --- 
a/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp +++ b/bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py index a333391a9ba8377502f36c9f2b3c5ee2e9a1c780..a88bdec2a8e92cb174d92e96bafe0a44b788358a 100755 --- a/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp index d1542064dd78fc9681437dcd872194b5c5491023..92f68adc1535786f18e192160278865492873f80 100644 --- a/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py index e0c3df7b225aae0bc723ba74dfb0f5b12441274b..3ad4ce56819c3069dfbd9fb217dff0d007536c0b 100755 --- a/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # 
Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp index b1e24eab16f3a61caba36d5ba8a7795a97f678cc..2fa64fdb7e862dbf0a9275420113bf2eb09d9635 100644 --- a/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py index 2be5f3189b145c6cef29a6bc414f271b5ac12c4c..ea725a09006a95331945d17b612dff860d776d35 100755 --- a/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp index bc51b6aaeb27a72b427534593cc0ac1819ec12bc..eee3f702a94f96298e6155968657375230abf826 100644 --- a/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 
2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py index 70c0455fe5b2e6bc39fb80b172fd1b97a48ab66a..69958cfc58a3bcfc8f68e2d7418cbc7a144861f1 100755 --- a/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp index 618feebd476ae82c9d36bd4e78899ae6c098db20..f3492646044fe5d0ea68b6062a0a2af27203554b 100644 --- a/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py index 37a8b266183fd01b80d5abc98f91c2bc5349bcc6..2a02eedda18617d22acce91eceb9f0360cec3ae2 100755 --- a/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot32/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 
2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp index c2aee11db3312bad9b52cb07563d1558ad755d02..29d38346e234af63889b29a62d90bb44b67cc7e0 100644 --- a/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py b/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py index 0a4b022dc0ada7794a216fec98014a6df4e06981..c38e145be891928af1f791298ca26d6b63081701 100755 --- a/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py +++ b/bench/BasicMathFun/VectorDotProd/VecDot32_2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp b/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp index a379d9e6a9a79f66d64d7e337466f75572fc99e8..ce24624d49c52667798e6f7ae2d01b14a37f0bf5 100644 --- a/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp +++ b/bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git 
a/bench/BasicMathFun/VectorMult/VecMul16/bench.py b/bench/BasicMathFun/VectorMult/VecMul16/bench.py index e6f953ef5759d03c4d99bfcd8a7f3bd1a1da227e..d53072bf8d20e704b1764d65ffcaec98d7b34ce6 100755 --- a/bench/BasicMathFun/VectorMult/VecMul16/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul16/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorMult/VecMul16/main.cpp b/bench/BasicMathFun/VectorMult/VecMul16/main.cpp index e6999be9c159e1770ae1eb53c72e280d13b96f11..9fad9dbc63ff48c8f7ba1dae3b05b03218473954 100644 --- a/bench/BasicMathFun/VectorMult/VecMul16/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py b/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py index 99f3d83e006b56b23e922c6c02a0270f81a89d3c..8c2cc81263e0d102f986f449f0bfea6711650d4f 100755 --- a/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul16_2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp b/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp index 
876dd37ec5eb5ede8e5b02ef8f6d619e8220f103..d73d91396f8cf2e6f465e443763151f824d56bb9 100644 --- a/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorMult/VecMul32/bench.py b/bench/BasicMathFun/VectorMult/VecMul32/bench.py index ac6de3316a0a2ad89d0eb8322ae2ab5f8e302191..9ba3c66f1a24e86fc816c237f6fae3ca16403bab 100755 --- a/bench/BasicMathFun/VectorMult/VecMul32/bench.py +++ b/bench/BasicMathFun/VectorMult/VecMul32/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorMult/VecMul32/main.cpp b/bench/BasicMathFun/VectorMult/VecMul32/main.cpp index 07fec66c4555f0163e1f153bb3861dde595e0302..9de1e9ecff28927b7a244d06be0d2cbdd2797df9 100644 --- a/bench/BasicMathFun/VectorMult/VecMul32/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py b/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py index c7936ea9dd19b58ef331bb2dd9e5fba18c92f8a6..c4619f477792b5f5cfacfbde4c5e935ddf41b4b3 100755 --- a/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py +++ 
b/bench/BasicMathFun/VectorMult/VecMul32_2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp b/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp index b42d6ad7f61d2f1c371b7594280a66aa33b428b7..30d77d29df158cc438b989aec1a5d94f3a19e77e 100644 --- a/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp +++ b/bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py index 95720ea6bd98ffdee2e111e982256ef5226b9642..1c560a1f98b3c0adeb41169b21d254f27afdce84 100755 --- a/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Compression/14bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp index 615c0a5a6ea5c5214ee2c5c6af0416649fee7f85..d411eed3fd0d7554532c91815f502c682a0f1f84 100644 --- a/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - 
SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py index f55e33ba31e2f59bba6efc00a3e8615db6c6f3e3..7dc3c0e588df3a287c7e188aa25e0bcafea028bc 100755 --- a/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Compression/8bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp index ffdc044df25e9adbb1b9f3c2fb932c3d4dbefedb..be49c01282275f6b4d776d294e19ddf94b981c1a 100644 --- a/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py b/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py index 82fc07f9fbe0735e610c72d563c1e3bcab1e80c7..d35020d7a9928da3a2fcd1575cc4bb22653d583b 100755 --- a/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Compression/9bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 
Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp b/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp index a1ef41031d32c0fe1c09c181279c54a174eaf05d..1cb3f10cc2af2e2cd97e1bb95b3b664358882544 100644 --- a/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py index 48cb1fd53707853de750f205c76fdb7274bfdb6a..f6d6c9a038921e802c67788f51d28581dcc1ab10 100755 --- a/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/14bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp b/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp index e8b9dfd6e28d9279f097efb1f4575236328bf976..f4844cb6cb2672025ec3adf3a641aa648557320d 100644 --- a/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff 
--git a/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py index 0444d8a22f3def7e8e6de0fb8deebb4ceee757a1..caa5a39287f5cdc2d646c9bbd09a7e9469541b53 100755 --- a/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/8bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp b/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp index 61104eb7a56adef9261c828dcc07933e9be629f7..0dc1074b71d66ea6660bba04a643c7014f6fc46c 100644 --- a/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py b/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py index 2bfe560ad7eb47cfb80c8dec8610cff17285df79..076eb36ed44becf485b432290b624a7f0ecf1d90 100755 --- a/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/MuLaw/Decompression/9bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp 
b/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp index ae484c97b154205f334866a566078e1da18ef63c..4d7cbcce5b888fe229ceaa1f8734522a1d74e713 100644 --- a/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py index 54bce883feb879c24870910eb14e17c5a8b44fe0..9a2a5ac8662632fcd986be5724e254f0cebc9611 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp index e094bec6234bff9857ace7b878c5fbf928f87e0a..015dc8421b07f51fb3366becdd212834069aec20 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py 
b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py index 3b30a95820d97907c93ce72492a251a2ddcda38f..166c9be159ae510f6266a82d5770ede4d2cc87bd 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp index ecdcada46f6704e1f71b05509aa254025d5c92f5..d45f15cf7dd2036e28559c8bb2578de8a746c4f0 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py index baac1526473ae86fe3e1406528b67c669ef9e51e..aabb4e791a7f5b40cd09f27449a2132f4468e4f9 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git 
a/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp index bd8f377f744677b8945312aeba84451c9a680b98..381316524d4900be6f853865e0418899a1ceee9c 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py index 2dfa15dfdde518c88c41eabed217fd14f39dfc0f..3119afe1d58afdb3526abde68c4dbc574408643f 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp index 3734cbb706acc903d7fa8318c41f35fcaa301332..129ef5784fbc99294c674f4f9b80d31672ffa81a 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git 
a/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py index 1a0883aee5c4e0bfd2547c02b6f5130c2d988f24..056af789a45e44f0f2388bf11d7bc1aec4d2cf2c 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp index ef139818531d120f221f5b63997dece4c415d669..474307c6ecfb382cfa48794f5dd0dc19340df9dd 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py index 6b38d699bc883d7a80c171d46e344c1cfa8250d3..1499ab14b1b5f68105ba7c65653414fda8a8768e 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json 
from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp index de72229341e803a87bbee0254382de269008db74..e97c02a1f7ffbb835ca6d3a815ed9c37c045ff79 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py index 8f6f7e8e693238833306a79fd63312fe35516e3b..9d94bc1f3b57646c10d7b78331cd877101e1c705 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp index 954cbf2a459485f35c3d9344485498262685689b..a7ca4594ac0de55f3e135ba764b2e2a8134b1ee5 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ 
#include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py index c19dff8f0da95652e79dca29920f34cd3bffda7e..cc51bd5cc64a43df67a6fbf66550e4d6a227e1ef 100755 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp index 5b5722a436fcd29d85fa23d1714ddf5f451d693c..41d7be0de9b3963c4709b55e3f484081b6e5e1a0 100644 --- a/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py index efc7012590cce747d2ede6e7a8170c82b179eac2..46080fe76becef6d96bbe731dd22ff0047346644 100755 --- a/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: 
BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp index 239c2e7d7dba7dc3cae8e041380bea93cb0074d4..239177891743b15f6330a8b3ac432da999238801 100644 --- a/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py index 78bc30807ec658af3053a25f27809c56df671faf..900c41b894ffd4bda297df63fe04e4b8fbaa73f9 100755 --- a/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp index 3b90304cc6dcde8d16a3a56fd4fe44a7481853da..ea087551dff755b310a85c1a60f5c3313c0e6fd3 100644 --- a/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + 
SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py index bfcc06884c59d9a4a7632654daabef28323ce863..d417f6787c8f5baf1dee8e2913be185f1bbf9b9c 100755 --- a/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp index 14b02de45fedd2a759d47b2e1da72f351c5e2616..0d458204beb8f7902ef6ef5efb609aca579bd551 100644 --- a/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py index ac7429b7ec8c735eaea5c3e7bf72fd124ec8bdeb..01f7a8cd20bf081e00a4b186c8d7debb51d306aa 100755 --- a/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited 
and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp index 9193570fbfab5e303bfc88c189fd8011e5d2f9de..338a996738e86157a3854cb589bf139ed182bed7 100644 --- a/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py index 7d12222fa227506f03e35ac45759ddf2d9e0ba83..25675e6b4922b337b23905b29b721f0fd2bc5e04 100755 --- a/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp index f47cb545d26b652a25abefa1cc2a5cdc2189e10c..cf07171b4a13ed9bd6bd423303a0977e82b11b4a 100644 --- a/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: 
Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py index 4a2ab15f4e4631c73c4f59dee4fd92efee8f71d1..eeca70cb024f52f76bcfa2d42cebce86ab4b333e 100755 --- a/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp index 4d3c7d1084eb9e86d59fe1eac74edb9db5f8e35b..519f154c561fbef7b07b22ba6e813546088de9af 100644 --- a/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp +++ b/bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/LowerPHY/Correlation/bench.py b/bench/LowerPHY/Correlation/bench.py index e2a9be154217c1ac2a99ab915e295567b6b36a54..b7df41cf189e260feb51aec88f7147d869634093 100755 --- a/bench/LowerPHY/Correlation/bench.py +++ b/bench/LowerPHY/Correlation/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# 
SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/Correlation/main.cpp b/bench/LowerPHY/Correlation/main.cpp index 7315a83c3d2c17ea4804edc08a1287193f2f87c7..74545d2ddbb0886001ea0bead85a36e35508c9d4 100644 --- a/bench/LowerPHY/Correlation/main.cpp +++ b/bench/LowerPHY/Correlation/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/LowerPHY/FFT/FFT16/bench.py b/bench/LowerPHY/FFT/FFT16/bench.py index 6e2190016f972abfc51456c58b0a2498a47f95e4..084295bc0f03d0fac0978d0d681385ddeda21019 100755 --- a/bench/LowerPHY/FFT/FFT16/bench.py +++ b/bench/LowerPHY/FFT/FFT16/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/FFT/FFT16/main.cpp b/bench/LowerPHY/FFT/FFT16/main.cpp index 1bf340ee072c27152aab99b61707a9fab320f48b..678ed0ef84a9254fd14bbb832f20ad451d2f9e67 100644 --- a/bench/LowerPHY/FFT/FFT16/main.cpp +++ b/bench/LowerPHY/FFT/FFT16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -34,7 +36,7 @@ void run_fft_i16_perf(uint32_t n, armral_fft_direction_t dir, armral_fft_destroy_plan_cs16(&p); } -} //anonymous namespace +} // anonymous namespace int main(int argc, char **argv) { if (argc != 4) { diff --git a/bench/LowerPHY/FFT/FFT32/bench.py 
b/bench/LowerPHY/FFT/FFT32/bench.py index e84f4fc8647f19b2c6543e15d5da2dbc7f21bb0b..e112ea73e0c4e8bd182d2b5633635cb92b869a01 100755 --- a/bench/LowerPHY/FFT/FFT32/bench.py +++ b/bench/LowerPHY/FFT/FFT32/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/FFT/FFT32/main.cpp b/bench/LowerPHY/FFT/FFT32/main.cpp index 1d469fd6e4176ac0213e836d806f8bfbe894d9de..7d31c1905d51542256d09c7bdd7bd4238ccbfa17 100644 --- a/bench/LowerPHY/FFT/FFT32/main.cpp +++ b/bench/LowerPHY/FFT/FFT32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/LowerPHY/FIR/FIR16/bench.py b/bench/LowerPHY/FIR/FIR16/bench.py index 23a3626af1df40ef3148faffd95ed51d5022b771..c230f8bf8cc53df191ceadd8150063e3f2b57993 100755 --- a/bench/LowerPHY/FIR/FIR16/bench.py +++ b/bench/LowerPHY/FIR/FIR16/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/FIR/FIR16/main.cpp b/bench/LowerPHY/FIR/FIR16/main.cpp index aae2b728c32a7ef2875dbfb459a23f1217e39f24..23b22f1d5165815f8bdecba11fffb0697d1f3ceb 100644 --- a/bench/LowerPHY/FIR/FIR16/main.cpp +++ b/bench/LowerPHY/FIR/FIR16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited 
and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/LowerPHY/FIR/FIR16Decimate2/bench.py b/bench/LowerPHY/FIR/FIR16Decimate2/bench.py index bd47c5dfef9cc4082a9d1c376655ab9ce8fe4dc4..3fe2cd18b1e5da5937c871e318fafab128d9c6e1 100755 --- a/bench/LowerPHY/FIR/FIR16Decimate2/bench.py +++ b/bench/LowerPHY/FIR/FIR16Decimate2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp b/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp index f11ee5f142152e21ad15dd1abf52089912e602fd..d97d5b5e6655d2d69b914728c9d06a47930ac11d 100644 --- a/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp +++ b/bench/LowerPHY/FIR/FIR16Decimate2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -43,7 +45,7 @@ void run_fir_i16_decimate_2_perf(uint32_t num_samples, uint32_t num_taps, } } -} //anonymous namespace +} // anonymous namespace int main(int argc, char **argv) { if (argc != 4) { diff --git a/bench/LowerPHY/FIR/FIR32/bench.py b/bench/LowerPHY/FIR/FIR32/bench.py index bb24247ddee620d92dabae51d2703b5a91ccc7dd..14a2eadc3863c54204858c27d4d1b1222034745c 100755 --- a/bench/LowerPHY/FIR/FIR32/bench.py +++ b/bench/LowerPHY/FIR/FIR32/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 
Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/FIR/FIR32/main.cpp b/bench/LowerPHY/FIR/FIR32/main.cpp index b376ccda23322072d77c69dcd71b0e4cd0b06aa8..baf98f7fe13c1c2ce7f7934699621ef1c3d919e5 100644 --- a/bench/LowerPHY/FIR/FIR32/main.cpp +++ b/bench/LowerPHY/FIR/FIR32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/LowerPHY/FIR/FIR32Decimate2/bench.py b/bench/LowerPHY/FIR/FIR32Decimate2/bench.py index f70853ae2bc88c8dbda8103eabd0f642c7be8936..b757dd4d9c93c833e98ef214c64cf57bcc6679df 100755 --- a/bench/LowerPHY/FIR/FIR32Decimate2/bench.py +++ b/bench/LowerPHY/FIR/FIR32Decimate2/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp b/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp index d8ac010935d48da109128f81034f13c852429308..3c0fe23ba737721d0d5e3f9c58e4faef8c0c81cb 100644 --- a/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp +++ b/bench/LowerPHY/FIR/FIR32Decimate2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/LowerPHY/Scrambling/bench.py b/bench/LowerPHY/Scrambling/bench.py index ad7b7b27cf931a6ec3394b8963a783afb34428a4..4c1a612b7bd42cac2356cd2aff1962e0dbfa08cc 
100755 --- a/bench/LowerPHY/Scrambling/bench.py +++ b/bench/LowerPHY/Scrambling/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/Scrambling/main.cpp b/bench/LowerPHY/Scrambling/main.cpp index 5e1985eb2318c79b6459e681f5c541dc3f66db85..6bbbd690c0ec2d1828f437ce45bdd6e8a57f10dd 100644 --- a/bench/LowerPHY/Scrambling/main.cpp +++ b/bench/LowerPHY/Scrambling/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/LowerPHY/SeqGenerator/bench.py b/bench/LowerPHY/SeqGenerator/bench.py index 64db32d9bc4695ba571dbfced1bdc98b727d5e10..88fb6b36172f9b1ae7701f88ddf7b89ed5335fd3 100755 --- a/bench/LowerPHY/SeqGenerator/bench.py +++ b/bench/LowerPHY/SeqGenerator/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/LowerPHY/SeqGenerator/main.cpp b/bench/LowerPHY/SeqGenerator/main.cpp index bd83fe2eb61d6c388623c63db24e7e6010f72d51..187bea0534ddfe7e7f1b6e3708b81bb183a77406 100644 --- a/bench/LowerPHY/SeqGenerator/main.cpp +++ b/bench/LowerPHY/SeqGenerator/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + 
affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/bench/MatrixFactorizations/SVD/bench.py b/bench/MatrixFactorizations/SVD/bench.py index 4cb05bd9cb80d556cd3cb5cc3b1f6fdb95a9ce4c..abb49addcf147d9ed41e8e9c2a7e7545f4d770cf 100755 --- a/bench/MatrixFactorizations/SVD/bench.py +++ b/bench/MatrixFactorizations/SVD/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/MatrixFactorizations/SVD/main.cpp b/bench/MatrixFactorizations/SVD/main.cpp index 61e54443e8d5c18b48f5e8a0911b80e08a72961c..7c55d93ddf3a28fa8c1a39606a79d917213b22e1 100644 --- a/bench/MatrixFactorizations/SVD/main.cpp +++ b/bench/MatrixFactorizations/SVD/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/11/BigEndian/bench.py b/bench/UpperPHY/CRC/11/BigEndian/bench.py index b2c277793943792b47fe2c936705827ea82e8216..3aae6db480d357036e050d1d931f5c366d16d37a 100755 --- a/bench/UpperPHY/CRC/11/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/11/BigEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/11/BigEndian/main.cpp b/bench/UpperPHY/CRC/11/BigEndian/main.cpp index 
a75f3e83ebeaaad92723b7474a571e9b89694244..e74142297e1c6d6233ac54f29dc92714c1d91b09 100644 --- a/bench/UpperPHY/CRC/11/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/11/BigEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/11/LittleEndian/bench.py b/bench/UpperPHY/CRC/11/LittleEndian/bench.py index bca79a9245bd513fd9db2057f145ae2af13693af..d39953c1158bad05feca936d26fd290bc4cfa1c4 100755 --- a/bench/UpperPHY/CRC/11/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/11/LittleEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/11/LittleEndian/main.cpp b/bench/UpperPHY/CRC/11/LittleEndian/main.cpp index 0e82518690c4fde32ad43e5ffd8244e41c89c3a4..f72a55c835e685c239fd1c0a6fc33d757b73785d 100644 --- a/bench/UpperPHY/CRC/11/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/11/LittleEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/16/BigEndian/bench.py b/bench/UpperPHY/CRC/16/BigEndian/bench.py index 738b08acc62f4a8f71eea5d0717daddb135e6f62..f5d10f2432cd3f15786efccc9885ab257d0c78c0 100755 --- a/bench/UpperPHY/CRC/16/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/16/BigEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library 
-# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/16/BigEndian/main.cpp b/bench/UpperPHY/CRC/16/BigEndian/main.cpp index 9265e41f29f6cb5fa133f66579dd8a7c6c083a12..a16b6a889265f5bd8bdf725261d89bcc967c6125 100644 --- a/bench/UpperPHY/CRC/16/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/16/BigEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/16/LittleEndian/bench.py b/bench/UpperPHY/CRC/16/LittleEndian/bench.py index 5c6cc1ffe8236286d053e19afa18f4cfd949049b..01841dd317eba554cbac678bdf2a93146c933c52 100755 --- a/bench/UpperPHY/CRC/16/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/16/LittleEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/16/LittleEndian/main.cpp b/bench/UpperPHY/CRC/16/LittleEndian/main.cpp index d1cd3439b59cd045194249c83138eeaaaa495eb0..a6f038320afcd29efff1869f2ba894e43d8f7d92 100644 --- a/bench/UpperPHY/CRC/16/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/16/LittleEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git 
a/bench/UpperPHY/CRC/24/A/BigEndian/bench.py b/bench/UpperPHY/CRC/24/A/BigEndian/bench.py index 8052caed10115cc3b7cd6392ede854218a59d41b..10e002ebcd321383949a3852080af0451605a673 100755 --- a/bench/UpperPHY/CRC/24/A/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/A/BigEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp index 33313dd65771e673a56571695183b8a4070d01cf..98c6de20f1e49546e6951f138b35615c35a57530 100644 --- a/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/A/BigEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py index 64c1ccc59df33aec48678eb89c92442e8c595755..9a20181c6ce1db0eee73e49c1a89f99a4fd661e2 100755 --- a/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/A/LittleEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp index 7c0e405ae71a56350c65dc072dcf99bedbcdd6ca..f2e65c45927d4725ea81777947fc1efb5c8298ee 100644 --- 
a/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/24/B/BigEndian/bench.py b/bench/UpperPHY/CRC/24/B/BigEndian/bench.py index 739668557c2ab9c7968f33730cebcbae5335a48e..956fd2f74e4b659b990443157bb244d07099ce2d 100755 --- a/bench/UpperPHY/CRC/24/B/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/B/BigEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp index c557b4707075bccc7621f916791ca44c746b8b6b..625c3db0b322dab0b0fa0be55b5334bfb5ccdeb7 100644 --- a/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/B/BigEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py b/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py index 06bfea6fe711ea1ba2c196daf2c72da2c4ae63e8..72493799794bc238e627680f46671822df788a91 100755 --- a/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/B/LittleEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its 
affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp index b332e1b6d050fdf2ce26cb619346327393c26dd0..aba2d321a58dc329f64b8dd30e5b0422ae5e9b94 100644 --- a/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/24/C/BigEndian/bench.py b/bench/UpperPHY/CRC/24/C/BigEndian/bench.py index 1df67fdc8513edb4da41c85e814b9b21fbea957f..c1c7df8a317489ed563eeb391adab31d1b84a2ad 100755 --- a/bench/UpperPHY/CRC/24/C/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/24/C/BigEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp b/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp index f4d8553de7ef5703ad67b680b7a3a25227fbfc20..d334fd4438cccc73ee0bdc529347df344489648d 100644 --- a/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/C/BigEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py 
b/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py index 70471b5c177def2b78a935576bba7c4be486fa43..a02f6fdedc9221e36ba489bbdc4852b2ab5ac886 100755 --- a/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/24/C/LittleEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp b/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp index f3cfbc59c19d242b41d76a4016de8794eae3c678..86261a65e33f169a6b9842a5d3f62b69da9e154b 100644 --- a/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/6/BigEndian/bench.py b/bench/UpperPHY/CRC/6/BigEndian/bench.py index 1bc3711217effee82161458c2178e6a28f9c98df..4b4d8ab932cc2499bd3f403c46f5eeccc140377b 100755 --- a/bench/UpperPHY/CRC/6/BigEndian/bench.py +++ b/bench/UpperPHY/CRC/6/BigEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/6/BigEndian/main.cpp b/bench/UpperPHY/CRC/6/BigEndian/main.cpp index 3ed97a408fd00d9481746f4d8150a15273785e1d..00c7dfaa48421cb27a35c5d43e8bb1ebf3682b26 100644 --- a/bench/UpperPHY/CRC/6/BigEndian/main.cpp +++ 
b/bench/UpperPHY/CRC/6/BigEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/CRC/6/LittleEndian/bench.py b/bench/UpperPHY/CRC/6/LittleEndian/bench.py index 7cb63784161c491962bc68ffb2b4f4c5987a1269..3793248d11d0c1728951af97fd3197df86b0f519 100755 --- a/bench/UpperPHY/CRC/6/LittleEndian/bench.py +++ b/bench/UpperPHY/CRC/6/LittleEndian/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/CRC/6/LittleEndian/main.cpp b/bench/UpperPHY/CRC/6/LittleEndian/main.cpp index ab6958ec35eecc5905b2ef66490854072907d80e..9f182737262d4e31ddaccc850b69957280dff47d 100644 --- a/bench/UpperPHY/CRC/6/LittleEndian/main.cpp +++ b/bench/UpperPHY/CRC/6/LittleEndian/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/ConvolutionalDecoder/bench.py b/bench/UpperPHY/ConvolutionalDecoder/bench.py index f9c42b3ce3f5472964f02c661e584c8ad0338b4c..1ca403b5d7f82b481458bef309053e952d1daee9 100755 --- a/bench/UpperPHY/ConvolutionalDecoder/bench.py +++ b/bench/UpperPHY/ConvolutionalDecoder/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm 
Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/ConvolutionalDecoder/main.cpp b/bench/UpperPHY/ConvolutionalDecoder/main.cpp index fbcfd53dc05e42c5428906bca3edf3fc9280d0ac..805d5e46fff597608dcb9c6ec0a556ef61ba7d52 100644 --- a/bench/UpperPHY/ConvolutionalDecoder/main.cpp +++ b/bench/UpperPHY/ConvolutionalDecoder/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/ConvolutionalEncoder/bench.py b/bench/UpperPHY/ConvolutionalEncoder/bench.py index 7dc34b60cac304cf2c88c5e2b28bdfcc6ab6f80c..a80fe6f003bf1b765b44bd1b4a2eb26c05d883ed 100755 --- a/bench/UpperPHY/ConvolutionalEncoder/bench.py +++ b/bench/UpperPHY/ConvolutionalEncoder/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/ConvolutionalEncoder/main.cpp b/bench/UpperPHY/ConvolutionalEncoder/main.cpp index 65b9941ac2e9eb406fc1a08052fba494fce5b1c6..dbe965ba0c8e2ea391e9c9f02ec3860b3173aabb 100644 --- a/bench/UpperPHY/ConvolutionalEncoder/main.cpp +++ b/bench/UpperPHY/ConvolutionalEncoder/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/Demodulation/bench.py b/bench/UpperPHY/Demodulation/bench.py index 
1a099a6c075709fa3e46c44adcdcbe5681cb5d2a..d88889680bdbf2ef0eb70c07593a715bcf4981fe 100755 --- a/bench/UpperPHY/Demodulation/bench.py +++ b/bench/UpperPHY/Demodulation/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/UpperPHY/Demodulation/main.cpp b/bench/UpperPHY/Demodulation/main.cpp index e4e06fd417ee06323a1f6124d7703d1d41ec4c3c..f375631812a5688954b74926056fd662e1a8073c 100644 --- a/bench/UpperPHY/Demodulation/main.cpp +++ b/bench/UpperPHY/Demodulation/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/LDPC/Decoding/bench.py b/bench/UpperPHY/LDPC/Decoding/bench.py index 0476cc7df0f5c2b1796f224a72db3db218ca2c76..7653c6d31077c6a83e5f8093d9a17594895565db 100755 --- a/bench/UpperPHY/LDPC/Decoding/bench.py +++ b/bench/UpperPHY/LDPC/Decoding/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/UpperPHY/LDPC/Decoding/main.cpp b/bench/UpperPHY/LDPC/Decoding/main.cpp index f1fa2af7fcd2b1becdd7c1df20742fb2e09d1e82..67c9fe829c26a74cd710149df04f07a59e7a7f6c 100755 --- a/bench/UpperPHY/LDPC/Decoding/main.cpp +++ b/bench/UpperPHY/LDPC/Decoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its 
affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/LDPC/Encoding/bench.py b/bench/UpperPHY/LDPC/Encoding/bench.py index 3a8e7fb8724d0ec55e58aeeaec73f611928767e4..2ac6be71c904d53917f81094af902bd7ccf59444 100755 --- a/bench/UpperPHY/LDPC/Encoding/bench.py +++ b/bench/UpperPHY/LDPC/Encoding/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/UpperPHY/LDPC/Encoding/main.cpp b/bench/UpperPHY/LDPC/Encoding/main.cpp index 864a8c5653fd332c976fecea7b3b67551e49ca3e..f0bc3bbc25e544f431a9f2554d77266fe50e3d7a 100644 --- a/bench/UpperPHY/LDPC/Encoding/main.cpp +++ b/bench/UpperPHY/LDPC/Encoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/bench/UpperPHY/LDPC/RateMatching/bench.py b/bench/UpperPHY/LDPC/RateMatching/bench.py index cc49114dc0af421d1268e828c1a7a67bb774196c..59a451e84b3b299ca20b000c9d62232b1a030590 100755 --- a/bench/UpperPHY/LDPC/RateMatching/bench.py +++ b/bench/UpperPHY/LDPC/RateMatching/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/LDPC/RateMatching/main.cpp 
b/bench/UpperPHY/LDPC/RateMatching/main.cpp index 5aa17c545152f66138f193a6ded0050ec0d498d5..dfe8ed75d3ef9087c768a99987936d280044ad45 100644 --- a/bench/UpperPHY/LDPC/RateMatching/main.cpp +++ b/bench/UpperPHY/LDPC/RateMatching/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "ldpc_coding.hpp" diff --git a/bench/UpperPHY/LDPC/RateRecovery/bench.py b/bench/UpperPHY/LDPC/RateRecovery/bench.py index 8c0004963550d79a64be7ca95dbf0a68033570bd..507886cf48992008c3650e00a3b3175523c349b7 100755 --- a/bench/UpperPHY/LDPC/RateRecovery/bench.py +++ b/bench/UpperPHY/LDPC/RateRecovery/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/LDPC/RateRecovery/main.cpp b/bench/UpperPHY/LDPC/RateRecovery/main.cpp index 469a4bdb73f602c446d54274260df5e4da69bb8b..867df001a477910f622abc3d91f78232a93f7531 100644 --- a/bench/UpperPHY/LDPC/RateRecovery/main.cpp +++ b/bench/UpperPHY/LDPC/RateRecovery/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "ldpc_coding.hpp" diff --git a/bench/UpperPHY/Modulation/bench.py b/bench/UpperPHY/Modulation/bench.py index 9933b7b626a84c552845ff4c809453e88f17bfb1..0f4814fdaa2b9810f3d43b288a5133cdca504576 100755 --- a/bench/UpperPHY/Modulation/bench.py +++ b/bench/UpperPHY/Modulation/bench.py @@ 
-1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/UpperPHY/Modulation/main.cpp b/bench/UpperPHY/Modulation/main.cpp index bb777f019efac247d48efa9ac25e3f0f0d5b6491..e7dd7e13781ed0b45905ebea5fcdb19ddb2cdf41 100644 --- a/bench/UpperPHY/Modulation/main.cpp +++ b/bench/UpperPHY/Modulation/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/Polar/Decoding/bench.py b/bench/UpperPHY/Polar/Decoding/bench.py index 5cddc12e2f4eeca980d717bf1e94b729535c9966..48f1652ec48df4a9b807f8916c7edb369480f88d 100755 --- a/bench/UpperPHY/Polar/Decoding/bench.py +++ b/bench/UpperPHY/Polar/Decoding/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Polar/Decoding/main.cpp b/bench/UpperPHY/Polar/Decoding/main.cpp index 31e89fb160a8564e09b5388fa164e356fef74354..bc424d5fa45336b3635fb25c5c98626c49128fca 100644 --- a/bench/UpperPHY/Polar/Decoding/main.cpp +++ b/bench/UpperPHY/Polar/Decoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include 
"rng.hpp" diff --git a/bench/UpperPHY/Polar/Encoding/bench.py b/bench/UpperPHY/Polar/Encoding/bench.py index d05b5db297c57e7c2cf0f4154f54502b549170b2..c7408dfae2564269413bcebcca58e31e93b930a4 100755 --- a/bench/UpperPHY/Polar/Encoding/bench.py +++ b/bench/UpperPHY/Polar/Encoding/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Polar/Encoding/main.cpp b/bench/UpperPHY/Polar/Encoding/main.cpp index a1cab8be0603d011f1a58d97fa77c4080665da00..2a87d94cdd33c5a854ed6d0012fe0262d99b6a0c 100644 --- a/bench/UpperPHY/Polar/Encoding/main.cpp +++ b/bench/UpperPHY/Polar/Encoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/Polar/Frozen/bench.py b/bench/UpperPHY/Polar/Frozen/bench.py index 50648a21f9586993e8715e5c40acc6f5edf95f7b..e3f03b034d472621b1325a060b53ec63fd314d07 100755 --- a/bench/UpperPHY/Polar/Frozen/bench.py +++ b/bench/UpperPHY/Polar/Frozen/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Polar/Frozen/main.cpp b/bench/UpperPHY/Polar/Frozen/main.cpp index 5ba5e3553f81617b365db33b4d5602bc751dcc42..a80438b6effd2eb6d3b8b9af401b4c84ec8f7f11 100644 --- a/bench/UpperPHY/Polar/Frozen/main.cpp +++ 
b/bench/UpperPHY/Polar/Frozen/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -38,7 +40,8 @@ int main(int argc, char **argv) { // e - The encoded length in bits // k - The number of information bits to output // n_pc - The number of parity bits in the encoded message - // n_pc_wm - The number of row-weight-selected parity bits in the encoded message + // n_pc_wm - The number of row-weight-selected parity bits in the encoded + // message // nreps - The number of times to repeat the function fprintf(stderr, "usage: %s n e k n_pc n_pc_wm nreps\n", argv[0]); exit(EXIT_FAILURE); diff --git a/bench/UpperPHY/Polar/RateMatching/bench.py b/bench/UpperPHY/Polar/RateMatching/bench.py index 92c0535df5c85eb43493b011a09fe62d2f92992d..0b4a0605c806ae2d8293ae3a11915749c81cf8ee 100755 --- a/bench/UpperPHY/Polar/RateMatching/bench.py +++ b/bench/UpperPHY/Polar/RateMatching/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Polar/RateMatching/main.cpp b/bench/UpperPHY/Polar/RateMatching/main.cpp index a5bf08ab34dfd21520363b2c69b7da6f9f6166d9..863d1bd62eea83a9e2ca20a085f0ad4140c24a27 100644 --- a/bench/UpperPHY/Polar/RateMatching/main.cpp +++ b/bench/UpperPHY/Polar/RateMatching/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff 
--git a/bench/UpperPHY/Polar/RateRecovery/bench.py b/bench/UpperPHY/Polar/RateRecovery/bench.py index a2a2c3f8db5583d830b29a3f5e3eee88e4050f04..6c9b2a91a463c1f39de778ec2a3aa0f480496bc0 100755 --- a/bench/UpperPHY/Polar/RateRecovery/bench.py +++ b/bench/UpperPHY/Polar/RateRecovery/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Polar/RateRecovery/main.cpp b/bench/UpperPHY/Polar/RateRecovery/main.cpp index 019b4a58508540ae45a2100076d176005dc57808..d229d09a2f4cb53a886956d5a8fb6f6df0465664 100644 --- a/bench/UpperPHY/Polar/RateRecovery/main.cpp +++ b/bench/UpperPHY/Polar/RateRecovery/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py b/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py index 29fd5bc331ac674c0944adc34987c006a0872aa1..829652e1316b8683eda8784ad93ce69c61b116ca 100755 --- a/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py +++ b/bench/UpperPHY/Polar/SubchannelDeinterleave/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp b/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp index 
e5bb27dc699bf7522984af99fb0d4e3e7fb1f0fb..f2f716e8056bcf203515892d6a8173360c35c8f5 100644 --- a/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp +++ b/bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/Polar/SubchannelInterleave/bench.py b/bench/UpperPHY/Polar/SubchannelInterleave/bench.py index de89975decfd9391e7a0cb2e499f508d097f863a..679303a3403812db2a4149958eed312a3f95f8de 100755 --- a/bench/UpperPHY/Polar/SubchannelInterleave/bench.py +++ b/bench/UpperPHY/Polar/SubchannelInterleave/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp b/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp index 01d3db481fec644568e822a0401537ac6f57eac5..cab46056b7611010f84963e3e6faf57730e45502 100644 --- a/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp +++ b/bench/UpperPHY/Polar/SubchannelInterleave/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/Turbo/Batch/Decoding/bench.py b/bench/UpperPHY/Turbo/Batch/Decoding/bench.py new file mode 100755 index 0000000000000000000000000000000000000000..ffc629fd19b42efc83ea70914b1a407a83f66529 --- /dev/null +++ 
b/bench/UpperPHY/Turbo/Batch/Decoding/bench.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# Arm RAN Acceleration Library +# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause + +import json +import itertools +from pathlib import Path +import os + + +def get_path(x): return x if Path(x).is_file() else os.path.join("armral", x) + + +exe_name = get_path("bench_turbo_decoding_batch") + +j = { + "exe_name": exe_name, + "cases": [] +} + +reps = 30000000 +prbArr = [1, 2] +bitArr = [40, 408, 1088, 3136, 6144] +blocksArr = [4, 8, 12] + +for bit, prb, blocks in itertools.product(bitArr, prbArr, blocksArr): + case = { + "name": "turbo_decoding_batch_{}blocks_{}bits_{}".format(blocks, bit, prb), + "args": "{} {} {}".format(prb, bit, blocks), + "reps": reps // bit + } + j["cases"].append(case) + +print(json.dumps(j)) diff --git a/bench/UpperPHY/Turbo/Batch/Decoding/main.cpp b/bench/UpperPHY/Turbo/Batch/Decoding/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1f143d49ed1365e9e45fe4c47947497c0ad1e986 --- /dev/null +++ b/bench/UpperPHY/Turbo/Batch/Decoding/main.cpp @@ -0,0 +1,97 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "arm_turbo_decoder_batch.hpp" +#include "arm_turbo_decoder_single.hpp" +#include "armral.h" +#include "turbo_code_common.hpp" +#include "utils/allocators.hpp" + +#include +#include +#include + +namespace { + +void run_turbo_decode_batch_perf(const uint32_t num_prbs, + const uint32_t num_bits, + const uint32_t num_blocks, + const uint32_t num_reps) { + printf("[TURBO DECODING BATCH] - number of resources = %u, number of input " + "bits = " + "%u, number of blocks = %u, number of iterations = %u\n", + num_prbs, num_bits, num_blocks, num_reps); + + uint32_t num_bytes = num_bits / 8; + std::vector ans(num_blocks * num_prbs * num_bytes); + + 
std::vector sys(num_blocks * num_prbs * (num_bits + 4)); + std::vector par(num_blocks * num_prbs * (num_bits + 4)); + std::vector itl(num_blocks * num_prbs * (num_bits + 4)); + auto *sys_ptr = sys.data(); + auto *par_ptr = par.data(); + auto *itl_ptr = itl.data(); + auto *ans_ptr = ans.data(); + + // Set the maximum number of decoder iterations to 2. We disable early + // exit checking by setting the template parameter to false. + constexpr auto num_iters = 2; + + [[maybe_unused]] std::vector buffer( + armral_turbo_decode_batch_noalloc_buffer_size(num_bits)); + + for (uint32_t i = 0; i < num_reps; ++i) { + for (uint32_t j = 0; j < num_prbs; ++j) { +#ifdef ARMRAL_BENCH_NOALLOC + buffer_bump_allocator allocator{buffer.data()}; + armral::turbo::decode( + sys_ptr + j * num_blocks * (num_bits + 4), + par_ptr + j * num_blocks * (num_bits + 4), + itl_ptr + j * num_blocks * (num_bits + 4), num_bits, + ans_ptr + j * num_blocks * num_bytes, 2.F, num_iters, num_blocks, + nullptr, allocator, trellis_termination, decode_block_step, + batched_trellis_termination, decode_batch_step); +#else + heap_allocator allocator{}; + armral::turbo::decode( + sys_ptr + j * num_blocks * (num_bits + 4), + par_ptr + j * num_blocks * (num_bits + 4), + itl_ptr + j * num_blocks * (num_bits + 4), num_bits, + ans_ptr + j * num_blocks * num_bytes, 2.F, num_iters, num_blocks, + nullptr, allocator, trellis_termination, decode_block_step, + batched_trellis_termination, decode_batch_step); +#endif + } + } +} + +} // anonymous namespace + +int main(int argc, char **argv) { + if (argc != 5) { + // nprbs - The number of resources + // nbits - The number of bits in the code block + // nblock - The number of blocks to include in the batch + // nreps - The number of times to repeat the function + fprintf(stderr, "usage: %s nprbs nbits nblocks nreps\n", argv[0]); + exit(EXIT_FAILURE); + } + const auto num_prbs = (uint32_t)atoi(argv[1]); + const auto num_bits = (uint32_t)atoi(argv[2]); + const auto num_blocks = 
(uint32_t)atoi(argv[3]); + const auto num_reps = (uint32_t)atoi(argv[4]); + + if (armral::turbo::valid_num_bits(num_bits)) { + run_turbo_decode_batch_perf(num_prbs, num_bits, num_blocks, num_reps); + } else { + printf("ERROR: Unsupported number of bits (%u) specified for turbo " + "decoding batch.\n", + num_bits); + exit(EXIT_FAILURE); + } + + return EXIT_SUCCESS; +} diff --git a/bench/UpperPHY/Turbo/Decoding/bench.py b/bench/UpperPHY/Turbo/Single/Decoding/bench.py similarity index 78% rename from bench/UpperPHY/Turbo/Decoding/bench.py rename to bench/UpperPHY/Turbo/Single/Decoding/bench.py index 11c546a87d579866cd598912bd94a2471a80eb2e..8be0321cb08eeed1d57c79b804845285b6607a2b 100755 --- a/bench/UpperPHY/Turbo/Decoding/bench.py +++ b/bench/UpperPHY/Turbo/Single/Decoding/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/UpperPHY/Turbo/Decoding/main.cpp b/bench/UpperPHY/Turbo/Single/Decoding/main.cpp similarity index 78% rename from bench/UpperPHY/Turbo/Decoding/main.cpp rename to bench/UpperPHY/Turbo/Single/Decoding/main.cpp index bcb262fe807ded9cea40d7876b7068deff7c981c..b919566e605fcab2ae962fe44399ff2d9563e271 100644 --- a/bench/UpperPHY/Turbo/Decoding/main.cpp +++ b/bench/UpperPHY/Turbo/Single/Decoding/main.cpp @@ -1,9 +1,12 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ +#include "arm_turbo_decoder_single.hpp" #include "armral.h" -#include "turbo_code.hpp" +#include "turbo_code_common.hpp" #include "utils/allocators.hpp" #include @@ -34,22 +37,24 @@ void run_turbo_decoding_perf(const uint32_t 
num_prbs, const uint32_t num_bits, constexpr auto num_iters = 2; [[maybe_unused]] std::vector buffer( - armral_turbo_decode_block_noalloc_buffer_size(num_bits, num_iters)); + armral_turbo_decode_block_noalloc_buffer_size(num_bits)); for (uint32_t i = 0; i < num_reps; ++i) { for (uint32_t j = 0; j < num_prbs; ++j) { #ifdef ARMRAL_BENCH_NOALLOC buffer_bump_allocator allocator{buffer.data()}; - armral::turbo::decode_block( + armral::turbo::decode( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, - num_iters, nullptr, allocator); + num_iters, 1, nullptr, allocator, trellis_termination, + decode_block_step, nullptr, nullptr); #else heap_allocator allocator{}; - armral::turbo::decode_block( + armral::turbo::decode( sys_ptr + j * (num_bits + 4), par_ptr + j * (num_bits + 4), itl_ptr + j * (num_bits + 4), num_bits, ans_ptr + j * num_bytes, 2.F, - num_iters, nullptr, allocator); + num_iters, 1, nullptr, allocator, trellis_termination, + decode_block_step, nullptr, nullptr); #endif } } diff --git a/bench/UpperPHY/Turbo/Encoding/bench.py b/bench/UpperPHY/Turbo/Single/Encoding/bench.py similarity index 78% rename from bench/UpperPHY/Turbo/Encoding/bench.py rename to bench/UpperPHY/Turbo/Single/Encoding/bench.py index a50972fe57b95ac08163f2ca58064ad72a5f0632..7cb09c18328d957cdf0f73a18ebe8f0340f700ab 100755 --- a/bench/UpperPHY/Turbo/Encoding/bench.py +++ b/bench/UpperPHY/Turbo/Single/Encoding/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import itertools diff --git a/bench/UpperPHY/Turbo/Encoding/main.cpp b/bench/UpperPHY/Turbo/Single/Encoding/main.cpp similarity index 93% rename from bench/UpperPHY/Turbo/Encoding/main.cpp rename to 
bench/UpperPHY/Turbo/Single/Encoding/main.cpp index a4d39796a695e4e4da277f5b60be1ad3b207ab8b..4d2efe1c72446e81f5489ee014bbfa5b001b613c 100644 --- a/bench/UpperPHY/Turbo/Encoding/main.cpp +++ b/bench/UpperPHY/Turbo/Single/Encoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/UpperPHY/Turbo/RateMatching/bench.py b/bench/UpperPHY/Turbo/Single/RateMatching/bench.py similarity index 81% rename from bench/UpperPHY/Turbo/RateMatching/bench.py rename to bench/UpperPHY/Turbo/Single/RateMatching/bench.py index 9ba9ee120131a1757f32ba417afea6b7b2db95bc..d7d8b9d51ec37961af9f2cecd98a9697ead1017d 100755 --- a/bench/UpperPHY/Turbo/RateMatching/bench.py +++ b/bench/UpperPHY/Turbo/Single/RateMatching/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Turbo/RateMatching/main.cpp b/bench/UpperPHY/Turbo/Single/RateMatching/main.cpp similarity index 92% rename from bench/UpperPHY/Turbo/RateMatching/main.cpp rename to bench/UpperPHY/Turbo/Single/RateMatching/main.cpp index d9535f03cd447646cb3a9dd57b2d69d8a4fc15b8..4b7222fd0b09d9057ba1b78691d91716187030b8 100644 --- a/bench/UpperPHY/Turbo/RateMatching/main.cpp +++ b/bench/UpperPHY/Turbo/Single/RateMatching/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git 
a/bench/UpperPHY/Turbo/RateRecovery/bench.py b/bench/UpperPHY/Turbo/Single/RateRecovery/bench.py similarity index 81% rename from bench/UpperPHY/Turbo/RateRecovery/bench.py rename to bench/UpperPHY/Turbo/Single/RateRecovery/bench.py index 3e74ded5ae35a8b9f703d7a97bc6028f4a8e499e..a0691e43a8008d949766073c5dd589dd04fc4697 100755 --- a/bench/UpperPHY/Turbo/RateRecovery/bench.py +++ b/bench/UpperPHY/Turbo/Single/RateRecovery/bench.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json from pathlib import Path diff --git a/bench/UpperPHY/Turbo/RateRecovery/main.cpp b/bench/UpperPHY/Turbo/Single/RateRecovery/main.cpp similarity index 92% rename from bench/UpperPHY/Turbo/RateRecovery/main.cpp rename to bench/UpperPHY/Turbo/Single/RateRecovery/main.cpp index 68a80c044773544ea683f128e33518fa12f55847..8519d8e2d0533c4e373fc0b597f86a2686fc4ce9 100644 --- a/bench/UpperPHY/Turbo/RateRecovery/main.cpp +++ b/bench/UpperPHY/Turbo/Single/RateRecovery/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/bench/benchmarker.py b/bench/benchmarker.py index 4761a006571b01b7097612935d3f8c8347ed512e..7b4a46c5f0a2ce648449b2378917268c0a057375 100755 --- a/bench/benchmarker.py +++ b/bench/benchmarker.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause # This program is for benchmarking the performance 
of armral functions. diff --git a/bench/benchmarker_utils.py b/bench/benchmarker_utils.py index 797890e0c179853b063c4d032c3aacaa9d2004db..f2a6c761204248fb06404f5c48e44d8c60874619 100755 --- a/bench/benchmarker_utils.py +++ b/bench/benchmarker_utils.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import collections import os import subprocess diff --git a/bench/default_runner.py b/bench/default_runner.py index ee02254ce0d9a2d660e4aae892cdc8630584ad51..d76fbd1f86c6332174c37a95494314c83e064870 100755 --- a/bench/default_runner.py +++ b/bench/default_runner.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import json import argparse diff --git a/cmake_uninstall.cmake.in b/cmake_uninstall.cmake.in index 0bceb6badbcbc47b7a03d1bd26bcedb4e312eb8b..ad706a3d315666be9c36987c9ec03d6e8f518565 100644 --- a/cmake_uninstall.cmake.in +++ b/cmake_uninstall.cmake.in @@ -1,19 +1,23 @@ if(NOT EXISTS "@CMAKE_BINARY_DIR@/install_manifest.txt") - message(FATAL_ERROR "Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt \ - \nThis might be the case if you have not yet run\nmake install") + message( + FATAL_ERROR + "Cannot find install manifest: @CMAKE_BINARY_DIR@/install_manifest.txt \ + \nThis might be the case if you have not yet run\nmake install" + ) endif() file(READ "@CMAKE_BINARY_DIR@/install_manifest.txt" files) string(REGEX REPLACE "\n" ";" files "${files}") set(INSTALL_BASE_DIR "") -# Try and read the base directory of the install. 
This is used to try and -# figure out if the install is in a shared location, or in a user's home -# directory. If the installation is not in the home directory, we don't want to -# force remove the installed files in case it causes unexpected behavior for -# others on the system +# Try and read the base directory of the install. This is used to try and figure +# out if the install is in a shared location, or in a user's home directory. If +# the installation is not in the home directory, we don't want to force remove +# the installed files in case it causes unexpected behavior for others on the +# system foreach(file ${files}) if(file MATCHES "armral.a") - string(REGEX REPLACE "/@CMAKE_INSTALL_LIBDIR@/libarmral.a" "" INSTALL_BASE_DIR "${file}") + string(REGEX REPLACE "/@CMAKE_INSTALL_LIBDIR@/libarmral.a" "" + INSTALL_BASE_DIR "${file}") endif() get_filename_component(INSTALL_PARENT_DIR "${INSTALL_BASE_DIR}" DIRECTORY) endforeach() @@ -25,11 +29,11 @@ function(attempt_dir_removal curr_dir base_dir) endif() file(GLOB RESULT "${curr_dir}/*") list(LENGTH RESULT RES_LEN) - if (RES_LEN EQUAL 0) + if(RES_LEN EQUAL 0) exec_program( - "@CMAKE_COMMAND@" ARGS "-E remove_directory \"${curr_dir}\"" - RETURN_VALUE rm_retval - ) + "@CMAKE_COMMAND@" ARGS + "-E remove_directory \"${curr_dir}\"" + RETURN_VALUE rm_retval) else() break() endif() @@ -41,9 +45,9 @@ foreach(file ${files}) message(STATUS "Uninstalling ${file}") if(IS_SYMLINK "${file}" OR EXISTS "${file}") exec_program( - "@CMAKE_COMMAND@" ARGS "-E remove \"${file}\"" - RETURN_VALUE rm_retval - ) + "@CMAKE_COMMAND@" ARGS + "-E remove \"${file}\"" + RETURN_VALUE rm_retval) if(NOT "${rm_retval}" STREQUAL 0) message(STATUS "Failed to uninstall ${file}") else() diff --git a/docs/doxywrapper/arm_footer.html b/docs/doxywrapper/arm_footer.html index 93fae82a8d40a24c5501732fabc7d661c0d1985a..bba3a7e2f4a8c5e55be33e24e07825a44c532703 100644 --- a/docs/doxywrapper/arm_footer.html +++ b/docs/doxywrapper/arm_footer.html @@ -4,14 +4,14 
@@ diff --git a/docs/doxywrapper/proprietary_notice.html b/docs/doxywrapper/proprietary_notice.html index 931d1028ff762d9bdd7b413fd1776787dca419a0..777989f612216f6ecfb0cce95f65c93bdd703139 100644 --- a/docs/doxywrapper/proprietary_notice.html +++ b/docs/doxywrapper/proprietary_notice.html @@ -47,7 +47,7 @@ document may be the trademarks of their respective owners. Please follow Arm's trademark usage guidelines at https://www.arm.com/company/policies/trademarks.

-

Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved.
+

Copyright © 2020-2025 Arm Limited (or its affiliates). All rights reserved.
Arm Limited. Company 02557590 registered in England.
110 Fulbourn Road, Cambridge, England CB1 9NJ.
(LES-PRE-20349)

diff --git a/docs/examples.md b/docs/examples.md index d99815d8030fe4bac20224b3dddd6c7ce4acd8f3..b3f0d76b904a316c2496fb55620c5878f07879ac 100644 --- a/docs/examples.md +++ b/docs/examples.md @@ -17,7 +17,7 @@ Acceleration Library (ArmRAL). To build the library, use: - git clone -b armral-24.10 https://git.gitlab.arm.com/networking/ral.git + git clone -b armral-25.01 https://git.gitlab.arm.com/networking/ral.git mkdir ral/build cd ral/build cmake .. diff --git a/docs/frontmatter.md b/docs/frontmatter.md index 98a6ac20bcd4f4303da28cc03134132a57b08da6..05ebc17afa9636479e16cf6705df37bca3715160 100644 --- a/docs/frontmatter.md +++ b/docs/frontmatter.md @@ -1,6 +1,6 @@ # Arm RAN Acceleration Library (ArmRAL) Reference Guide -Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. +Copyright © 2020-2025 Arm Limited (or its affiliates). All rights reserved. ## About this book @@ -24,7 +24,7 @@ supplier and give: If you have any comments on content, send an e-mail to errata@arm.com. Give: * The title Arm RAN Acceleration Library Reference Guide. -* The number 102249_2410_00_en. +* The number 102249_2501_00_en. * If applicable, the relevant page number(s) to which your comments refer. * A concise explanation of your comments. @@ -80,7 +80,7 @@ rights reserved. Other brands and names mentioned in this document may be the trademarks of their respective owners. Please follow Arm's trademark usage guidelines at . -Copyright © 2020-2024 Arm Limited (or its affiliates). All rights reserved. +Copyright © 2020-2025 Arm Limited (or its affiliates). All rights reserved. Arm Limited. Company 02557590 registered in England. 
@@ -137,3 +137,4 @@ Issue | Date | Confidentiality | Change 2404-00 | 19 April 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.04 2407-00 | 18 July 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.07 2410-00 | 17 October 2024 | Non-Confidential | Update for Arm RAN Acceleration Library v24.10 +2501-00 | 23 January 2025 | Non-Confidential | Update for Arm RAN Acceleration Library v25.01 diff --git a/examples/block_float_9b_example.c b/examples/block_float_9b_example.c index 1a48eb930537e160ceb62009db2f6cdd13b93d97..793ccd4a82cf93133b2ebc9811686e54860edc7d 100644 --- a/examples/block_float_9b_example.c +++ b/examples/block_float_9b_example.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/examples/fft_cf32_example.c b/examples/fft_cf32_example.c index 4fc4762dbca8012a487ed5b0bd93056b63199c92..fb9fc8efd758f667f1a7aac7b08bc0003a36b9c7 100644 --- a/examples/fft_cf32_example.c +++ b/examples/fft_cf32_example.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/examples/modulation_example.c b/examples/modulation_example.c index 94538ac816aa23de7124c876feaa3840f3bd42ed..021e09d837df5eb7e5efe5edc97faaf63b57d894 100644 --- a/examples/modulation_example.c +++ b/examples/modulation_example.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git 
a/examples/polar_example.cpp b/examples/polar_example.cpp index d2b9f814c9726b5917c6f7bd401ea459d31b487c..e9221e806e690c76583e07eadeaa622a5bd161e4 100644 --- a/examples/polar_example.cpp +++ b/examples/polar_example.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/include/armral.h b/include/armral.h index c495fdec8a169ce30d2a7b2fa9da04a7533a8828..3c8d0b93a9369463aa6d8ec397ce991cf1388fbe 100644 --- a/include/armral.h +++ b/include/armral.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -978,7 +980,7 @@ armral_status armral_cmplx_mat_vec_mult_batch_f32_pa( * @ingroup groupMatrix */ /** - * @addtogroup gen_cmplx_matrix_mult General Complex Matrix-Matrix Multiplication + * @addtogroup gen_cmplx_matmul General Complex Matrix-Matrix Multiplication * @{ * \brief Computes a general matrix-by-matrix multiplication, storing the * result in a destination matrix. @@ -1147,13 +1149,13 @@ armral_status armral_cmplx_matmul_ahb_f32(uint16_t m, uint16_t n, uint16_t k, const armral_cmplx_f32_t *p_src_b, armral_cmplx_f32_t *p_dst); -/** @} end of gen_cmplx_matrix_mult */ +/** @} end of gen_cmplx_matmul */ /** * @ingroup groupMatrix */ /** - * @addtogroup spec_cmplx_matrix_mult Specific-Sized Complex Matrix-Matrix Multiplication + * @addtogroup sp_cplx_mtml Specific-Sized Complex Matrix-Matrix Multiplication * @{ * \brief Computes a specific-sized matrix-by-matrix multiplication, storing the * result in a destination matrix. 
@@ -1284,7 +1286,7 @@ armral_status armral_cmplx_mat_mult_4x4_f32_iq(const float32_t *src_a_re, float32_t *dst_re, float32_t *dst_im); -/** @} end of spec_cmplx_matrix_mult */ +/** @} end of sp_cplx_mtml */ /** * @ingroup groupMatrix @@ -3217,6 +3219,8 @@ typedef enum { * To avoid memory leaks, call \link armral_fft_destroy_plan_cf32 \endlink when * you no longer need this plan. * + * \note This function supports input sizes `n <= 42012`. + * * @param[in,out] p A pointer to the resulting plan pointer. On output `*p` is * a valid pointer, to be passed to * \link armral_fft_execute_cf32 \endlink. @@ -3275,6 +3279,8 @@ armral_status armral_fft_destroy_plan_cf32(armral_fft_plan_t **p); * To avoid memory leaks, call \link armral_fft_destroy_plan_cs16 \endlink when * you no longer need this plan. * + * \note This function supports input sizes `n <= 42012`. + * * @param[in,out] p A pointer to the resulting plan pointer. On output `*p` is * a valid pointer, to be passed to * \link armral_fft_execute_cs16 \endlink. @@ -4011,7 +4017,7 @@ armral_status armral_turbo_encode_block_noalloc(const uint8_t *src, uint32_t k, * This function implements a maximum a posteriori (MAP) algorithm to decode the * output of the LTE Turbo encoding scheme described in 3GPP Technical * Specification (TS) 36.212 "Multiplexing and channel coding". It takes as - * input three arrays `sys`, `par` and `itl`, each of length `k + 4` bits where + * input three arrays `sys`, `par` and `itl`, each of length `k + 4` bytes where * `k` must be one of the values defined in TS 36.212 Table 5.1.3-3. These * three arrays contain the log-likelihood ratios (LLRs) of the systematic, * parity and interleaved parity bits. 
The decoding is performed for a single @@ -4028,19 +4034,19 @@ armral_status armral_turbo_encode_block_noalloc(const uint8_t *src, uint32_t k, * * \note * The function is called in one of two ways: - * - `perm_idxs` is populated by calling `armral_turbo_perm_idx_init` before the - * first call to `armral_turbo_decode_block`. This initialization only happens - * once and the resulting permutation array can be reused in multiple calls to - * `armral_turbo_decode_block`. - * - `perm_idxs` is NULL. In this case `armral_turbo_decode_block` will generate - * the permutation indices during each call. + * - `perm_idxs` is pre-populated by calling \link armral_turbo_perm_idx_init + * \endlink before the first call to `armral_turbo_decode_block`. This + * initialization only happens once and the resulting permutation array can be + * reused in multiple calls to `armral_turbo_decode_block`. + * - `perm_idxs` is a null pointer. In this case `armral_turbo_decode_block` + * will regenerate the permutation indices during every call. * * @param[in] sys The systematic portion of the input of length `k + 4` * bytes representing 8-bit log-likelihood ratios. * @param[in] par The parity portion of the input of length `k + 4` bytes * representing 8-bit log-likelihood ratios. * @param[in] itl The interleaved portion of the input of length `k + 4` - * representing 8-bit log-likelihood ratios. + * bytes representing 8-bit log-likelihood ratios. * @param[in] k Length of the output code block in bits. * @param[out] dst Decoded output data of length `k` bits. * @param[in] max_iter Maximum number of decoding iterations to perform. @@ -4060,7 +4066,7 @@ armral_status armral_turbo_decode_block(const int8_t *sys, const int8_t *par, * This function implements a maximum a posteriori (MAP) algorithm to decode the * output of the LTE Turbo encoding scheme described in 3GPP Technical * Specification (TS) 36.212 "Multiplexing and channel coding". 
It takes as - * input three arrays `sys`, `par` and `itl`, each of length `k + 4` bits where + * input three arrays `sys`, `par` and `itl`, each of length `k + 4` bytes where * `k` must be one of the values defined in TS 36.212 Table 5.1.3-3. These * three arrays contain the log-likelihood ratios (LLRs) of the systematic, * parity and interleaved parity bits. The decoding is performed for a single @@ -4084,19 +4090,20 @@ armral_status armral_turbo_decode_block(const int8_t *sys, const int8_t *par, * * \note * The function is called in one of two ways: - * - `perm_idxs` is populated by calling `armral_turbo_perm_idx_init` before the - * first call to `armral_turbo_decode_block`. This initialization only happens - * once and the resulting permutation array can be reused in multiple calls to - * `armral_turbo_decode_block`. - * - `perm_idxs` is NULL. In this case `armral_turbo_decode_block` will generate - * the permutation indices during each call. + * - `perm_idxs` is pre-populated by calling \link armral_turbo_perm_idx_init + * \endlink before the first call to `armral_turbo_decode_block_noalloc`. This + * initialization only happens once and the resulting permutation array can be + * reused in multiple calls to `armral_turbo_decode_block_noalloc`. + * - `perm_idxs` is a null pointer. In this case + * `armral_turbo_decode_block_noalloc` will regenerate the permutation indices + * during every call. * * @param[in] sys The systematic portion of the input of length `k + 4` * bytes representing 8-bit log-likelihood ratios. * @param[in] par The parity portion of the input of length `k + 4` bytes * representing 8-bit log-likelihood ratios. * @param[in] itl The interleaved portion of the input of length `k + 4` - * representing 8-bit log-likelihood ratios. + * bytes representing 8-bit log-likelihood ratios. * @param[in] k Length of the output code block in bits. * @param[out] dst Decoded output data of length `k` bits. 
* @param[in] max_iter Maximum number of decoding iterations to perform. @@ -4115,11 +4122,152 @@ armral_status armral_turbo_decode_block_noalloc( * decoding of a single code block of length `k`. * * @param[in] k Length of the output code block in bits. - * @param[in] max_iter Maximum number of decoding iterations to perform. * @return The required buffer size in bytes. */ -uint32_t armral_turbo_decode_block_noalloc_buffer_size(uint32_t k, - uint32_t max_iter); +uint32_t armral_turbo_decode_block_noalloc_buffer_size(uint32_t k); + +/** + * This function implements a maximum a posteriori (MAP) algorithm to decode the + * output of the LTE Turbo encoding scheme described in 3GPP Technical + * Specification (TS) 36.212 "Multiplexing and channel coding". It takes as + * input three arrays `sys`, `par` and `itl`, each of length + * `num_blocks * (k + 4)` bytes where `k` must be one of the values defined + * in TS 36.212 Table 5.1.3-3. These three arrays each contain a batch of + * `num_blocks` log-likelihood ratios (LLRs) of the systematic, parity + * and interleaved parity bits respectively. The decoding is performed in + * batches of 8 code blocks, with any remaining blocks decoded individually, + * using \link armral_turbo_decode_block \endlink. + * + * The input arrays for the batch must be uninterleaved and stored + * contiguously, such that element `ki` of block `bi`'s systematic data, for + * example, is located at `sys[bi * (k + 4) + ki]`. + * + * The output is written into the array `dst`, which must contain enough bytes + * to store `num_blocks * k` bits. The output is also uninterleaved and stored + * contiguously, such that byte `ki` of block `bi` is located at + * `dst[bi * (k / 8) + ki]`. These are hard outputs (that is, either 0 or 1); + * the function does not return LLRs. + * + * The function takes a parameter `max_iter`, which specifies the + * maximum number of iterations that the decoder will perform.
The + * algorithm will terminate in fewer iterations if there is no change + * in the computed LLRs between consecutive iterations. + * + * \note + * This function is called in one of two ways: + * - `perm_idxs` is pre-populated by calling \link armral_turbo_perm_idx_init + * \endlink before the first call to `armral_turbo_decode_batch`. This + * initialization only happens once and the resulting permutation array can be + * reused in multiple calls to `armral_turbo_decode_batch`. + * - `perm_idxs` is a null pointer. In this case `armral_turbo_decode_batch` + * will regenerate the permutation indices during every call. + * + * @param[in] num_blocks Number of blocks of data to decode in one call to + * this function. + * @param[in] sys The batched systematic portion of the input of + * length `num_blocks * (k + 4)` bytes representing a + * batch of 8-bit log-likelihood ratios. + * @param[in] par The batched parity portion of the input of + * length `num_blocks * (k + 4)` bytes representing a + * batch of 8-bit log-likelihood ratios. + * @param[in] itl The batched interleaved portion of the input of + * length `num_blocks * (k + 4)` bytes representing a + * batch of 8-bit log-likelihood ratios. + * @param[in] k Length of one output code block in bits. + * @param[out] dst Batched decoded output data of length + * `num_blocks * k` bits. + * @param[in] max_iter Maximum number of decoding iterations to perform. + * @param[in] perm_idxs Buffer containing the permutation indices for all + * `k` generated by an earlier call to + * \link armral_turbo_perm_idx_init \endlink. + * @return An `armral_status` value that indicates success or failure. + */ +armral_status armral_turbo_decode_batch(uint32_t num_blocks, const int8_t *sys, + const int8_t *par, const int8_t *itl, + uint32_t k, uint8_t *dst, + uint32_t max_iter, uint16_t *perm_idxs); + +/** + * Non-allocating variant of \link armral_turbo_decode_batch \endlink. 
+ * + * This function implements a maximum a posteriori (MAP) algorithm to decode the + * output of the LTE Turbo encoding scheme described in 3GPP Technical + * Specification (TS) 36.212 "Multiplexing and channel coding". It takes as + * input three arrays `sys`, `par` and `itl`, each of length + * `num_blocks * (k + 4)` bytes where `k` must be one of the values defined + * in TS 36.212 Table 5.1.3-3. These three arrays each contain a batch of + * `num_blocks` log-likelihood ratios (LLRs) of the systematic, parity + * and interleaved parity bits respectively. The decoding is performed in + * batches of 8 code blocks, with any remaining blocks decoded individually, + * using \link armral_turbo_decode_block_noalloc \endlink. + * + * The input arrays for the batch must be uninterleaved and stored + * contiguously, such that element `ki` of block `bi`'s systematic data, for + * example, is located at `sys[bi * (k + 4) + ki]`. + * + * The output is written into the array `dst`, which must contain enough bytes + * to store `num_blocks * k` bits. The output is also uninterleaved and stored + * contiguously, such that byte `ki` of block `bi` is located at + * `dst[bi * (k / 8) + ki]`. These are hard outputs (that is, either 0 or 1); + * the function does not return LLRs. + * + * The function takes a parameter `max_iter`, which specifies the + * maximum number of iterations that the decoder will perform. The + * algorithm will terminate in fewer iterations if there is no change + * in the computed LLRs between consecutive iterations. + * + * This function takes a pre-allocated buffer (`buffer`) to use internally. + * This variant will not call any system memory allocators. + * + * The buffer must be at least as large as the number of bytes returned by + * calling \link armral_turbo_decode_batch_noalloc_buffer_size \endlink + * with the same value of `k`.
+ * + * \note + * This function is called in one of two ways: + * - `perm_idxs` is pre-populated by calling \link armral_turbo_perm_idx_init + * \endlink before the first call to `armral_turbo_decode_batch_noalloc`. This + * initialization only happens once and the resulting permutation array can be + * reused in multiple calls to `armral_turbo_decode_batch_noalloc`. + * - `perm_idxs` is a null pointer. In this case + * `armral_turbo_decode_batch_noalloc` will regenerate the permutation indices + * during every call. + * + * @param[in] num_blocks Number of blocks of data to decode in one call to + * this function. + * @param[in] sys The batched systematic portion of the input of + * length `num_blocks * (k + 4)` bytes representing a + * batch of 8-bit log-likelihood ratios. + * @param[in] par The batched parity portion of the input of + * length `num_blocks * (k + 4)` bytes representing a + * batch of 8-bit log-likelihood ratios. + * @param[in] itl The batched interleaved portion of the input of + * length `num_blocks * (k + 4)` bytes representing a + * batch of 8-bit log-likelihood ratios. + * @param[in] k Length of one output code block in bits. + * @param[out] dst Batched decoded output data of length + * `num_blocks * k` bits. + * @param[in] max_iter Maximum number of decoding iterations to perform. + * @param[in] perm_idxs Buffer containing the permutation indices for all + * `k` generated by an earlier call to + * \link armral_turbo_perm_idx_init \endlink. + * @param[in] buffer Workspace buffer to be used internally. + * @return An `armral_status` value that indicates success or failure. + */ +armral_status +armral_turbo_decode_batch_noalloc(uint32_t num_blocks, const int8_t *sys, + const int8_t *par, const int8_t *itl, + uint32_t k, uint8_t *dst, uint32_t max_iter, + uint16_t *perm_idxs, void *buffer); + +/** + * Calculates the required buffer size in bytes required to perform Turbo + * decoding of a batch of 8 code blocks, each of length `k`. 
+ * + * @param[in] k Length of one output code block in bits. + * @return The required buffer size in bytes. + */ +uint32_t armral_turbo_decode_batch_noalloc_buffer_size(uint32_t k); /** * Matches the rate of the Turbo encoded code block to the rate of the channel diff --git a/python/benchmark_excel_summary.py b/python/benchmark_excel_summary.py index 8dd3fe8fa4e13d018af416fa9e0fd92d9db6cc25..cae23e985217cccb332a665062ffa0e4e11fc4b3 100755 --- a/python/benchmark_excel_summary.py +++ b/python/benchmark_excel_summary.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause import argparse import json diff --git a/simulation/README.md b/simulation/README.md index 9d17dad5b4acfcecaa5de55c60d5696ec75e2fe8..3b831768395fc4ddb0ee32e23ffdda48d975a92c 100644 --- a/simulation/README.md +++ b/simulation/README.md @@ -227,7 +227,7 @@ You can run the `turbo` coding Additive White Gaussian Noise (AWGN) simulation with the following parameters: turbo_awgn -k num_bits -m mod_type -e num_matched_bits - [-r rv] [-u demod_ulp] [-i iter_max] + [-b num_blocks] [-r rv] [-u demod_ulp] [-i iter_max] For each value of the `Eb / N0` ratio used, a JSON record is written to stdout. 
The JSON record contains the following fields: @@ -235,6 +235,7 @@ The JSON record contains the following fields: { "k": , "e": , + "num_blocks": , "mod_type": , "ulp": , "Eb/N0": , diff --git a/simulation/awgn/awgn.cpp b/simulation/awgn/awgn.cpp index cfd76520693009934287b74c2c9090a2423b33d7..c0a8fb23c8fb499b6af4622233fbd2006640327a 100644 --- a/simulation/awgn/awgn.cpp +++ b/simulation/awgn/awgn.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "awgn.hpp" #include "rng.hpp" diff --git a/simulation/awgn/awgn.hpp b/simulation/awgn/awgn.hpp index ba978e80e4d2848148aff94701d4beb96dc87917..06bcc1c6cb13032047d8ca2fd9cca9ed723cc677 100644 --- a/simulation/awgn/awgn.hpp +++ b/simulation/awgn/awgn.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/simulation/capacity/capacity.py b/simulation/capacity/capacity.py index 496028fa5871daf154d929075c55c2164ff7f808..bb29032378440f163e153410e1c55cb3b1690400 100755 --- a/simulation/capacity/capacity.py +++ b/simulation/capacity/capacity.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause from argparse import ArgumentParser from math import sqrt, exp, pi, log diff --git a/simulation/convolutional_awgn/convolutional_awgn.cpp b/simulation/convolutional_awgn/convolutional_awgn.cpp index 25d939caef75f6d11e4db2af6f0782ae4673cbdc..89f0b9c81f4b97198b2351ee4780eaa807091d00 
100644 --- a/simulation/convolutional_awgn/convolutional_awgn.cpp +++ b/simulation/convolutional_awgn/convolutional_awgn.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "awgn.hpp" @@ -8,7 +10,6 @@ #include "utils/bits_to_bytes.hpp" #include -#include #include #include #include @@ -311,7 +312,7 @@ int main(int argc, char **argv) { // A default ulp value is 128, which removes some low order bits when // demodulating. Care should be taken that this is not too large. In // particular, when this value exceeds half the distance between - // neighbouring symbols, it is no longer possible to tell which symbol has + // neighboring symbols, it is no longer possible to tell which symbol has // been transmitted in the case where we have a noiseless channel. This may // cause the simulation to never decode in an error-free manner. 
ulp = 128; diff --git a/simulation/convolutional_awgn/convolutional_error_rate.py b/simulation/convolutional_awgn/convolutional_error_rate.py index 0a887e61b72c49b015fcaf6d433b7b79e082d1ce..7bc7034ab8655fec414af843ab46a9e68aa2e356 100755 --- a/simulation/convolutional_awgn/convolutional_error_rate.py +++ b/simulation/convolutional_awgn/convolutional_error_rate.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/include/simulation_common.hpp b/simulation/include/simulation_common.hpp index 3a36b98a02cf1ce14acb89ade92162a784061b8c..9665f3c3679c703e137be89565e243841932b43c 100644 --- a/simulation/include/simulation_common.hpp +++ b/simulation/include/simulation_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/simulation/include/simulation_common.py b/simulation/include/simulation_common.py index 1500149d6ff5ba41f8e4082326878cc7fda44912..070e597e487493a7a568fa6de00e773bfb8d1269 100755 --- a/simulation/include/simulation_common.py +++ b/simulation/include/simulation_common.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause from dataclasses import dataclass from datetime import datetime diff --git a/simulation/ldpc_awgn/ldpc_awgn.cpp b/simulation/ldpc_awgn/ldpc_awgn.cpp index 
a7890aef2f03b60d127a5cfcf910609d1942dad3..13f59cd9f76718bd94946fc4ee3a9f7917f3bc6e 100644 --- a/simulation/ldpc_awgn/ldpc_awgn.cpp +++ b/simulation/ldpc_awgn/ldpc_awgn.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "awgn.hpp" @@ -8,7 +10,6 @@ #include "utils/bits_to_bytes.hpp" #include -#include #include #include #include @@ -225,7 +226,7 @@ int run_check(armral::utils::random_state *state, uint32_t z, armral_ldpc_decode_block(data->data_recovered, bg, z, ARMRAL_LDPC_NO_CRC, 10, data->data_decoded); - // To make it easier to compare the values, convert the bit array to a byte // + // To make it easier to compare the values, convert the bit array to a byte // array armral::bits_to_bytes(data->len_out, data->data_decoded, data->data_decoded_bytes); @@ -424,7 +425,7 @@ int main(int argc, char **argv) { // A default ulp value is 128, which removes some low order bits when // demodulating. Care should be taken that this is not too large. In // particular, when this value exceeds half the distance between - // neighbouring symbols, it is no longer possible to tell which symbol has + // neighboring symbols, it is no longer possible to tell which symbol has // been transmitted in the case where we have a noise-free channel. This may // cause the simulation to never decode in an error-free manner. 
ulp = 128; diff --git a/simulation/ldpc_awgn/ldpc_error_rate.py b/simulation/ldpc_awgn/ldpc_error_rate.py index 0eb6643a35d12bf0601f9519f3860dd46223e3c7..568ded95e9c2db19ae04ce9af64f82c7d9c2ff10 100755 --- a/simulation/ldpc_awgn/ldpc_error_rate.py +++ b/simulation/ldpc_awgn/ldpc_error_rate.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/modulation_awgn/modulation_awgn.cpp b/simulation/modulation_awgn/modulation_awgn.cpp index 79f1001a731567c190f7705e5a9156aed78f420a..06a76b3ad399d8572ac3a1df72df2246b399f278 100644 --- a/simulation/modulation_awgn/modulation_awgn.cpp +++ b/simulation/modulation_awgn/modulation_awgn.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "awgn.hpp" #include "simulation_common.hpp" @@ -229,7 +231,7 @@ int main(int argc, char **argv) { // A default ulp value is 128, which removes some low order bits when // demodulating. Care should be taken that this is not too large. In // particular, when this value exceeds half the distance between - // neighbouring symbols, it is no longer possible to tell which symbol has + // neighboring symbols, it is no longer possible to tell which symbol has // been transmitted in the case where we have a noiseless channel. This may // cause the simulation to never decode in an error-free manner. 
ulp = 128; diff --git a/simulation/modulation_awgn/modulation_error_rate.py b/simulation/modulation_awgn/modulation_error_rate.py index 14ff20cdf4abd42d5c95f83f855b041c7a435567..8f1c3b812662e4ce7fdc24750ef3d3e219843000 100755 --- a/simulation/modulation_awgn/modulation_error_rate.py +++ b/simulation/modulation_awgn/modulation_error_rate.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause from argparse import ArgumentParser import pandas as pd diff --git a/simulation/polar_awgn/polar_awgn.cpp b/simulation/polar_awgn/polar_awgn.cpp index 6c179bae2d7b32e2646a7c81169ea6e3b8636042..4150f901a1b44c6cfac7608e2e541f590d0207bb 100644 --- a/simulation/polar_awgn/polar_awgn.cpp +++ b/simulation/polar_awgn/polar_awgn.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "awgn.hpp" @@ -10,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -68,7 +69,6 @@ void usage(const char *exe_name) { << " Default value is 128.\n" << " Number of possible decodings to return.\n" << " Supported values are: " << print_valid_l() - << ".\n" << " Default value is 1.\n" << std::endl; } @@ -434,7 +434,7 @@ int main(int argc, char **argv) { // A default ulp value is 128, which removes some low order bits when // demodulating. Care should be taken that this is not too large. 
In // particular, when this value exceeds half the distance between - // neighbouring symbols, it is no longer possible to tell which symbol has + // neighboring symbols, it is no longer possible to tell which symbol has // been transmitted in the case where we have a noiseless channel. This may // cause the simulation to never decode in an error-free manner. ulp = 128; diff --git a/simulation/polar_awgn/polar_error_rate.py b/simulation/polar_awgn/polar_error_rate.py index f8a76cbf7fdb05a8b9ac16250998fb78f66e94a4..ffe35ac380765d6b680da9adabc4d63f946daf09 100755 --- a/simulation/polar_awgn/polar_error_rate.py +++ b/simulation/polar_awgn/polar_error_rate.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause from argparse import ArgumentParser from dataclasses import dataclass diff --git a/simulation/turbo_awgn/turbo_awgn.cpp b/simulation/turbo_awgn/turbo_awgn.cpp index 230427a17b0ac7d288d0ce52a0df7f30e3c2350d..1c02f387710e65dec55eb7ba13cde4c30328823e 100644 --- a/simulation/turbo_awgn/turbo_awgn.cpp +++ b/simulation/turbo_awgn/turbo_awgn.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "awgn.hpp" @@ -8,7 +10,6 @@ #include "utils/bits_to_bytes.hpp" #include -#include #include #include #include @@ -83,15 +84,20 @@ std::string valid_rv_str() { void usage(const char *exe_name) { std::cout << "Usage: " << exe_name - << " -k num_bits -m mod_type -e num_matched_bits [-r rv] [-u demod_ulp] " - << "[-i iter_max]\n\n" + << " -k num_bits -m mod_type -e num_matched_bits [-b num_blocks] [-r rv]" + << " [-u demod_ulp] [-i 
iter_max]\n\n" << "The arguments required by " << exe_name << " are:\n\n" - << " Number of bits in the encoded message.\n" - << " This must be one of:\n" + << " Number of bits in each block of the encoded\n" + << " message. This must be one of:\n" << print_valid_block_size("\t\t\t") << " Type of modulation. Supported values are:\n" << armral::simulation::print_valid_mod_type(3) << " Number of bits in the rate-matched message.\n" + << " Number of blocks of data to decode. Values\n" + << " greater than or equal to 8 will use the\n" + << " 8-block batched turbo decoder, with any\n" + << " remaining blocks decoded with the single turbo\n" + << " decoder. Default value is 1.\n" << " The redundancy version used for rate matching\n" << " and recovery. Supported values are:\n" << " " << valid_rv_str() @@ -109,6 +115,7 @@ void usage(const char *exe_name) { } struct turbo_example_data { + uint16_t num_blocks; // Number of blocks. All other lengths are per block. uint32_t len_in; // k, the number of bits in the input block uint32_t len_encoded; // length (in bits) of the outputs of the encoder uint32_t len_matched; // length (in bits) of the rate-matched message @@ -132,8 +139,10 @@ struct turbo_example_data { uint8_t *data_decoded_bytes; // the decoded data, one byte per input bit uint16_t *permutation_indices; // buffer to hold all permutation indices - turbo_example_data(uint32_t k, armral_modulation_type mod, uint32_t e, - uint32_t r, uint16_t *perm_idxs) { + turbo_example_data(uint32_t size_of_batch, uint32_t k, + armral_modulation_type mod, uint32_t e, uint32_t r, + uint16_t *perm_idxs) { + num_blocks = size_of_batch; mod_type = mod; len_in = k; len_encoded = k + 4; @@ -141,21 +150,22 @@ struct turbo_example_data { rv = r; len_out = k; permutation_indices = perm_idxs; - data_in = SNEW(uint8_t, (len_in + 7) / 8); - data_in_bytes = SNEW(uint8_t, k); - sys_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); - par_encoded = SNEW(uint8_t, (len_encoded + 7) / 8); - itl_encoded = 
SNEW(uint8_t, (len_encoded + 7) / 8); - data_matched = SNEW(uint8_t, (len_matched + 7) / 8); + data_in = SNEW(uint8_t, num_blocks * (len_in + 7) / 8); + data_in_bytes = SNEW(uint8_t, num_blocks * k); + sys_encoded = SNEW(uint8_t, num_blocks * (len_encoded + 7) / 8); + par_encoded = SNEW(uint8_t, num_blocks * (len_encoded + 7) / 8); + itl_encoded = SNEW(uint8_t, num_blocks * (len_encoded + 7) / 8); + data_matched = SNEW(uint8_t, num_blocks * (len_matched + 7) / 8); bit_per_symbol = armral::simulation::bits_per_symbol(mod_type); num_mod_symbols = (len_matched + bit_per_symbol - 1) / bit_per_symbol; - data_mod = SNEW(armral_cmplx_int16_t, num_mod_symbols); - sys_recovered = SNEW(int8_t, len_encoded); - par_recovered = SNEW(int8_t, len_encoded); - itl_recovered = SNEW(int8_t, len_encoded); - data_demod_soft = SNEW(int8_t, num_mod_symbols * bit_per_symbol); - data_decoded = SNEW(uint8_t, (len_out + 7) / 8); - data_decoded_bytes = SNEW(uint8_t, len_out); + data_mod = SNEW(armral_cmplx_int16_t, num_blocks * num_mod_symbols); + sys_recovered = SNEW(int8_t, num_blocks * len_encoded); + par_recovered = SNEW(int8_t, num_blocks * len_encoded); + itl_recovered = SNEW(int8_t, num_blocks * len_encoded); + data_demod_soft = + SNEW(int8_t, num_blocks * num_mod_symbols * bit_per_symbol); + data_decoded = SNEW(uint8_t, num_blocks * (len_out + 7) / 8); + data_decoded_bytes = SNEW(uint8_t, num_blocks * len_out); } ~turbo_example_data() { @@ -175,89 +185,138 @@ struct turbo_example_data { } }; +// A structure to encapsulate the bit/block error counts from one simulation +struct turbo_error_counts { + uint32_t num_bit_errors; + uint32_t num_block_errors; + + turbo_error_counts() { + num_bit_errors = 0; + num_block_errors = 0; + } +}; + // Perform an end-to-end encoding, rate matching, modulation, transmission, // demodulation, rate recovery, and decoding and count the number of errors -int run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, - uint32_t iter_max, 
turbo_example_data *data) { - // Init data - memset(data->data_in, 0, (data->len_in + 7) / 8 * sizeof(uint8_t)); - for (uint32_t i = 0; i < data->len_in; ++i) { - uint8_t bit = static_cast( - armral::utils::linear_congruential_generator{}.one(state)); - uint16_t byte_ind = i / 8; - // The most significant bit is the first bit (in wire order). Not sure if - // that is an issue with randomly generated data, but we are paying - // attention to it here. - uint16_t idx = 7 - (i % 8); - data->data_in[byte_ind] |= bit << idx; +void run_check(armral::utils::random_state *state, double snr_db, uint32_t ulp, + uint32_t iter_max, turbo_example_data *data, + turbo_error_counts *results) { + // Setup num_blocks blocks of data, one block at a time + for (uint32_t batch_idx = 0; batch_idx < data->num_blocks; ++batch_idx) { + uint32_t encoded_offset = batch_idx * ((data->len_encoded + 7) / 8); + uint32_t recovered_offset = batch_idx * data->len_encoded; + uint32_t matched_offset = batch_idx * ((data->len_matched + 7) / 8); + uint32_t in_offset = batch_idx * ((data->len_in + 7) / 8); + uint32_t mod_offset = batch_idx * data->num_mod_symbols; + uint32_t demod_soft_offset = + batch_idx * data->num_mod_symbols * data->bit_per_symbol; + + // Init data + memset(data->data_in + in_offset, 0, + (data->len_in + 7) / 8 * sizeof(uint8_t)); + for (uint32_t i = 0; i < data->len_in; ++i) { + uint8_t bit = static_cast( + armral::utils::linear_congruential_generator{}.one(state)); + uint16_t byte_ind = i / 8; + // The most significant bit is the first bit (in wire order). Not sure if + // that is an issue with randomly generated data, but we are paying + // attention to it here. 
+ uint16_t idx = 7 - (i % 8); + data->data_in[in_offset + byte_ind] |= bit << idx; + } + + // Run turbo encoding for a single block + armral_turbo_encode_block(data->data_in + in_offset, data->len_in, + data->sys_encoded + encoded_offset, + data->par_encoded + encoded_offset, + data->itl_encoded + encoded_offset); + + // Run turbo rate matching. This performs the operations described in + // section 5.1.4 of TS 36.212 to match the rate of the encoded code block to + // the rate of the channel. The output is an array which stores e bits. + armral_turbo_rate_matching(data->len_encoded, data->len_matched, data->rv, + data->sys_encoded + encoded_offset, + data->par_encoded + encoded_offset, + data->itl_encoded + encoded_offset, + data->data_matched + matched_offset); + + // Run modulation + armral_modulation(data->num_mod_symbols * data->bit_per_symbol, + data->mod_type, data->data_matched + matched_offset, + data->data_mod + mod_offset); + + // AWGN channel effects - add some noise to all the encoded bits + armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, + ARMRAL_FIXED_POINT_INDEX_Q2_13, + data->data_mod + mod_offset); + + // Run demodulation + armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, + data->data_mod + mod_offset, + data->data_demod_soft + demod_soft_offset); + + // The LLRs are updated by rate recovery and must be zero the first time + // rate recovery is performed. Since different input data is created for + // every loop iteration, we need to reset the LLRs each time. + memset(data->sys_recovered + recovered_offset, 0, data->len_encoded); + memset(data->par_recovered + recovered_offset, 0, data->len_encoded); + memset(data->itl_recovered + recovered_offset, 0, data->len_encoded); + + // Run turbo rate recovery. This performs the inverse operations of rate + // matching to output the rate-recovered LLRs. 
+ armral_turbo_rate_recovery(data->len_encoded, data->len_matched, data->rv, + data->data_demod_soft + demod_soft_offset, + data->sys_recovered + recovered_offset, + data->par_recovered + recovered_offset, + data->itl_recovered + recovered_offset); } - // Run turbo encoding for a single block - armral_turbo_encode_block(data->data_in, data->len_in, data->sys_encoded, - data->par_encoded, data->itl_encoded); - - // Run turbo rate matching. This performs the operations described in - // section 5.1.4 of TS 36.212 to match the rate of the encoded code block to - // the rate of the channel. The output is an array which stores e bits. - armral_turbo_rate_matching(data->len_encoded, data->len_matched, data->rv, - data->sys_encoded, data->par_encoded, - data->itl_encoded, data->data_matched); - - // Run modulation - armral_modulation(data->num_mod_symbols * data->bit_per_symbol, - data->mod_type, data->data_matched, data->data_mod); - - // AWGN channel effects - add some noise to all the encoded bits - armral::simulation::add_awgn(state, data->num_mod_symbols, snr_db, - ARMRAL_FIXED_POINT_INDEX_Q2_13, data->data_mod); - - // Run demodulation - armral_demodulation(data->num_mod_symbols, ulp, data->mod_type, - data->data_mod, data->data_demod_soft); - - // The LLRs are updated by rate recovery and must be zero the first time rate - // recovery is performed. Since different input data is created for every loop - // iteration, we need to reset the LLRs each time. - memset(data->sys_recovered, 0, data->len_encoded); - memset(data->par_recovered, 0, data->len_encoded); - memset(data->itl_recovered, 0, data->len_encoded); - - // Run turbo rate recovery. This performs the inverse operations of rate - // matching to output the rate-recovered LLRs. - armral_turbo_rate_recovery(data->len_encoded, data->len_matched, data->rv, - data->data_demod_soft, data->sys_recovered, - data->par_recovered, data->itl_recovered); - - // Run turbo decoding for a single block. 
- armral_turbo_decode_block( - data->sys_recovered, data->par_recovered, data->itl_recovered, - data->len_out, data->data_decoded, iter_max, data->permutation_indices); + // Run turbo decoding for num_blocks blocks + armral_turbo_decode_batch(data->num_blocks, data->sys_recovered, + data->par_recovered, data->itl_recovered, + data->len_out, data->data_decoded, iter_max, + data->permutation_indices); + + results->num_bit_errors = 0; + results->num_block_errors = 0; // To make it easier to compare the values, convert the bit array to a byte // array - armral::bits_to_bytes(data->len_out, data->data_decoded, - data->data_decoded_bytes); - - // Check the number of errors in decoding - int num_errors = 0; - armral::bits_to_bytes(data->len_in, data->data_in, data->data_in_bytes); - for (uint32_t i = 0; i < data->len_in; ++i) { - if (data->data_decoded_bytes[i] != data->data_in_bytes[i]) { - num_errors++; + for (uint32_t b = 0; b < data->num_blocks; ++b) { + uint32_t decoded_offset = b * (data->len_out + 7) / 8; + uint32_t in_offset = b * (data->len_in + 7) / 8; + + armral::bits_to_bytes(data->len_out, data->data_decoded + decoded_offset, + data->data_decoded_bytes + b * data->len_out); + + // Check the number of errors in decoding + armral::bits_to_bytes(data->len_in, data->data_in + in_offset, + data->data_in_bytes + b * data->len_in); + + bool block_error = false; + for (uint32_t i = 0; i < data->len_in; ++i) { + if (data->data_decoded_bytes[i + b * data->len_out] != + data->data_in_bytes[i + b * data->len_in]) { + results->num_bit_errors++; + block_error = true; + } + } + if (block_error) { + results->num_block_errors++; } } - return num_errors; } struct sim_result { sim_result(uint32_t k_in, uint32_t e_in, armral_modulation_type mod, uint32_t ulp_in, double ebn0_in, double snr_in, uint32_t iter_max_in, uint32_t nb, uint32_t nm, - uint32_t num_messages) + uint32_t num_messages, uint32_t num_blocks_in) : k(k_in), e(e_in), mod_type(armral::simulation::mod_to_str(mod)), 
ulp(ulp_in), ebn0(ebn0_in), snr(snr_in), iter_max(iter_max_in), bler(static_cast(nm) / num_messages), - ber(static_cast(nb) / (num_messages * k)) {} + ber(static_cast(nb) / (num_messages * k)), + num_blocks(num_blocks_in) {} uint32_t k; uint32_t e; @@ -268,12 +327,14 @@ struct sim_result { uint32_t iter_max; double bler; double ber; + uint32_t num_blocks; std::string to_str() const { std::ostringstream s; s.precision(10); s.setf(std::ios::fixed, std::ios::floatfield); - s << "{\"k\": " << k << ", \"e\": " << e << ", \"mod_type\": \"" << mod_type + s << "{\"k\": " << k << ", \"e\": " << e + << ", \"num_blocks\": " << num_blocks << ", \"mod_type\": \"" << mod_type << "\", \"ulp\": " << ulp << ", \"Eb/N0\": " << ebn0 << ", \"snr\": " << snr << ", \"iter_max\": " << iter_max << ", \"bler\": " << bler << ", \"ber\": " << ber << "}"; @@ -283,7 +344,7 @@ struct sim_result { bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, uint32_t e, uint32_t rv, uint16_t ulp, double ebn0_db, - uint16_t *perm_idxs) { + uint16_t *perm_idxs, uint32_t num_blocks) { // Compute SNR in dB // The coding rate is the ratio of input information bits, k, to the number of // rate-matched bits, e. @@ -306,24 +367,25 @@ bool run_snr(uint32_t k, uint32_t iter_max, armral_modulation_type mod_type, uint64_t nr = 1e4; #pragma omp parallel reduction(+ : nb, num_message_errors) { - turbo_example_data data(k, mod_type, e, rv, perm_idxs); -#pragma omp for - for (uint64_t r = 0; r < nr; ++r) { + turbo_example_data data(num_blocks, k, mod_type, e, rv, perm_idxs); +#pragma omp for schedule(static) + for (uint64_t r = 0; r < nr / num_blocks; ++r) { auto state = armral::utils::random_state::from_seeds({r, nr_total}); - uint32_t num_bit_errors = - run_check(&state, snr_db, ulp, iter_max, &data); - nb += num_bit_errors; - num_message_errors += num_bit_errors == 0 ? 
0 : 1; + turbo_error_counts results; + run_check(&state, snr_db, ulp, iter_max, &data, &results); + nb += results.num_bit_errors; + num_message_errors += results.num_block_errors; } } - nr_total += nr; + // Account for cases where nr is not divisible by num_blocks + nr_total += (nr / num_blocks) * num_blocks; } double message_error_rate = static_cast(num_message_errors) / nr_total; // Write out data in JSON format std::cout << sim_result(k, e, mod_type, ulp, ebn0_db, snr_db, iter_max, nb, - num_message_errors, nr_total) + num_message_errors, nr_total, num_blocks) .to_str() << std::endl; @@ -341,6 +403,7 @@ int main(int argc, char **argv) { uint16_t ulp = 0; uint32_t iter_max = 10; armral_modulation_type mod_type = ARMRAL_MOD_256QAM; + uint32_t num_blocks = 1; bool is_k_set = false; bool is_mod_set = false; bool is_e_set = false; @@ -348,7 +411,7 @@ int main(int argc, char **argv) { // Parse arguments int option; - while ((option = getopt(argc, argv, "k:m:e:r:u:i:")) != -1) { + while ((option = getopt(argc, argv, "k:m:e:r:u:i:b:")) != -1) { switch (option) { case 'k': k = (uint32_t)atoi(optarg); @@ -371,6 +434,9 @@ int main(int argc, char **argv) { case 'i': iter_max = (uint32_t)atoi(optarg); break; + case 'b': + num_blocks = (uint32_t)atoi(optarg); + break; default: print_usage = true; } @@ -411,7 +477,7 @@ int main(int argc, char **argv) { // A default ulp value is 128, which removes some low order bits when // demodulating. Care should be taken that this is not too large. In // particular, when this value exceeds half the distance between - // neighbouring symbols, it is no longer possible to tell which symbol has + // neighboring symbols, it is no longer possible to tell which symbol has // been transmitted in the case where we have a noiseless channel. This may // cause the simulation to never decode in an error-free manner. 
ulp = 128; @@ -422,9 +488,9 @@ int main(int argc, char **argv) { armral_turbo_perm_idx_init(perm_idxs); for (double snr = -2; - run_snr(k, iter_max, mod_type, e, rv, ulp, snr, perm_idxs); snr += 0.5) { + run_snr(k, iter_max, mod_type, e, rv, ulp, snr, perm_idxs, num_blocks); + snr += 0.5) { } - free(perm_idxs); return 0; } diff --git a/simulation/turbo_awgn/turbo_error_rate.py b/simulation/turbo_awgn/turbo_error_rate.py index 51cd9fdf25171523755ddbdd2b41012c6aac8268..130894243eea1b041b86950b4f866f472cb637fc 100755 --- a/simulation/turbo_awgn/turbo_error_rate.py +++ b/simulation/turbo_awgn/turbo_error_rate.py @@ -1,6 +1,8 @@ #!/usr/bin/env python3 # Arm RAN Acceleration Library -# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its +# affiliates +# SPDX-License-Identifier: BSD-3-Clause from argparse import ArgumentParser from dataclasses import dataclass @@ -21,9 +23,9 @@ from simulation_common import (plot_awgn_table, # noqa: E402 @dataclass class PlotFormattedTurbo(PlotFormattedBase): def _legend_label_str(self, labels): - # we have labels of the form: k, e, mod_type, ulp, coding rate, iter_max - assert len(labels) == 6 - widths = [7, 7, 10, 5, 13, 9] + # we have labels of the form: k, e, num_blocks, mod_type, ulp, coding rate, iter_max + assert len(labels) == 7 + widths = [7, 7, 12, 10, 5, 13, 9] def to_str(x): if isinstance(x, float): @@ -40,7 +42,7 @@ def plot_graph(vals, error_rate, field, k, x, bw, save_plot, mod_vals): # Definitions for creating the pivot table index = [x] if x == "snr" else ["Eb/N0"] - cols = ["k", "e", "mod_type", "ulp", "coding rate", "iter_max"] + cols = ["k", "e", "num_blocks", "mod_type", "ulp", "coding rate", "iter_max"] # Filter data on the k, if given filters = {"k": k} diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp index 
591c01da1fa01c9076bff76cf4b8ac319017ceb1..0e2d7f64fa3979e740ab01c1d80a2f7ecbd1331d 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp index 7105e2ba2a58b4150c7fd65b63f8d4e913c1b648..4d5db0a1187c0f927ee3b145f8e1d713b90ee86d 100644 --- a/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp +++ b/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" @@ -263,8 +265,8 @@ static inline void sve_invert_matrix_2x2_impl(svfloat32_t ab, svfloat32_t cd, // For a 2x2 matrix | a b | // | c d | // the inverse is - // | d * conj(det) -b * conj(det) | * 1 / (Re(det)^2 + Im(det)^2) - // | -c * conj(det) a * conj(det) | + // | d * conj(det) -b * conj(det) | * 1 / (Re(det)^2 + Im(det)^2) + // | -c * conj(det) a * conj(det) | svbool_t ptrue = svptrue_b32(); @@ -394,9 +396,10 @@ mat_inverse<3, 3, 3>::invert_matrix(const armral_cmplx_f32_t *__restrict p_src, // | g h i | // the inverse is - // | (ei - fh) * conj(det) (ch - bi) * conj(det) (bf - ce) * conj(det) | * 1/(Re(det)^2 + Im(det)^2) + // | (ei - fh) * conj(det) (ch - bi) * conj(det) (bf - ce) * conj(det) | // | (fg - di) * conj(det) (ai - cg) * conj(det) (cd - af) * conj(det) | // | (dh - eg) * conj(det) (bg - ah) * conj(det) (ae - bd) * conj(det) | 
+ // * 1/(Re(det)^2 + Im(det)^2) svfloat32_t ab = svld1rq_f32(ptrue, src); svfloat32_t cd = svld1rq_f32(ptrue, src + 4); @@ -460,15 +463,20 @@ mat_inverse<3, 3, 3>::invert_matrix(const armral_cmplx_f32_t *__restrict p_src, abs = svadd_f32_x(ptrue, abs, svrev_f32(abs)); svfloat32_t det_abs_recip = svdivr_n_f32_x(ptrue, abs, 1.0F); - // [ (ei - fg) * conj(det) (ch - bi) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) + // [ (ei - fh) * conj(det), + // (ch - bi) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) svfloat32_t res_00_01 = sve_mul_cmplx_f32<0, 270>(ptrue, det, vec_00_01); - // [ (bf - ce) * conj(det) (fg - di) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) + // [ (bf - ce) * conj(det), + // (fg - di) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) svfloat32_t res_02_10 = sve_mul_cmplx_f32<0, 270>(ptrue, det, vec_02_10); - // [ (ai - cg) * conj(det) (cd - af) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) + // [ (ai - cg) * conj(det), + // (cd - af) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) svfloat32_t res_11_12 = sve_mul_cmplx_f32<0, 270>(ptrue, det, vec_11_12); - // [ (dh - eg) * conj(det) (bg - ah) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) + // [ (dh - eg) * conj(det), + // (bg - ah) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) svfloat32_t res_20_21 = sve_mul_cmplx_f32<0, 270>(ptrue, det, vec_20_21); - // [ (ae - db) * conj(det) (bd - ae) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) + // [ (ae - db) * conj(det), + // (bd - ae) * conj(det) ] * 1 / (Re(det)^2 + Im(det)^2) svfloat32_t res_22 = sve_mul_cmplx_f32<0, 270>(ptrue, det, vec_22); svst1_f32(p4, dst, svmul_f32_x(ptrue, res_00_01, det_abs_recip)); diff --git a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp index 49eebadac455de28c61b5a323c80e0619bfd4841..e5946183a0ca73eceaa5f3bd168fadab7ad14810 100644 --- a/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp +++
b/src/BasicMathFun/MatrixInv/cmplx_hermitian_mat_inversion_f32.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ namespace armral::cmplx_herm_mat_inv { diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c index 8225ce49ea507889d62e41dfc08b3a7822ca4128..60ea425c65011e6b27065f9ef75359277a3f1eb3 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c index e8512f362df882ea4e262cb86c92f8e6f367c18a..8ecd3eccbc064936fb2153bb067e9544d812e0f3 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c index f7fa1d50676624b8277d92670727789ae430c22a..289f02a89d70aeffc421285327521bdc2ff098fd 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c +++ 
b/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp index 88b337b46092c98b06bdca4ff85678cafb9ed10c..8f5bcd3f818c0b26432e77f7d8d69df1cf4bead5 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp index 440c22a065e96c6c2f60e09f0d54beb417b8e680..650c35efe5ad5de92746744460ee13f97bdea0cc 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp index 996670f4601ed3e941a145456d7d1a440667c87e..fc3ca712da7c805c01ac96e1f699c94bccca94d9 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp @@ -1,6 +1,8 @@ /* Arm RAN 
Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp index 3710ca02a4db28af80d83079f78caf6926f60ac9..12489a8fc43eac8a334dbecb8f0efaa516fb5111 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cmplx_matmul_i16.hpp" diff --git a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp index b0fe41f4cca00794d2a9fd1936fd38282f360461..51bc0e7ff21099ab79c5aa75ee0ba62d85dbef19 100644 --- a/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp +++ b/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cmplx_matmul_i16_32bit.hpp" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c index 18124b58c9af8a43f6e1d07b194e05353c386285..76c8a51e3d4964e3d91acff73b4f18a3665c262c 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "arm_solve_1sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h index c542d2e6752cc1ff7682b7e45b8b60e529e15a3a..f742bc41568e2f5b297b90f90aa0eee02ebc1d66 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_1sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_1sc.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c index e9a43c23c5b7d0f4c11543c308c5d3ef8551ed33..0b1009df2c7e9ab40c131ca85ebede5661de88d7 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "arm_solve_4sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h index f6854b7a6e8e46c537b15da3aae5ccce10778c1d..c01e1e2eb3cfe2a761ba6caaa324daa3842bc371 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_4sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_4sc.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c 
b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c index 94dfcdca0364deeb1f991918db44725220965343..5d7e85c6250a99e32ae94ec0f2e06de486227be3 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "arm_solve_6sc.h" #include "arm_solve_convert.h" diff --git a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h index 249dbae601b20ea8d6733bf2215541e09d6c8ea4..fab9c4064bc15b8bfc7824de68336762e4f8c973 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_6sc.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_6sc.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_convert.h b/src/BasicMathFun/MatrixMult/arm_solve_convert.h index cd8cb13ba6c589fa7622ce0b774693528fcb4227..840b99401233195f2eaee122de1776d8680247db 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_convert.h +++ b/src/BasicMathFun/MatrixMult/arm_solve_convert.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/arm_solve_f32.c b/src/BasicMathFun/MatrixMult/arm_solve_f32.c index 62cf87a02d03b678dfd9ab71692ca960091cf8bd..3eb5de64b6231f536c004d55611ec60c3677f56c 100644 --- a/src/BasicMathFun/MatrixMult/arm_solve_f32.c +++ b/src/BasicMathFun/MatrixMult/arm_solve_f32.c 
@@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "arm_solve_1sc.h" #include "arm_solve_4sc.h" diff --git a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp index 9b8b08fd0e4d8bf64d5eb290e327e3c2fb22e2c8..5ed2fa64a1bcc4dfde63c99fada3501097dbcecf 100644 --- a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp +++ b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp index 87acc068808e029ae7ff084d483c587934c411de..db513267723309f73e0356b8da749582d72f40a1 100644 --- a/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp +++ b/src/BasicMathFun/MatrixMult/cmplx_matmul_i16_32bit.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp index 41547d190d185971d50b3e05515201fb1750d353..eab2c141db641d0ca0fa9e280be985c307ce4a2c 100644 --- a/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp +++ b/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 
2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp index c4d80719c7bdde8d28d497b8f21b4c48e2e6c3f2..2a6edadee36f4a8ce545fd9231a4b5dbcd06aedb 100644 --- a/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp +++ b/src/BasicMathFun/MatrixPseudoInv/cmplx_mat_pseudo_inverse.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ namespace armral::cmplx_mat_pseudo_inv { diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c index ee71134a30f3f346e0bc35f29f2aadf0e7089077..a038049b6551606e3a92a0075e08a51fab1f5802 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c index fa9b224bbd7501d865cf4f89f0580ff2a22d42f7..3c29ffb1e522b14edd0eb51e0b6687d76e08dcca 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: 
Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c index 7fad68349e8256964189fa664969327fdb4768f3..6395382e6fcbb563c5dd97078de0040f33866c85 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c index ed3a538106f6520ad4aa4d5d4a4abb34634622be..48b17b4b535c558ca7f758f9ac1793eeadef693d 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c index 190e459df1cdd57ff869103ab734374e12e852e9..18f16e3c6892a495027273a43d63a2e845e51d49 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + 
affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c index 60aec7ff50d616d11ece7ba96950abff688fa93a..9c0a2ac43ad680881b44eb6759354853e3d2575d 100644 --- a/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c +++ b/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c index 87e9636594bfefd1c1b7ff0cf283cc339d0d29c8..9eb62711c4c88131169fb283a21565d4df9f7d17 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c index fb1a218eb0c87f60d55bf9bd6d4b98fe7b5bf8b2..25ba2c3da398be672af129afa25e9bd85a6f890b 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #ifdef 
ARMRAL_ARCH_SVE diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp index 3eb94d52134b402350d784c4ecf45f436ed8838a..4d5b6972661f3a2489af0d75e2dde3713f93cd4e 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c index 3b3cd28517b75fb6cf786c21430e6571fed0e5f6..983ba1fbc6c24b43cd07ad774401f460072480f6 100644 --- a/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c +++ b/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp index c30b4e1fc16d7af06e7767cf7908785bde24b3ed..3d1407e4c289cba937394fb2236819d8089d0856 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #ifdef ARMRAL_ARCH_SVE diff --git 
a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp index 3679a899a789d7806da28aeb72abf53b341d2e2c..f300b176333254ff34a989d4117c1ac6380b6b78 100644 --- a/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp +++ b/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp index be7e41ebbd66001c0f3552a03a5019bcb96769c5..df86c8581db57c3cc01464d1b2a749621c00f691 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp index d6fc89cfea3f5f71dfe6ee710289eb61833cbfe1..21ef6345c0eb49d7c1f443aa83834e1d35a3b77f 100644 --- a/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp +++ b/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git 
a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp index 14f7488f5454c9dd209438ee4a33485da4ce2882..5e13bf13d780ac8480429d6b3779a58817a655c1 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #if ARMRAL_ARCH_SVE >= 2 diff --git a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp index 01caaf5b00348f8cbddac1eba19c216a739a871c..ded4fb974acbeb8b616a05705d778afe3771a843 100644 --- a/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp +++ b/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/DuRuInterface/bit_packing_common.hpp b/src/DuRuInterface/bit_packing_common.hpp index 070318a968cbc32a792e8e9f5da94a5bc1fe6800..cc8a7919467cf024a3ada02d2768a346bcf71f06 100644 --- a/src/DuRuInterface/bit_packing_common.hpp +++ b/src/DuRuInterface/bit_packing_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -161,7 +163,7 @@ static inline void pack_9bit_and_store_int16(const 
int16x8x3_t reg[], // The final element is now correct, so store it vst1_lane_s8(dst + 8, vreinterpret_s8_u8(d18), 7); - // ORR d07 with [00000000 | a0000000 | bb000000 | ccc00000 | ...] and store + // ORR d07 with [00000000 | a0000000 | bb000000 | ccc00000 | ...] d07 = vorr_u8(d07, vext_u8(vdup_n_u8(0), d18, 7)); vst1_s8(dst, vreinterpret_s8_u8(d07)); diff --git a/src/DuRuInterface/bit_unpacking_common.hpp b/src/DuRuInterface/bit_unpacking_common.hpp index d30c7dd2b0dba748d7a360fdd7a3bcdbefc806af..9c29a22ebd30e816af66b0081379aee411cfe9d7 100644 --- a/src/DuRuInterface/bit_unpacking_common.hpp +++ b/src/DuRuInterface/bit_unpacking_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -182,4 +184,4 @@ void common_decompr_9bit_neon(uint32_t n_prb, } } -} // namespace \ No newline at end of file +} // namespace diff --git a/src/LowerPHY/Correlation/arm_correlation.c b/src/LowerPHY/Correlation/arm_correlation.c index b260b4de6160d244ee4e32f4bd059d03207e1e0f..4b64caf8b6303b586e23d163fe96322e3632401b 100644 --- a/src/LowerPHY/Correlation/arm_correlation.c +++ b/src/LowerPHY/Correlation/arm_correlation.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/LowerPHY/FFT/bluestein.cpp b/src/LowerPHY/FFT/bluestein.cpp index c35dd112f34390b04b5726dccc2eabd5f4a2e82e..ab6224c1396e990ae7af0e6e0cab637bb47c25ec 100644 --- a/src/LowerPHY/FFT/bluestein.cpp +++ b/src/LowerPHY/FFT/bluestein.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its 
affiliates + SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "bluestein.hpp" @@ -11,12 +13,13 @@ namespace armral::fft { template -bluestein make_bluestein(int n, armral_fft_direction_t dir, - const int *base_kernels, - int len_base_kernels) { +std::optional> +make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, + int len_base_kernels) { using real_t = armral::fft::real_t; - // Look for the next size > 2n-1 which would allow us to use fast kernels alone + // Look for the next size > 2n-1 which would allow us to use fast kernels + // alone int n_pad = 2 * n - 1; n_pad--; int curn_n = 0; @@ -52,11 +55,21 @@ bluestein make_bluestein(int n, armral_fft_direction_t dir, // Create 2 plans: forward and backward armral_fft_plan_t *pf = nullptr; armral_fft_plan_t *pb = nullptr; - armral::fft::create_plan( + auto pf_status = armral::fft::create_plan( &pf, n_pad, armral_fft_direction_t::ARMRAL_FFT_FORWARDS, false); - armral::fft::create_plan( + auto pb_status = armral::fft::create_plan( &pb, n_pad, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS, false); + if (pf_status == ARMRAL_ARGUMENT_ERROR || + pb_status == ARMRAL_ARGUMENT_ERROR) { + if (pf) { + armral::fft::destroy_plan(&pf); + } else if (pb) { + armral::fft::destroy_plan(&pb); + } + return std::nullopt; + } + // Execute fwds plan transforming series b armral::fft::execute(pf, b, b, 1, 1, 1); @@ -69,17 +82,20 @@ bluestein make_bluestein(int n, armral_fft_direction_t dir, return bluestein{n, n_pad, dir, pf, pb, a, b}; } -template bluestein +template std::optional< + bluestein> make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, int len_base_kernels); -template bluestein +template std::optional< + bluestein> make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, int len_base_kernels); -template bluestein +template std::optional< + bluestein> make_bluestein(int n, 
armral_fft_direction_t dir, const int *base_kernels, int len_base_kernels); -template bluestein +template std::optional< + bluestein> make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, int len_base_kernels); diff --git a/src/LowerPHY/FFT/bluestein.hpp b/src/LowerPHY/FFT/bluestein.hpp index 15811ef0b64203c7f93d092151433e3e943dddcc..77acb4093a9bc7c3f27bb23cbc031d2addc874f4 100644 --- a/src/LowerPHY/FFT/bluestein.hpp +++ b/src/LowerPHY/FFT/bluestein.hpp @@ -1,12 +1,16 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once #include "fft_plan.hpp" +#include "optional" + namespace armral::fft { /// Class to support using Bluestein's algorithm for prime n. @@ -54,9 +58,9 @@ struct bluestein { }; template -bluestein make_bluestein(int n, armral_fft_direction_t dir, - const int *base_kernels, - int len_base_kernels); +std::optional> +make_bluestein(int n, armral_fft_direction_t dir, const int *base_kernels, + int len_base_kernels); template void execute_bluestein(const bluestein &bs, const Tx *x, Ty *y, diff --git a/src/LowerPHY/FFT/fft_cf32.cpp b/src/LowerPHY/FFT/fft_cf32.cpp index 6917fd21ed7d76e70ac88caf573f3b95fc57003a..2d8b1d6f1044d2e5c2c90839504bcd4cb4765ed7 100644 --- a/src/LowerPHY/FFT/fft_cf32.cpp +++ b/src/LowerPHY/FFT/fft_cf32.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_execute.hpp" #include "fft_plan.hpp" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c index 72cc33ca6555cd98840b69fe136789800116b5ac..99f255e14e85f18942c6b4803e49183db893542c 100644 --- 
a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_cf32_cf32_ab_t_gs.h" @@ -72,14 +74,11 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs2(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v78), v91)); svfloat32_t v92 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v90), v91)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero38, v80, v37, 0), v80, v37, 90); - svfloat32_t v46; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v46) : "w"(v92), "w"(v38)); - svfloat32_t v47; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v92), "w"(v38)); + svfloat32_t v46 = svadd_f32_x(svptrue_b32(), v92, v38); + svfloat32_t v47 = svsub_f32_x(svptrue_b32(), v92, v38); svst1_scatter_s64index_f64(pred_full, (double *)(v102), v112, svreinterpret_f64_f32(v46)); svst1_scatter_s64index_f64(pred_full, (double *)(v111), v112, @@ -190,8 +189,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs3(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v138), v139)); svfloat32_t v143 = svdup_n_f32(v84); float32x2_t *v169 = &v6[v105]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v119, v51, 0), v119, v51, 90); @@ -199,27 +197,20 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs3(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); svfloat32_t v129 = svreinterpret_f32_f64( 
svld1_gather_s64index_f64(pred_full, (const double *)(v127), v139)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v129, v58, 0), v129, v58, 90); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v52), "w"(v59)); - svfloat32_t v61; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v59)); - svfloat32_t v69; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v69) : "w"(v60), "w"(v140)); - svfloat32_t zero86; - asm volatile("mov %0.s, #0" : "=w"(zero86)); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v69 = svadd_f32_x(svptrue_b32(), v60, v140); + svfloat32_t zero86 = svdup_n_f32(0); svfloat32_t v86 = svcmla_f32_x(pred_full, zero86, v143, v61, 90); svfloat32_t v87 = svmla_f32_x(pred_full, v69, v60, v142); svst1_scatter_s64index_f64(pred_full, (double *)(v151), v170, svreinterpret_f64_f32(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v87), "w"(v86)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v87), "w"(v86)); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v87, v86); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v87, v86); svst1_scatter_s64index_f64(pred_full, (double *)(v160), v170, svreinterpret_f64_f32(v89)); svst1_scatter_s64index_f64(pred_full, (double *)(v169), v170, @@ -345,8 +336,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs4(const armral_cmplx_f32_t *restrict x, float32x2_t *v223 = &v6[v140]; svfloat32_t v37 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v163, v72, 0), v163, v72, 90); @@ -356,35 +346,24 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs4(const armral_cmplx_f32_t *restrict 
x, svld1_gather_s64index_f64(pred_full, (const double *)(v152), v183)); svfloat32_t v173 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v171), v183)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v154, v37, 0), v154, v37, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v173, v79, 0), v173, v79, 90); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v184), "w"(v38)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v184), "w"(v38)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v73), "w"(v80)); - svfloat32_t v91; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v73), "w"(v80)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v88), "w"(v90)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v88), "w"(v90)); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v184, v38); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v184, v38); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v88, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v88, v90); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v188, v91, 90); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v89), "w"(v115)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v89), "w"(v115)); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v89, v115); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v89, v115); svst1_scatter_s64index_f64(pred_full, (double *)(v196), v224, 
svreinterpret_f64_f32(v92)); svst1_scatter_s64index_f64(pred_full, (double *)(v214), v224, @@ -565,8 +544,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs5(const armral_cmplx_f32_t *restrict x, float32x2_t *v276 = &v6[v177]; float32x2_t *v285 = &v6[v184]; float32x2_t *v294 = &v6[v191]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v205, v51, 0), v205, v51, 90); @@ -582,59 +560,41 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs5(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v223), v243)); svfloat32_t v234 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v232), v243)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v215, v58, 0), v215, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v225, v93, 0), v225, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v234, v100, 0), v234, v100, 90); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v52), "w"(v59)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v52), "w"(v59)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v94), "w"(v101)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v94), "w"(v101)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v102), "w"(v104)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v102), "w"(v104)); - svfloat32_t v108; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v108) : "w"(v103), "w"(v105)); - svfloat32_t zero138; - asm volatile("mov %0.s, #0" : "=w"(zero138)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v102, v104); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v102, v104); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v103, v105); + svfloat32_t zero138 = svdup_n_f32(0); svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v248, v103, 90); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v244)); - svfloat32_t zero145; - asm volatile("mov %0.s, #0" : "=w"(zero145)); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v106, v244); + svfloat32_t zero145 = svdup_n_f32(0); svfloat32_t v145 = svcmla_f32_x(pred_full, zero145, v249, v108, 90); svfloat32_t v153 = svmla_f32_x(pred_full, v116, v106, v246); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v138), "w"(v145)); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v138, v145); svfloat32_t v157 = svcmla_f32_x(pred_full, v145, v250, v105, 90); svst1_scatter_s64index_f64(pred_full, (double *)(v258), v295, svreinterpret_f64_f32(v116)); svfloat32_t v154 = svmla_f32_x(pred_full, v153, v107, v247); svfloat32_t v155 = svmls_f32_x(pred_full, v153, v107, v247); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v154), "w"(v156)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v155), "w"(v157)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v155), "w"(v157)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v155, v157); + 
svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v155, v157); svst1_scatter_s64index_f64(pred_full, (double *)(v267), v295, svreinterpret_f64_f32(v159)); svst1_scatter_s64index_f64(pred_full, (double *)(v276), v295, @@ -820,8 +780,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs6(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); svfloat32_t v114 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v113])); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v268, v121, 0), v268, v121, 90); @@ -833,55 +792,37 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs6(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v248), v279)); svfloat32_t v259 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v257), v279)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v232, v37, 0), v232, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v241, v72, 0), v241, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v250, v79, 0), v250, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v259, v114, 0), v259, v114, 90); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v280), "w"(v38)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v280), 
"w"(v38)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v73), "w"(v80)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v73), "w"(v80)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v115), "w"(v122)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v115), "w"(v122)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v132), "w"(v134)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v132), "w"(v134)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v133), "w"(v135)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v133), "w"(v135)); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v136), "w"(v130)); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v280, v38); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v280, v38); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v136, v130); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = svcmla_f32_x(pred_full, zero155, v286, v137, 90); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v159), "w"(v131)); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v159, v131); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v286, v160, 90); svfloat32_t v156 = 
svmla_f32_x(pred_full, v138, v136, v285); svfloat32_t v179 = svmla_f32_x(pred_full, v161, v159, v285); @@ -889,14 +830,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs6(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v138)); svst1_scatter_s64index_f64(pred_full, (double *)(v303), v340, svreinterpret_f64_f32(v161)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v156), "w"(v155)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v156), "w"(v155)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v179), "w"(v178)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v179), "w"(v178)); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v179, v178); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v179, v178); svst1_scatter_s64index_f64(pred_full, (double *)(v312), v340, svreinterpret_f64_f32(v158)); svst1_scatter_s64index_f64(pred_full, (double *)(v321), v340, @@ -1147,8 +1084,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs7(const armral_cmplx_f32_t *restrict x, float32x2_t *v407 = &v6[v269]; float32x2_t *v416 = &v6[v276]; float32x2_t *v425 = &v6[v283]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v297, v51, 0), v297, v51, 90); @@ -1172,110 +1108,72 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs7(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v333), v353)); svfloat32_t v344 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v342), v353)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v307, v58, 0), v307, v58, 90); - svfloat32_t zero94; 
- asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v317, v93, 0), v317, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v326, v100, 0), v326, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v335, v135, 0), v335, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v344, v142, 0), v344, v142, 90); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v52), "w"(v59)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v52), "w"(v59)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v94), "w"(v101)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v94), "w"(v101)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v136), "w"(v143)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v136), "w"(v143)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v144), "w"(v146)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v146), "w"(v148)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v148), "w"(v144)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v145), "w"(v147)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v145), "w"(v147)); - svfloat32_t v166; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v147), "w"(v149)); - 
svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v149), "w"(v145)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v150), "w"(v148)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v163), "w"(v149)); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v148, v144); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v149, v145); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v150, v148); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v163, v149); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v361, v165, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v362, v166, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v363, v167, 90); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v151), "w"(v354)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v151, v354); + 
svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v360, v164, 90); svfloat32_t v221 = svmla_f32_x(pred_full, v159, v151, v356); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v199), "w"(v206)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v199), "w"(v206)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v199), "w"(v213)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v199, v213); svst1_scatter_s64index_f64(pred_full, (double *)(v371), v426, svreinterpret_f64_f32(v159)); svfloat32_t v222 = svmla_f32_x(pred_full, v221, v160, v357); svfloat32_t v224 = svmls_f32_x(pred_full, v221, v160, v357); svfloat32_t v226 = svmls_f32_x(pred_full, v221, v161, v358); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v228), "w"(v213)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v230), "w"(v220)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v232), "w"(v220)); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v228, v213); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v230, v220); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v232, v220); svfloat32_t v223 = svmla_f32_x(pred_full, v222, v161, v358); svfloat32_t v225 = svmls_f32_x(pred_full, v224, v162, v359); svfloat32_t v227 = svmla_f32_x(pred_full, v226, v162, v359); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v223), "w"(v229)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v223), "w"(v229)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v225), "w"(v231)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v225), "w"(v231)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v227), "w"(v233)); - svfloat32_t v239; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v239) : "w"(v227), "w"(v233)); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v227, v233); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v227, v233); svst1_scatter_s64index_f64(pred_full, (double *)(v380), v426, svreinterpret_f64_f32(v235)); svst1_scatter_s64index_f64(pred_full, (double *)(v389), v426, @@ -1510,8 +1408,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs8(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); svfloat32_t v79 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v335, v114, 0), v335, v114, 90); @@ -1533,99 +1430,66 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs8(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v353), v373)); svfloat32_t v364 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v362), v373)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v308, v37, 0), v308, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v317, v72, 0), v317, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v326, v79, 0), v326, v79, 90); - svfloat32_t zero122; - 
asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v345, v121, 0), v345, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v355, v156, 0), v355, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v364, v163, 0), v364, v163, 90); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v374), "w"(v38)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v374), "w"(v38)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v73), "w"(v80)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v73), "w"(v80)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v115), "w"(v122)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v115), "w"(v122)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v157), "w"(v164)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v157), "w"(v164)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v172), "w"(v174)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v172), "w"(v174)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v176), "w"(v178)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v176), "w"(v178)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v177), "w"(v179)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v177), "w"(v179)); - svfloat32_t zero221; - asm volatile("mov %0.s, #0" : "=w"(zero221)); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v374, v38); + 
svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v374, v38); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v179); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v179); + svfloat32_t zero221 = svdup_n_f32(0); svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v380, v175, 90); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v180), "w"(v182)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v180), "w"(v182)); - svfloat32_t zero209; - asm volatile("mov %0.s, #0" : "=w"(zero209)); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t zero209 = svdup_n_f32(0); svfloat32_t v209 = svcmla_f32_x(pred_full, zero209, v380, v183, 90); - svfloat32_t zero228; - asm volatile("mov %0.s, #0" : "=w"(zero228)); + svfloat32_t zero228 = svdup_n_f32(0); svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v381, v186, 90); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v181), "w"(v209)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v181), "w"(v209)); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v181, v209); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v181, v209); svfloat32_t v236 = svmla_f32_x(pred_full, v173, v187, v382); svfloat32_t v237 = svmls_f32_x(pred_full, v173, v187, v382); - svfloat32_t v238; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v221), "w"(v228)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v221), "w"(v228)); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v221, v228); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v221, v228); svst1_scatter_s64index_f64(pred_full, (double *)(v390), v454, svreinterpret_f64_f32(v184)); svst1_scatter_s64index_f64(pred_full, (double *)(v426), v454, svreinterpret_f64_f32(v185)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v236), "w"(v238)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v236), "w"(v238)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v237), "w"(v239)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v237), "w"(v239)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v237, v239); svst1_scatter_s64index_f64(pred_full, (double *)(v408), v454, svreinterpret_f64_f32(v235)); svst1_scatter_s64index_f64(pred_full, (double *)(v444), v454, @@ -1928,8 +1792,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs9(const armral_cmplx_f32_t *restrict x, float32x2_t *v522 = &v6[v346]; float32x2_t *v531 = &v6[v353]; float32x2_t *v540 = &v6[v360]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v374, v51, 0), v374, v51, 90); @@ -1961,124 +1824,80 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs9(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v428), v448)); svfloat32_t v439 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v437), v448)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = 
svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v384, v58, 0), v384, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v394, v93, 0), v394, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v403, v100, 0), v403, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v412, v135, 0), v412, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v421, v142, 0), v421, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v430, v177, 0), v430, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v439, v184, 0), v439, v184, 90); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v52), "w"(v59)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v52), "w"(v59)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v94), "w"(v101)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v94), "w"(v101)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v136), "w"(v143)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v136), "w"(v143)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v192) : "w"(v178), "w"(v185)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v178), "w"(v185)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v186), "w"(v188)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v187), "w"(v189)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v186), "w"(v188)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v188), "w"(v192)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v192), "w"(v186)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v187), "w"(v189)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v189), "w"(v193)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v193), "w"(v187)); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v188, v192); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v192, v186); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v189, v193); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v193, v187); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v454, v191, 90); - svfloat32_t 
v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v192)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v205), "w"(v193)); - svfloat32_t zero263; - asm volatile("mov %0.s, #0" : "=w"(zero263)); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v192); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v205, v193); + svfloat32_t zero263 = svdup_n_f32(0); svfloat32_t v263 = svcmla_f32_x(pred_full, zero263, v458, v210, 90); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v459, v211, 90); - svfloat32_t zero277; - asm volatile("mov %0.s, #0" : "=w"(zero277)); + svfloat32_t zero277 = svdup_n_f32(0); svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v460, v212, 90); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v195), "w"(v190)); - svfloat32_t v222; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v195), "w"(v451)); - svfloat32_t zero229; - asm volatile("mov %0.s, #0" : "=w"(zero229)); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v195, v190); + svfloat32_t v222 = svmul_f32_x(svptrue_b32(), v195, v451); + svfloat32_t zero229 = svdup_n_f32(0); svfloat32_t v229 = svcmla_f32_x(pred_full, zero229, v454, v206, 90); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v241), "w"(v263)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v241), "w"(v270)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v241), "w"(v263)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v196), "w"(v449)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v222), "w"(v222)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v270)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v293), "w"(v277)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v295), 
"w"(v277)); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v241, v270); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v196, v449); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v222, v222); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, v270); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v277); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v277); svfloat32_t v279 = svmla_f32_x(pred_full, v278, v195, v451); svfloat32_t v283 = svmla_f32_x(pred_full, v204, v190, v453); svst1_scatter_s64index_f64(pred_full, (double *)(v468), v541, svreinterpret_f64_f32(v204)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v204), "w"(v279)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v283), "w"(v278)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v229)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v280), "w"(v229)); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v204, v279); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v278); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v229); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v280, v229); svfloat32_t v285 = svmla_f32_x(pred_full, v284, v207, v455); svfloat32_t v287 = svmls_f32_x(pred_full, v284, v208, v456); svfloat32_t v289 = svmls_f32_x(pred_full, v284, v207, v455); @@ -2089,18 +1908,12 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs9(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v282)); svst1_scatter_s64index_f64(pred_full, (double *)(v522), v541, svreinterpret_f64_f32(v281)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v286), "w"(v292)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v286), "w"(v292)); - svfloat32_t v299; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v288), "w"(v294)); - svfloat32_t 
v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v288), "w"(v294)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v290), "w"(v296)); - svfloat32_t v302; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v290), "w"(v296)); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v290, v296); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v290, v296); svst1_scatter_s64index_f64(pred_full, (double *)(v477), v541, svreinterpret_f64_f32(v298)); svst1_scatter_s64index_f64(pred_full, (double *)(v486), v541, @@ -2412,8 +2225,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs10(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v120])); svfloat32_t v156 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v155])); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v462, v163, 0), v462, v163, 90); @@ -2437,117 +2249,77 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs10(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v471), v491)); svfloat32_t v482 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v480), v491)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v408, v37, 0), v408, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v417, v72, 0), v417, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" 
: "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v426, v79, 0), v426, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v435, v114, 0), v435, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v444, v121, 0), v444, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v453, v156, 0), v453, v156, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v473, v198, 0), v473, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v482, v205, 0), v482, v205, 90); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v492), "w"(v38)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v492), "w"(v38)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v73), "w"(v80)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v73), "w"(v80)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v115), "w"(v122)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v115), "w"(v122)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v157), "w"(v164)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v157), "w"(v164)); - svfloat32_t v222; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v222) : "w"(v199), "w"(v206)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v199), "w"(v206)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v216), "w"(v222)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v216), "w"(v222)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v220), "w"(v218)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v220), "w"(v218)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v217), "w"(v223)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v217), "w"(v223)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v221), "w"(v219)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v221), "w"(v219)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v224), "w"(v226)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v224), "w"(v226)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v225), "w"(v227)); - svfloat32_t zero253; - asm volatile("mov %0.s, #0" : "=w"(zero253)); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v492, v38); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v492, v38); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v227 = 
svsub_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v225, v227); + svfloat32_t zero253 = svdup_n_f32(0); svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v502, v225, 90); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v277), "w"(v279)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v277), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v278), "w"(v280)); - svfloat32_t zero306; - asm volatile("mov %0.s, #0" : "=w"(zero306)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v278, v280); + svfloat32_t zero306 = svdup_n_f32(0); svfloat32_t v306 = svcmla_f32_x(pred_full, zero306, v502, v278, 90); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v228), "w"(v214)); - svfloat32_t zero260; - asm volatile("mov %0.s, #0" : "=w"(zero260)); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v228, v214); + svfloat32_t zero260 = svdup_n_f32(0); svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v503, v230, 90); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v281), "w"(v215)); - svfloat32_t zero313; - asm volatile("mov %0.s, #0" : "=w"(zero313)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v281, v215); + svfloat32_t zero313 = svdup_n_f32(0); svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v503, v283, 90); svfloat32_t v268 = svmla_f32_x(pred_full, v231, v228, v500); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v253), 
"w"(v260)); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v253, v260); svfloat32_t v272 = svcmla_f32_x(pred_full, v260, v504, v227, 90); svfloat32_t v321 = svmla_f32_x(pred_full, v284, v281, v500); - svfloat32_t v324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v306), "w"(v313)); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v306, v313); svfloat32_t v325 = svcmla_f32_x(pred_full, v313, v504, v280, 90); svst1_scatter_s64index_f64(pred_full, (double *)(v512), v594, svreinterpret_f64_f32(v231)); @@ -2557,22 +2329,14 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs10(const armral_cmplx_f32_t *restrict x, svfloat32_t v270 = svmls_f32_x(pred_full, v268, v229, v501); svfloat32_t v322 = svmla_f32_x(pred_full, v321, v282, v501); svfloat32_t v323 = svmls_f32_x(pred_full, v321, v282, v501); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v269), "w"(v271)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v269), "w"(v271)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v270), "w"(v272)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v270), "w"(v272)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v322), "w"(v324)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v322), "w"(v324)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v323), "w"(v325)); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v323), "w"(v325)); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v323, v325); 
svst1_scatter_s64index_f64(pred_full, (double *)(v530), v594, svreinterpret_f64_f32(v274)); svst1_scatter_s64index_f64(pred_full, (double *)(v539), v594, @@ -3027,8 +2791,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs11(const armral_cmplx_f32_t *restrict x, float32x2_t *v714 = &v6[v494]; float32x2_t *v723 = &v6[v501]; float32x2_t *v732 = &v6[v508]; - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v529, v163, 0), v529, v163, 90); @@ -3068,150 +2831,94 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs11(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v600), v621)); svfloat32_t v611 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v609), v621)); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero171, v539, v170, 0), v539, v170, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v548, v177, 0), v548, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v557, v184, 0), v557, v184, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v566, v191, 0), v566, v191, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v575, v198, 0), v575, v198, 90); - svfloat32_t zero206; - asm volatile("mov 
%0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v584, v205, 0), v584, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v593, v212, 0), v593, v212, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v602, v219, 0), v602, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v611, v226, 0), v611, v226, 90); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v164), "w"(v171)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v178), "w"(v185)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v192), "w"(v199)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v206), "w"(v213)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v220), "w"(v227)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v164), "w"(v171)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v178), "w"(v185)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v192), "w"(v199)); - svfloat32_t v236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v206), "w"(v213)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v220), "w"(v227)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v228), "w"(v229)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v230), "w"(v232)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v234), "w"(v235)); - svfloat32_t 
v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v233), "w"(v237)); - svfloat32_t v254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v229), "w"(v231)); - svfloat32_t v255; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v228), "w"(v231)); - svfloat32_t v256; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v229), "w"(v228)); - svfloat32_t v257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v232), "w"(v231)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v230), "w"(v231)); - svfloat32_t v259; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v232), "w"(v230)); - svfloat32_t v260; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v229), "w"(v232)); - svfloat32_t v261; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v228), "w"(v230)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v234), "w"(v236)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v233), "w"(v236)); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v233), "w"(v234)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v236), "w"(v237)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v235), "w"(v236)); - svfloat32_t v268; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v235), "w"(v237)); - svfloat32_t v269; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v234), "w"(v237)); - svfloat32_t v270; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v233), "w"(v235)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v231), "w"(v238)); - svfloat32_t v252; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v241), "w"(v242)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v239), "w"(v238)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v241), "w"(v242)); - svfloat32_t v298; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v255), "w"(v627)); - svfloat32_t v303; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v256), 
"w"(v628)); - svfloat32_t v313; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v258), "w"(v630)); - svfloat32_t v318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v259), "w"(v631)); - svfloat32_t zero340; - asm volatile("mov %0.s, #0" : "=w"(zero340)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v228, v229); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v234, v235); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v233, v237); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v228, v231); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v229, v228); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v232, v231); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v230, v231); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v232, v230); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v229, v232); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v228, v230); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v233, v236); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v233, v234); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v236, v237); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v235, v236); + svfloat32_t v268 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v269 = 
svadd_f32_x(svptrue_b32(), v234, v237); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v239, v238); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v298 = svmul_f32_x(svptrue_b32(), v255, v627); + svfloat32_t v303 = svmul_f32_x(svptrue_b32(), v256, v628); + svfloat32_t v313 = svmul_f32_x(svptrue_b32(), v258, v630); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v259, v631); + svfloat32_t zero340 = svdup_n_f32(0); svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v635, v263, 90); - svfloat32_t zero354; - asm volatile("mov %0.s, #0" : "=w"(zero354)); + svfloat32_t zero354 = svdup_n_f32(0); svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v637, v265, 90); - svfloat32_t zero361; - asm volatile("mov %0.s, #0" : "=w"(zero361)); + svfloat32_t zero361 = svdup_n_f32(0); svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v638, v266, 90); - svfloat32_t zero375; - asm volatile("mov %0.s, #0" : "=w"(zero375)); + svfloat32_t zero375 = svdup_n_f32(0); svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v640, v268, 90); - svfloat32_t zero382; - asm volatile("mov %0.s, #0" : "=w"(zero382)); + svfloat32_t zero382 = svdup_n_f32(0); svfloat32_t v382 = svcmla_f32_x(pred_full, zero382, v641, v269, 90); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v240), "w"(v239)); - svfloat32_t v253; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v252), "w"(v236)); - svfloat32_t v333; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v262), "w"(v634)); - svfloat32_t zero396; - asm volatile("mov %0.s, #0" : "=w"(zero396)); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v240, v239); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v252, v236); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v262, v634); + svfloat32_t zero396 = svdup_n_f32(0); svfloat32_t v396 = 
svcmla_f32_x(pred_full, zero396, v643, v271, 90); svfloat32_t v398 = svmla_f32_x(pred_full, v298, v254, v626); svfloat32_t v399 = svmla_f32_x(pred_full, v303, v255, v627); @@ -3220,88 +2927,51 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs11(const armral_cmplx_f32_t *restrict x, svfloat32_t v402 = svmla_f32_x(pred_full, v318, v258, v630); svfloat32_t v403 = svnmls_f32_x(pred_full, v318, v257, v629); svfloat32_t v406 = svcmla_f32_x(pred_full, v354, v636, v264, 90); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v340), "w"(v354)); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v340, v354); svfloat32_t v408 = svcmla_f32_x(pred_full, v375, v639, v267, 90); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v361), "w"(v375)); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v622), "w"(v243)); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v361, v375); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v622, v243); + svfloat32_t zero288 = svdup_n_f32(0); svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v625, v253, 90); svfloat32_t v404 = svmla_f32_x(pred_full, v333, v261, v633); svfloat32_t v405 = svmla_f32_x(pred_full, v333, v260, v632); svfloat32_t v410 = svcmla_f32_x(pred_full, v396, v642, v270, 90); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v382), "w"(v396)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v406), "w"(v407)); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v382, v396); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v406, v407); svfloat32_t v397 = svmls_f32_x(pred_full, v251, v243, v624); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v402), "w"(v404)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v288), "w"(v408)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v410), "w"(v406)); - svfloat32_t v426; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v426) : "w"(v288), "w"(v411)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v411), "w"(v407)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v408)); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v288, v408); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v410, v406); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v288, v411); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v411, v407); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v408); svst1_scatter_s64index_f64(pred_full, (double *)(v651), v742, svreinterpret_f64_f32(v251)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v397)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v397), "w"(v399)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v397), "w"(v403)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v397), "w"(v400)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v397), "w"(v398)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v422), "w"(v410)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v424), "w"(v288)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v426), "w"(v409)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v428), "w"(v288)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v431), "w"(v409)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v414), "w"(v404)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v416), "w"(v405)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v418), "w"(v405)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v401)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v432), 
"w"(v288)); - svfloat32_t v435; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v413), "w"(v423)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v413), "w"(v423)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v421), "w"(v433)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v415), "w"(v425)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v417), "w"(v427)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v419), "w"(v429)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v419), "w"(v429)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v417), "w"(v427)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v415), "w"(v425)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v421), "w"(v433)); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v397); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v397, v399); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v397, v400); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v397, v398); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v424, v288); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v426, v409); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v428, v288); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v409); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v414, v404); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v405); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v418, v405); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v420, v401); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v432, v288); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v421, v433); + 
svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v421, v433); svst1_scatter_s64index_f64(pred_full, (double *)(v669), v742, svreinterpret_f64_f32(v435)); svst1_scatter_s64index_f64(pred_full, (double *)(v732), v742, @@ -3656,8 +3326,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs12(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v155])); svfloat32_t v170 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v169])); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v517, v205, 0), v517, v205, 90); @@ -3685,59 +3354,43 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs12(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v525), v546)); svfloat32_t v537 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v535), v546)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v445, v51, 0), v445, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v454, v58, 0), v454, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v463, v93, 0), v463, v93, 90); - svfloat32_t zero101; - asm 
volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v472, v100, 0), v472, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v490, v149, 0), v490, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v499, v156, 0), v499, v156, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v527, v212, 0), v527, v212, 90); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v52), "w"(v59)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v52), "w"(v59)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v94), "w"(v101)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v94), "w"(v101)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v150), "w"(v157)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v150), "w"(v157)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v206), "w"(v213)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v206), "w"(v213)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v228), "w"(v547)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v150, v157); + 
svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v228, v547); svfloat32_t v240 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v238, v481, v114, 0), v481, v114, 90); @@ -3747,95 +3400,60 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs12(const armral_cmplx_f32_t *restrict x, svfloat32_t v246 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v244, v537, v226, 0), v537, v226, 90); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v228), "w"(v241)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v228), "w"(v241)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v238), "w"(v244)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v238), "w"(v244)); - svfloat32_t v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v229), "w"(v242)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v229), "w"(v242)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v239), "w"(v245)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v239), "w"(v245)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v237), "w"(v243)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v237), "w"(v243)); - svfloat32_t v249; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v240), "w"(v246)); - svfloat32_t v250; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v240), "w"(v246)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v277), "w"(v279)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v277), "w"(v279)); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v238, v244); + svfloat32_t 
v280 = svsub_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x(pred_full, zero304, v555, v280, 90); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v307), "w"(v309)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t zero333; - asm volatile("mov %0.s, #0" : "=w"(zero333)); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero333 = svdup_n_f32(0); svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v558, v308, 90); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v247), "w"(v249)); - svfloat32_t v252; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v247), "w"(v249)); - svfloat32_t zero274; - asm volatile("mov %0.s, #0" : "=w"(zero274)); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v247, v249); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v247, v249); + svfloat32_t zero274 = svdup_n_f32(0); svfloat32_t v274 = svcmla_f32_x(pred_full, zero274, v551, v250, 90); svfloat32_t v305 = svmla_f32_x(pred_full, v304, v278, v554); svfloat32_t v306 = svnmls_f32_x(pred_full, v304, v278, v554); - svfloat32_t zero319; - asm volatile("mov %0.s, #0" : "=w"(zero319)); + svfloat32_t zero319 = svdup_n_f32(0); svfloat32_t v319 = svcmla_f32_x(pred_full, 
zero319, v558, v311, 90); - svfloat32_t zero326; - asm volatile("mov %0.s, #0" : "=w"(zero326)); + svfloat32_t zero326 = svdup_n_f32(0); svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v558, v312, 90); svfloat32_t v339 = svmla_f32_x(pred_full, v333, v310, v559); svfloat32_t v340 = svmls_f32_x(pred_full, v333, v310, v559); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v248), "w"(v274)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v248), "w"(v274)); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v248, v274); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v248, v274); svfloat32_t v341 = svmla_f32_x(pred_full, v251, v281, v554); svfloat32_t v389 = svmla_f32_x(pred_full, v252, v282, v554); svst1_scatter_s64index_f64(pred_full, (double *)(v567), v667, svreinterpret_f64_f32(v251)); svst1_scatter_s64index_f64(pred_full, (double *)(v621), v667, svreinterpret_f64_f32(v252)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v341), "w"(v319)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v341), "w"(v319)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v276), "w"(v306)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v326)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v389), "w"(v326)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v275), "w"(v305)); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v276, v306); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v326); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v389, v326); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v275, v305); svst1_scatter_s64index_f64(pred_full, (double *)(v594), v667, svreinterpret_f64_f32(v276)); svst1_scatter_s64index_f64(pred_full, (double *)(v648), v667, 
svreinterpret_f64_f32(v275)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v365), "w"(v340)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v365), "w"(v340)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v339)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v413), "w"(v339)); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v365, v340); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v365, v340); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v413, v339); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v413, v339); svst1_scatter_s64index_f64(pred_full, (double *)(v576), v667, svreinterpret_f64_f32(v343)); svst1_scatter_s64index_f64(pred_full, (double *)(v585), v667, @@ -4340,8 +3958,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs13(const armral_cmplx_f32_t *restrict x, float32x2_t *v820 = &v6[v564]; float32x2_t *v829 = &v6[v571]; float32x2_t *v838 = &v6[v578]; - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v599, v191, 0), v599, v191, 90); @@ -4389,210 +4006,133 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs13(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v688), v709)); svfloat32_t v699 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v697), v709)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v609, v198, 0), v609, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v618, v205, 0), v618, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + 
svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v627, v212, 0), v627, v212, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v636, v219, 0), v636, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v645, v226, 0), v645, v226, 90); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero234, v654, v233, 0), v654, v233, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v663, v240, 0), v663, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v672, v247, 0), v672, v247, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero255, v681, v254, 0), v681, v254, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v690, v261, 0), v690, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v699, v268, 0), v699, v268, 90); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v192), "w"(v199)); - 
svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v206), "w"(v213)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v220), "w"(v227)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v234), "w"(v241)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v248), "w"(v255)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v262), "w"(v269)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v192), "w"(v199)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v206), "w"(v213)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v220), "w"(v227)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v234), "w"(v241)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v248), "w"(v255)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v262), "w"(v269)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v271), "w"(v274)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v270), "w"(v272)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v277), "w"(v280)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v276), "w"(v278)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v271), "w"(v275)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v272), "w"(v273)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v270), "w"(v273)); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v274), "w"(v275)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v277), "w"(v281)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v276), "w"(v278)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v277), "w"(v280)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : 
"w"(v276), "w"(v279)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v280), "w"(v281)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v278), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v275)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v273)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v281)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v289), "w"(v279)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v291), "w"(v292)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v293), "w"(v294)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v291), "w"(v292)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v293), "w"(v294)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v299), "w"(v300)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v301), "w"(v302)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v304)); - svfloat32_t zero392; - asm volatile("mov %0.s, #0" : "=w"(zero392)); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v282 = 
svadd_f32_x(svptrue_b32(), v271, v274); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v271, v275); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v272, v273); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v270, v273); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v274, v275); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v277, v281); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v276, v279); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v275); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v273); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v281); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v279); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v299, v300); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v301, v302); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v304); + svfloat32_t zero392 = svdup_n_f32(0); svfloat32_t v392 = svcmla_f32_x(pred_full, zero392, v723, v299, 90); - svfloat32_t zero399; - asm volatile("mov %0.s, #0" : "=w"(zero399)); + svfloat32_t zero399 = svdup_n_f32(0); svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v724, v300, 90); - svfloat32_t zero413; - asm volatile("mov %0.s, #0" : "=w"(zero413)); + svfloat32_t zero413 = svdup_n_f32(0); svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v726, v301, 90); - svfloat32_t zero420; - 
asm volatile("mov %0.s, #0" : "=w"(zero420)); + svfloat32_t zero420 = svdup_n_f32(0); svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v727, v302, 90); - svfloat32_t zero434; - asm volatile("mov %0.s, #0" : "=w"(zero434)); + svfloat32_t zero434 = svdup_n_f32(0); svfloat32_t v434 = svcmla_f32_x(pred_full, zero434, v729, v303, 90); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v283), "w"(v285)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v285), "w"(v283)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v288), "w"(v290)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v295), "w"(v296)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v297), "w"(v298)); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v285, v283); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v288, v290); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v295, v296); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v297, v298); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v714, v288, 90); - svfloat32_t zero348; - asm volatile("mov %0.s, #0" : "=w"(zero348)); + svfloat32_t zero348 = svdup_n_f32(0); svfloat32_t v348 = svcmla_f32_x(pred_full, zero348, v715, v290, 90); - svfloat32_t v360; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v295), "w"(v717)); - svfloat32_t zero406; - asm volatile("mov %0.s, #0" : "=w"(zero406)); + svfloat32_t v360 = svmul_f32_x(svptrue_b32(), v295, v717); + svfloat32_t zero406 = svdup_n_f32(0); svfloat32_t v406 = svcmla_f32_x(pred_full, zero406, v725, v317, 90); - svfloat32_t zero427; - asm volatile("mov %0.s, #0" : "=w"(zero427)); + svfloat32_t zero427 = svdup_n_f32(0); svfloat32_t v427 = svcmla_f32_x(pred_full, zero427, v728, v318, 90); - svfloat32_t zero448; - asm volatile("mov %0.s, 
#0" : "=w"(zero448)); + svfloat32_t zero448 = svdup_n_f32(0); svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v731, v319, 90); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v710), "w"(v286)); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v710, v286); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v716, v314, 90); - svfloat32_t v370; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v315), "w"(v719)); + svfloat32_t v370 = svmul_f32_x(svptrue_b32(), v315, v719); svfloat32_t v450 = svmla_f32_x(pred_full, v360, v296, v718); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v392), "w"(v406)); - svfloat32_t v463; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v399), "w"(v406)); - svfloat32_t v464; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v413), "w"(v427)); - svfloat32_t v465; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v420), "w"(v427)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v434), "w"(v448)); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v392, v406); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v399, v406); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v413, v427); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v420, v427); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v434, v448); svfloat32_t v467 = svcmla_f32_x(pred_full, v448, v730, v304, 90); svfloat32_t v449 = svmls_f32_x(pred_full, v312, v286, v712); svfloat32_t v451 = svmls_f32_x(pred_full, v450, v313, v713); svfloat32_t v452 = svmla_f32_x(pred_full, v370, v296, v718); svfloat32_t v454 = svnmls_f32_x(pred_full, v360, v315, v719); - svfloat32_t v468; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v341), "w"(v355)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v348), "w"(v355)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v462), "w"(v466)); - 
svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v464), "w"(v466)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v463), "w"(v467)); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v348, v355); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v462, v466); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v464, v466); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v467); svst1_scatter_s64index_f64(pred_full, (double *)(v739), v848, svreinterpret_f64_f32(v312)); svfloat32_t v453 = svmla_f32_x(pred_full, v452, v313, v713); @@ -4600,63 +4140,36 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs13(const armral_cmplx_f32_t *restrict x, svfloat32_t v456 = svmla_f32_x(pred_full, v449, v297, v720); svfloat32_t v458 = svmls_f32_x(pred_full, v449, v298, v721); svfloat32_t v460 = svmls_f32_x(pred_full, v449, v297, v720); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v469), "w"(v462)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v467), "w"(v468)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v469)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v482), "w"(v469)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v484), "w"(v468)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v468), "w"(v463)); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v469, v462); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v467, v468); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v480, v469); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v482, v469); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v484, v468); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v468, v463); svfloat32_t v457 = svmla_f32_x(pred_full, v456, v298, v721); svfloat32_t v459 = svmls_f32_x(pred_full, v458, v316, v722); svfloat32_t v461 = svmla_f32_x(pred_full, v460, v316, v722); - 
svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v476), "w"(v464)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v465)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v486), "w"(v465)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v451), "w"(v457)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v453), "w"(v459)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v459), "w"(v453)); - svfloat32_t v473; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v455), "w"(v461)); - svfloat32_t v474; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v457), "w"(v451)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v461), "w"(v455)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v470), "w"(v477)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v471), "w"(v479)); - svfloat32_t v490; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v472), "w"(v481)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v473), "w"(v483)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v474), "w"(v485)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v475), "w"(v487)); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v475), "w"(v487)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v474), "w"(v485)); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v473), "w"(v483)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v472), "w"(v481)); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v471), "w"(v479)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v470), "w"(v477)); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v476, v464); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v478, v465); + svfloat32_t v487 = 
svsub_f32_x(svptrue_b32(), v486, v465); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v451, v457); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v453, v459); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v459, v453); + svfloat32_t v473 = svadd_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v457, v451); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v461, v455); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v470, v477); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v470, v477); svst1_scatter_s64index_f64(pred_full, (double *)(v748), v848, svreinterpret_f64_f32(v488)); svst1_scatter_s64index_f64(pred_full, (double *)(v757), v848, @@ -5113,8 +4626,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs14(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v162])); svfloat32_t v198 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v197])); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v668, v205, 0), v668, v205, 90); @@ -5150,200 +4662,126 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs14(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v695), v715)); svfloat32_t v706 = svreinterpret_f32_f64( 
svld1_gather_s64index_f64(pred_full, (const double *)(v704), v715)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v596, v37, 0), v596, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v605, v72, 0), v605, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v614, v79, 0), v614, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v623, v114, 0), v623, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v632, v121, 0), v632, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v641, v156, 0), v641, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v650, v163, 0), v650, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v659, v198, 0), v659, v198, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, 
v679, v240, 0), v679, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v688, v247, 0), v688, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v697, v282, 0), v697, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v706, v289, 0), v706, v289, 90); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v716), "w"(v38)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v716), "w"(v38)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v73), "w"(v80)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v73), "w"(v80)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v115), "w"(v122)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v115), "w"(v122)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v157), "w"(v164)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v157), "w"(v164)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v199), "w"(v206)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v199), "w"(v206)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v241), "w"(v248)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v241), "w"(v248)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v283), "w"(v290)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v283), "w"(v290)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v300), 
"w"(v310)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v300), "w"(v310)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v306), "w"(v304)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v306), "w"(v304)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v302), "w"(v308)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v302), "w"(v308)); - svfloat32_t v401; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v301), "w"(v311)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v301), "w"(v311)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v307), "w"(v305)); - svfloat32_t v404; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v307), "w"(v305)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v303), "w"(v309)); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v303), "w"(v309)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v312), "w"(v314)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v312), "w"(v314)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v314), "w"(v316)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v316), "w"(v312)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v313), "w"(v315)); - svfloat32_t v326; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v313), "w"(v315)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v315), "w"(v317)); - svfloat32_t v328; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v317), "w"(v313)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v401), "w"(v403)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v401), "w"(v403)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v403), "w"(v405)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v412) : "w"(v405), "w"(v401)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v402), "w"(v404)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v402), "w"(v404)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v404), "w"(v406)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v406), "w"(v402)); - svfloat32_t v319; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v318), "w"(v316)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v324), "w"(v317)); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v716, v38); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v716, v38); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v403 = 
svadd_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v312, v314); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v312, v314); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v316, v312); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v315, v317); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v317, v313); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v403, v405); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v401); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v406, v402); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v318, v316); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v324, v317); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v732, v326, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v733, v327, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v734, v328, 90); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v407), "w"(v405)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v406)); - svfloat32_t zero456; - asm 
volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v405); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v413, v406); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v732, v415, 90); - svfloat32_t zero463; - asm volatile("mov %0.s, #0" : "=w"(zero463)); + svfloat32_t zero463 = svdup_n_f32(0); svfloat32_t v463 = svcmla_f32_x(pred_full, zero463, v733, v416, 90); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v734, v417, 90); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v319), "w"(v298)); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v319, v298); + svfloat32_t zero360 = svdup_n_f32(0); svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v731, v325, 90); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v299)); - svfloat32_t zero449; - asm volatile("mov %0.s, #0" : "=w"(zero449)); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v299); + svfloat32_t zero449 = svdup_n_f32(0); svfloat32_t v449 = svcmla_f32_x(pred_full, zero449, v731, v414, 90); svfloat32_t v382 = svmla_f32_x(pred_full, v320, v319, v727); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v360), "w"(v367)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v360), "w"(v367)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v360), "w"(v374)); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v360, v374); svfloat32_t v471 = svmla_f32_x(pred_full, v409, v408, v727); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v449), "w"(v456)); - svfloat32_t v480; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v480) : "w"(v449), "w"(v456)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v449), "w"(v463)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v449, v456); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v449, v456); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v449, v463); svst1_scatter_s64index_f64(pred_full, (double *)(v742), v860, svreinterpret_f64_f32(v320)); svst1_scatter_s64index_f64(pred_full, (double *)(v751), v860, @@ -5351,51 +4789,33 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs14(const armral_cmplx_f32_t *restrict x, svfloat32_t v383 = svmla_f32_x(pred_full, v382, v321, v728); svfloat32_t v385 = svmls_f32_x(pred_full, v382, v321, v728); svfloat32_t v387 = svmls_f32_x(pred_full, v382, v322, v729); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v374)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v381)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v381)); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v374); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, v381); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v381); svfloat32_t v472 = svmla_f32_x(pred_full, v471, v410, v728); svfloat32_t v474 = svmls_f32_x(pred_full, v471, v410, v728); svfloat32_t v476 = svmls_f32_x(pred_full, v471, v411, v729); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v463)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v470)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v482), "w"(v470)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v463); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v482, v470); svfloat32_t v384 = svmla_f32_x(pred_full, v383, v322, v729); svfloat32_t v386 = svmls_f32_x(pred_full, v385, v323, v730); svfloat32_t v388 = svmla_f32_x(pred_full, v387, v323, 
v730); svfloat32_t v473 = svmla_f32_x(pred_full, v472, v411, v729); svfloat32_t v475 = svmls_f32_x(pred_full, v474, v412, v730); svfloat32_t v477 = svmla_f32_x(pred_full, v476, v412, v730); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v384), "w"(v390)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v384), "w"(v390)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v386), "w"(v392)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v386), "w"(v392)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v388), "w"(v394)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v388), "w"(v394)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v473), "w"(v479)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v473), "w"(v479)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v475), "w"(v481)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v475), "w"(v481)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v477), "w"(v483)); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v477), "w"(v483)); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v477, v483); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v477, v483); 
svst1_scatter_s64index_f64(pred_full, (double *)(v760), v860, svreinterpret_f64_f32(v396)); svst1_scatter_s64index_f64(pred_full, (double *)(v769), v860, @@ -5912,8 +5332,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs15(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v113])); svfloat32_t v149 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v148])); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v647, v156, 0), v647, v156, 90); @@ -5957,73 +5376,53 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs15(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v701), v721)); svfloat32_t v712 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v710), v721)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v593, v51, 0), v593, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v602, v58, 0), v602, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v611, v93, 0), v611, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v620, v100, 0), v620, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v638, v149, 0), v638, v149, 90); - svfloat32_t 
zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v667, v205, 0), v667, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v676, v212, 0), v676, v212, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v694, v261, 0), v694, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v703, v268, 0), v703, v268, 90); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v52), "w"(v59)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v52), "w"(v59)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v94), "w"(v101)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v94), "w"(v101)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v150), "w"(v157)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v150), "w"(v157)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v206), "w"(v213)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v206), "w"(v213)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v262), "w"(v269)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v262), "w"(v269)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v284), "w"(v722)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v94, 
v101); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v284, v722); svfloat32_t v296 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v294, v629, v114, 0), v629, v114, 90); @@ -6036,86 +5435,51 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs15(const armral_cmplx_f32_t *restrict x, svfloat32_t v305 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v303, v712, v282, 0), v712, v282, 90); - svfloat32_t v359; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v294), "w"(v303)); - svfloat32_t v360; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v294), "w"(v303)); - svfloat32_t v361; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v300), "w"(v297)); - svfloat32_t v362; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v300), "w"(v297)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v295), "w"(v304)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v295), "w"(v304)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v301), "w"(v298)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v301), "w"(v298)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v296), "w"(v305)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v296), "w"(v305)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v302), "w"(v299)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v302), "w"(v299)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v359), "w"(v361)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v364) : "w"(v359), "w"(v361)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v360), "w"(v362)); - svfloat32_t zero388; - asm volatile("mov %0.s, #0" : "=w"(zero388)); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v360 = svsub_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v300, v297); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), v300, v297); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t zero388 = svdup_n_f32(0); svfloat32_t v388 = svcmla_f32_x(pred_full, zero388, v732, v360, 90); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v412), "w"(v414)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v412), "w"(v414)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v413), "w"(v415)); - svfloat32_t v455; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v415), "w"(v740)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v306), "w"(v308)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v306), "w"(v308)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t zero335; - asm volatile("mov %0.s, #0" : "=w"(zero335)); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v417 = 
svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v455 = svmul_f32_x(svptrue_b32(), v415, v740); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero335 = svdup_n_f32(0); svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, v726, v307, 90); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v363), "w"(v284)); - svfloat32_t v376; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v363), "w"(v730)); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v363, v284); + svfloat32_t v376 = svmul_f32_x(svptrue_b32(), v363, v730); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v733, v365, 90); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v416), "w"(v285)); - svfloat32_t zero440; - asm volatile("mov %0.s, #0" : "=w"(zero440)); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v416, v285); + svfloat32_t zero440 = svdup_n_f32(0); svfloat32_t v440 = svcmla_f32_x(pred_full, zero440, v737, v417, 90); - svfloat32_t v450; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v418), "w"(v739)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v310), "w"(v293)); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); + svfloat32_t v450 = svmul_f32_x(svptrue_b32(), v418, v739); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v310, v293); + svfloat32_t zero342 = svdup_n_f32(0); svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v727, v312, 90); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v388), "w"(v395)); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v388, v395); svfloat32_t v407 = svcmla_f32_x(pred_full, v395, v734, v362, 90); - svfloat32_t zero426; - asm volatile("mov 
%0.s, #0" : "=w"(zero426)); + svfloat32_t zero426 = svdup_n_f32(0); svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v735, v419, 90); svfloat32_t v459 = svnmls_f32_x(pred_full, v450, v413, v738); svfloat32_t v460 = svmla_f32_x(pred_full, v455, v418, v739); svfloat32_t v350 = svmla_f32_x(pred_full, v313, v310, v724); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v335), "w"(v342)); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v335, v342); svfloat32_t v354 = svcmla_f32_x(pred_full, v342, v728, v309, 90); svfloat32_t v403 = svmla_f32_x(pred_full, v376, v366, v729); svfloat32_t v456 = svcmla_f32_x(pred_full, v426, v736, v416, 90); @@ -6126,50 +5490,30 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs15(const armral_cmplx_f32_t *restrict x, svfloat32_t v352 = svmls_f32_x(pred_full, v350, v311, v725); svfloat32_t v404 = svmla_f32_x(pred_full, v403, v364, v731); svfloat32_t v405 = svmls_f32_x(pred_full, v403, v364, v731); - svfloat32_t v457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v456), "w"(v440)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v456), "w"(v440)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v465), "w"(v426)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v465), "w"(v426)); - svfloat32_t v355; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v351), "w"(v353)); - svfloat32_t v356; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v351), "w"(v353)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v352), "w"(v354)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v352), "w"(v354)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v404), "w"(v406)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v404), "w"(v406)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v405), "w"(v407)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v405), 
"w"(v407)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v457), "w"(v459)); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v457), "w"(v459)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v458), "w"(v460)); - svfloat32_t v464; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v458), "w"(v460)); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v465, v426); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v465, v426); + svfloat32_t v355 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v458, v460); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v458, v460); svst1_scatter_s64index_f64(pred_full, (double *)(v757), v875, svreinterpret_f64_f32(v467)); svst1_scatter_s64index_f64(pred_full, (double *)(v766), v875, svreinterpret_f64_f32(v466)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v356), "w"(v409)); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v358), "w"(v411)); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v357), "w"(v410)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v355), "w"(v408)); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v356, v409); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v358, v411); + 
svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v357, v410); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v355, v408); svst1_scatter_s64index_f64(pred_full, (double *)(v775), v875, svreinterpret_f64_f32(v356)); svst1_scatter_s64index_f64(pred_full, (double *)(v802), v875, @@ -6178,22 +5522,14 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs15(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v357)); svst1_scatter_s64index_f64(pred_full, (double *)(v856), v875, svreinterpret_f64_f32(v355)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v462)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v489), "w"(v462)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v513), "w"(v464)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v513), "w"(v464)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v537), "w"(v463)); - svfloat32_t v539; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v537), "w"(v463)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v561), "w"(v461)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v561), "w"(v461)); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v462); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v489, v462); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v513, v464); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v513, v464); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v537, v463); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v537, v463); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v561, v461); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v561, v461); svst1_scatter_s64index_f64(pred_full, (double *)(v784), v875, svreinterpret_f64_f32(v491)); svst1_scatter_s64index_f64(pred_full, (double *)(v793), v875, @@ -6669,8 +6005,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs16(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v155])); 
svfloat32_t v163 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v162])); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v713, v198, 0), v713, v198, 90); @@ -6716,266 +6051,169 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs16(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v767), v787)); svfloat32_t v778 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v776), v787)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v650, v37, 0), v650, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v659, v72, 0), v659, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v668, v79, 0), v668, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v677, v114, 0), v677, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v686, v121, 0), v686, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v695, v156, 0), v695, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = 
svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v704, v163, 0), v704, v163, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v723, v205, 0), v723, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v733, v240, 0), v733, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v742, v247, 0), v742, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v751, v282, 0), v751, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v760, v289, 0), v760, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v769, v324, 0), v769, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v778, v331, 0), v778, v331, 90); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v788), "w"(v38)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v788), "w"(v38)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v73), "w"(v80)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v73), 
"w"(v80)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v115), "w"(v122)); - svfloat32_t v345; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v115), "w"(v122)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v157), "w"(v164)); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v157), "w"(v164)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v199), "w"(v206)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v199), "w"(v206)); - svfloat32_t v350; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v241), "w"(v248)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v241), "w"(v248)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v283), "w"(v290)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v283), "w"(v290)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v325), "w"(v332)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v325), "w"(v332)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v340), "w"(v342)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v340), "w"(v342)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v344), "w"(v346)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v344), "w"(v346)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v348), "w"(v350)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v348), "w"(v350)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v352), "w"(v354)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v352), "w"(v354)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v345), "w"(v347)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v345), "w"(v347)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v374) : "w"(v349), "w"(v355)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v349), "w"(v355)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v351), "w"(v353)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v351), "w"(v353)); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v788, v38); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v788, v38); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), 
v349, v355); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x(pred_full, zero437, v798, v343, 90); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v356), "w"(v358)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v356), "w"(v358)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v360), "w"(v362)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v360), "w"(v362)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v361), "w"(v363)); - svfloat32_t v371; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v361), "w"(v363)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v374), "w"(v376)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v375), "w"(v377)); - svfloat32_t zero413; - asm volatile("mov %0.s, #0" : "=w"(zero413)); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v375, v377); + svfloat32_t zero413 = svdup_n_f32(0); svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v798, v359, 90); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v799, v372, 90); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = 
svcmla_f32_x(pred_full, zero470, v803, v376, 90); - svfloat32_t v480; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v375), "w"(v805)); - svfloat32_t v485; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v377), "w"(v806)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v364), "w"(v366)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v364), "w"(v366)); - svfloat32_t zero401; - asm volatile("mov %0.s, #0" : "=w"(zero401)); + svfloat32_t v480 = svmul_f32_x(svptrue_b32(), v375, v805); + svfloat32_t v485 = svmul_f32_x(svptrue_b32(), v377, v806); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v364, v366); + svfloat32_t zero401 = svdup_n_f32(0); svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v798, v367, 90); - svfloat32_t zero420; - asm volatile("mov %0.s, #0" : "=w"(zero420)); + svfloat32_t zero420 = svdup_n_f32(0); svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v799, v370, 90); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v801, v378, 90); - svfloat32_t v475; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v379), "w"(v804)); + svfloat32_t v475 = svmul_f32_x(svptrue_b32(), v379, v804); svfloat32_t v496 = svmla_f32_x(pred_full, v341, v373, v800); svfloat32_t v497 = svmls_f32_x(pred_full, v341, v373, v800); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v437), "w"(v444)); - svfloat32_t v499; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v437), "w"(v444)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v365), "w"(v401)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v365), "w"(v401)); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v365, v401); + 
svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v365, v401); svfloat32_t v488 = svmla_f32_x(pred_full, v357, v371, v800); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v413), "w"(v420)); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v413, v420); svfloat32_t v490 = svmls_f32_x(pred_full, v357, v371, v800); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v420), "w"(v413)); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v420, v413); svfloat32_t v500 = svcmla_f32_x(pred_full, v456, v802, v374, 90); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v456), "w"(v470)); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v456, v470); svfloat32_t v502 = svnmls_f32_x(pred_full, v475, v375, v805); svfloat32_t v503 = svnmls_f32_x(pred_full, v475, v377, v806); svfloat32_t v504 = svnmls_f32_x(pred_full, v480, v379, v804); svfloat32_t v505 = svnmls_f32_x(pred_full, v485, v379, v804); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v497), "w"(v499)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v497), "w"(v499)); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v497, v499); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v497, v499); svst1_scatter_s64index_f64(pred_full, (double *)(v814), v950, svreinterpret_f64_f32(v368)); svst1_scatter_s64index_f64(pred_full, (double *)(v886), v950, svreinterpret_f64_f32(v369)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v488), "w"(v489)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v490), "w"(v491)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v490), "w"(v491)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v488), "w"(v489)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v496), "w"(v502)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v496), "w"(v502)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v508) : "w"(v496), "w"(v504)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v496), "w"(v504)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v497), "w"(v505)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v497), "w"(v505)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v500), "w"(v498)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v500), "w"(v498)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v501), "w"(v503)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v501), "w"(v503)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v501), "w"(v499)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v501), "w"(v499)); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v501, v499); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v501, v499); svst1_scatter_s64index_f64(pred_full, (double *)(v850), v950, svreinterpret_f64_f32(v487)); svst1_scatter_s64index_f64(pred_full, (double *)(v922), v950, svreinterpret_f64_f32(v486)); - 
svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v506), "w"(v516)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v507), "w"(v517)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v508), "w"(v517)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v509), "w"(v516)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v510), "w"(v518)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v511), "w"(v519)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v512), "w"(v521)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v513), "w"(v520)); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v506, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v507, v517); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v508, v517); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v509, v516); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v510, v518); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v511, v519); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v512, v521); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v513, v520); svst1_scatter_s64index_f64(pred_full, (double *)(v832), v950, svreinterpret_f64_f32(v495)); svst1_scatter_s64index_f64(pred_full, (double *)(v868), v950, @@ -7726,8 +6964,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs17(const armral_cmplx_f32_t *restrict x, float32x2_t *v1184 = &v6[v837]; float32x2_t *v1193 = &v6[v845]; float32x2_t *v1202 = &v6[v853]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v867, v51, 0), v867, v51, 90); @@ -7791,242 +7028,151 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs17(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v993), v1013)); svfloat32_t v1004 = svreinterpret_f32_f64( 
svld1_gather_s64index_f64(pred_full, (const double *)(v1002), v1013)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v877, v58, 0), v877, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v887, v93, 0), v887, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v896, v100, 0), v896, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v905, v135, 0), v905, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v914, v142, 0), v914, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v923, v177, 0), v923, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v932, v184, 0), v932, v184, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v941, v219, 0), v941, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, 
zero227, v950, v226, 0), v950, v226, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v959, v261, 0), v959, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v968, v268, 0), v968, v268, 90); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero304, v977, v303, 0), v977, v303, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero311, v986, v310, 0), v986, v310, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero346, v995, v345, 0), v995, v345, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v1004, v352, 0), v1004, v352, 90); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v52), "w"(v59)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v52), "w"(v59)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v94), "w"(v101)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v94), "w"(v101)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v136), "w"(v143)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v136), "w"(v143)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v178), "w"(v185)); - svfloat32_t v361; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v178), "w"(v185)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v220), "w"(v227)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v220), "w"(v227)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v262), "w"(v269)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v262), "w"(v269)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v304), "w"(v311)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v304), "w"(v311)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v346), "w"(v353)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v346), "w"(v353)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v354), "w"(v362)); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v356), "w"(v364)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v358), "w"(v366)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v360), "w"(v368)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v354), "w"(v362)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v356), "w"(v364)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v358), "w"(v366)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v360), "w"(v368)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v355), "w"(v359)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v357), "w"(v361)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v355), "w"(v359)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v369), "w"(v365)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v363), "w"(v367)); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v365), 
"w"(v369)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v363), "w"(v367)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v357), "w"(v361)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v355), "w"(v363)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v361), "w"(v369)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v370), "w"(v372)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v371), "w"(v373)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v370), "w"(v372)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v371), "w"(v373)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v377), "w"(v379)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v376), "w"(v378)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v378), "w"(v379)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v376), "w"(v377)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v390), "w"(v391)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v394), "w"(v395)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v390), "w"(v391)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v394), "w"(v395)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v392), "w"(v393)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v396), "w"(v397)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v392), "w"(v393)); - svfloat32_t v408; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v396), "w"(v397)); - svfloat32_t v447; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v378), "w"(v1018)); - svfloat32_t zero614; - asm volatile("mov %0.s, #0" : "=w"(zero614)); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v52, v59); + 
svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v365, v369); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v355, v363); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v361, 
v369); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v377, v379); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v376, v378); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v378, v379); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v376, v377); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v447 = svmul_f32_x(svptrue_b32(), v378, v1018); + svfloat32_t zero614 = svdup_n_f32(0); svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1045, v411, 90); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v374), "w"(v375)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v374), "w"(v375)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v385), "w"(v384)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v380), "w"(v381)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v398), "w"(v399)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v401), "w"(v402)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v404), "w"(v405)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v407), "w"(v408)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v405), "w"(v399)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v415) : "w"(v398), "w"(v404)); - svfloat32_t v457; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v380), "w"(v1020)); - svfloat32_t v462; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v381), "w"(v1021)); - svfloat32_t v492; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v387), "w"(v1027)); - svfloat32_t v497; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v388), "w"(v1028)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v355)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v415), "w"(v361)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v1014), "w"(v382)); - svfloat32_t v487; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v386), "w"(v1026)); - svfloat32_t zero523; - asm volatile("mov %0.s, #0" : "=w"(zero523)); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v385, v384); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v380, v381); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v398, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v401, v402); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v404, v405); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v407, v408); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v399); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v457 = svmul_f32_x(svptrue_b32(), v380, v1020); + svfloat32_t v462 = svmul_f32_x(svptrue_b32(), v381, v1021); + svfloat32_t v492 = svmul_f32_x(svptrue_b32(), v387, v1027); + svfloat32_t v497 = svmul_f32_x(svptrue_b32(), v388, v1028); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v355); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v415, v361); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v1014, v382); + svfloat32_t v487 = svmul_f32_x(svptrue_b32(), v386, v1026); + svfloat32_t zero523 = svdup_n_f32(0); svfloat32_t v523 = 
svcmla_f32_x(pred_full, zero523, v1032, v400, 90); - svfloat32_t zero544; - asm volatile("mov %0.s, #0" : "=w"(zero544)); + svfloat32_t zero544 = svdup_n_f32(0); svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1035, v403, 90); - svfloat32_t zero565; - asm volatile("mov %0.s, #0" : "=w"(zero565)); + svfloat32_t zero565 = svdup_n_f32(0); svfloat32_t v565 = svcmla_f32_x(pred_full, zero565, v1038, v406, 90); - svfloat32_t zero586; - asm volatile("mov %0.s, #0" : "=w"(zero586)); + svfloat32_t zero586 = svdup_n_f32(0); svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1041, v409, 90); svfloat32_t v652 = svmla_f32_x(pred_full, v492, v379, v1019); svfloat32_t v653 = svnmls_f32_x(pred_full, v447, v387, v1027); svfloat32_t v654 = svmla_f32_x(pred_full, v497, v377, v1017); svfloat32_t v655 = svnmls_f32_x(pred_full, v497, v376, v1016); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v411)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v416), "w"(v363)); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v413, v411); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v363); svfloat32_t v650 = svmla_f32_x(pred_full, v487, v384, v1024); svfloat32_t v651 = svnmls_f32_x(pred_full, v487, v385, v1025); svfloat32_t v656 = svnmls_f32_x(pred_full, v462, v389, v1029); @@ -8042,150 +7188,85 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs17(const armral_cmplx_f32_t *restrict x, svfloat32_t v684 = svcmla_f32_x(pred_full, v586, v1040, v408, 90); svst1_scatter_s64index_f64(pred_full, (double *)(v1058), v1203, svreinterpret_f64_f32(v427)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v417), "w"(v369)); - svfloat32_t zero635; - asm volatile("mov %0.s, #0" : "=w"(zero635)); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v417, v369); + svfloat32_t zero635 = svdup_n_f32(0); svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1048, v414, 90); svfloat32_t v659 = svmla_f32_x(pred_full, v658, v383, v1023); svfloat32_t 
v660 = svmls_f32_x(pred_full, v658, v383, v1023); - svfloat32_t v661; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v661) : "w"(v650), "w"(v652)); - svfloat32_t v663; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v651), "w"(v653)); - svfloat32_t v665; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v650), "w"(v654)); - svfloat32_t v667; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v651), "w"(v655)); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v677), "w"(v679)); - svfloat32_t v689; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v677), "w"(v679)); - svfloat32_t v690; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v680)); - svfloat32_t v691; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v678), "w"(v680)); - svfloat32_t v692; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v681), "w"(v683)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v683), "w"(v681)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v682), "w"(v684)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v684), "w"(v682)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v414), "w"(v418)); - svfloat32_t zero642; - asm volatile("mov %0.s, #0" : "=w"(zero642)); + svfloat32_t v661 = svsub_f32_x(svptrue_b32(), v650, v652); + svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v651, v653); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v650, v654); + svfloat32_t v667 = svadd_f32_x(svptrue_b32(), v651, v655); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v690 = svadd_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v683, v681); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v682, v684); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v684, v682); + 
svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v414, v418); + svfloat32_t zero642 = svdup_n_f32(0); svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1049, v418, 90); - svfloat32_t v662; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v656), "w"(v659)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v657), "w"(v660)); - svfloat32_t v666; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v659), "w"(v656)); - svfloat32_t v668; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v660), "w"(v657)); - svfloat32_t v705; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v690), "w"(v694)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v689), "w"(v695)); - svfloat32_t v709; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v688), "w"(v692)); - svfloat32_t v711; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v695), "w"(v689)); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v688), "w"(v692)); - svfloat32_t v716; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v693), "w"(v691)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v694), "w"(v690)); - svfloat32_t v722; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v691), "w"(v693)); - svfloat32_t v669; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v661), "w"(v662)); - svfloat32_t v670; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v670) : "w"(v663), "w"(v664)); - svfloat32_t v671; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v665), "w"(v666)); - svfloat32_t v672; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v667), "w"(v668)); - svfloat32_t v673; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v662), "w"(v661)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v664), "w"(v663)); - svfloat32_t v675; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v666), "w"(v665)); - svfloat32_t v676; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v668), "w"(v667)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : 
"w"(v635), "w"(v642)); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v656, v659); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v657, v660); + svfloat32_t v666 = svsub_f32_x(svptrue_b32(), v659, v656); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v660, v657); + svfloat32_t v705 = svadd_f32_x(svptrue_b32(), v690, v694); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v689, v695); + svfloat32_t v709 = svsub_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v711 = svsub_f32_x(svptrue_b32(), v695, v689); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v693, v691); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v694, v690); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v661, v662); + svfloat32_t v670 = svadd_f32_x(svptrue_b32(), v663, v664); + svfloat32_t v671 = svadd_f32_x(svptrue_b32(), v665, v666); + svfloat32_t v672 = svadd_f32_x(svptrue_b32(), v667, v668); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v662, v661); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v664, v663); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v666, v665); + svfloat32_t v676 = svsub_f32_x(svptrue_b32(), v668, v667); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v635, v642); svfloat32_t v685 = svcmla_f32_x(pred_full, v642, v1050, v419, 90); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v696), "w"(v696)); - svfloat32_t v723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v722), "w"(v696)); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v696, v696); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v722, v696); svfloat32_t v686 = svcmla_f32_x(pred_full, v685, v1042, v410, 90); - svfloat32_t v699; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v614), "w"(v698)); - svfloat32_t v702; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v685), "w"(v685)); - svfloat32_t v720; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v720) : "w"(v719), "w"(v698)); 
- svfloat32_t v763; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v676), "w"(v723)); - svfloat32_t v771; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v676), "w"(v723)); + svfloat32_t v699 = svsub_f32_x(svptrue_b32(), v614, v698); + svfloat32_t v702 = svadd_f32_x(svptrue_b32(), v685, v685); + svfloat32_t v720 = svadd_f32_x(svptrue_b32(), v719, v698); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v676, v723); + svfloat32_t v771 = svsub_f32_x(svptrue_b32(), v676, v723); svfloat32_t v687 = svcmla_f32_x(pred_full, v686, v1043, v355, 90); svfloat32_t v697 = svcmla_f32_x(pred_full, v686, v1044, v363, 90); svfloat32_t v700 = svcmla_f32_x(pred_full, v699, v1046, v361, 90); svfloat32_t v701 = svcmla_f32_x(pred_full, v699, v1047, v369, 90); - svfloat32_t v703; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v702), "w"(v702)); - svfloat32_t v704; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v696), "w"(v702)); - svfloat32_t v710; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v710) : "w"(v709), "w"(v702)); - svfloat32_t v721; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v720), "w"(v702)); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v702, v702); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v696, v702); + svfloat32_t v710 = svadd_f32_x(svptrue_b32(), v709, v702); + svfloat32_t v721 = svadd_f32_x(svptrue_b32(), v720, v702); svst1_scatter_s64index_f64(pred_full, (double *)(v1103), v1203, svreinterpret_f64_f32(v763)); svst1_scatter_s64index_f64(pred_full, (double *)(v1112), v1203, svreinterpret_f64_f32(v771)); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v705), "w"(v697)); - svfloat32_t v708; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v707), "w"(v700)); - svfloat32_t v712; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v712) : "w"(v711), "w"(v704)); - svfloat32_t v714; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v713), "w"(v687)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v716), "w"(v701)); - svfloat32_t v747; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v671), "w"(v710)); - svfloat32_t v755; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v671), "w"(v710)); - svfloat32_t v843; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v843) : "w"(v675), "w"(v721)); - svfloat32_t v851; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v851) : "w"(v675), "w"(v721)); - svfloat32_t v715; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v715) : "w"(v714), "w"(v696)); - svfloat32_t v718; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v717), "w"(v703)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v669), "w"(v706)); - svfloat32_t v739; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v669), "w"(v706)); - svfloat32_t v795; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v672), "w"(v712)); - svfloat32_t v803; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v803) : "w"(v672), "w"(v712)); - svfloat32_t v811; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v811) : "w"(v670), "w"(v708)); - svfloat32_t v819; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v819) : "w"(v670), "w"(v708)); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v705, v697); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v707, v700); + svfloat32_t v712 = svsub_f32_x(svptrue_b32(), v711, v704); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v713, v687); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v716, v701); + svfloat32_t v747 = svadd_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v755 = svsub_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v843 = svadd_f32_x(svptrue_b32(), v675, v721); + svfloat32_t v851 = svsub_f32_x(svptrue_b32(), v675, v721); + svfloat32_t v715 = svadd_f32_x(svptrue_b32(), v714, v696); + svfloat32_t v718 = svadd_f32_x(svptrue_b32(), v717, v703); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v669, v706); + svfloat32_t v739 = svsub_f32_x(svptrue_b32(), v669, v706); + svfloat32_t v795 = svadd_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v811 = svadd_f32_x(svptrue_b32(), 
v670, v708); + svfloat32_t v819 = svsub_f32_x(svptrue_b32(), v670, v708); svst1_scatter_s64index_f64(pred_full, (double *)(v1085), v1203, svreinterpret_f64_f32(v747)); svst1_scatter_s64index_f64(pred_full, (double *)(v1094), v1203, @@ -8194,14 +7275,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs17(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v843)); svst1_scatter_s64index_f64(pred_full, (double *)(v1202), v1203, svreinterpret_f64_f32(v851)); - svfloat32_t v779; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v779) : "w"(v673), "w"(v715)); - svfloat32_t v787; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v787) : "w"(v673), "w"(v715)); - svfloat32_t v827; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v827) : "w"(v674), "w"(v718)); - svfloat32_t v835; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v674), "w"(v718)); + svfloat32_t v779 = svadd_f32_x(svptrue_b32(), v673, v715); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v673, v715); + svfloat32_t v827 = svadd_f32_x(svptrue_b32(), v674, v718); + svfloat32_t v835 = svsub_f32_x(svptrue_b32(), v674, v718); svst1_scatter_s64index_f64(pred_full, (double *)(v1067), v1203, svreinterpret_f64_f32(v731)); svst1_scatter_s64index_f64(pred_full, (double *)(v1076), v1203, @@ -8754,8 +7831,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs18(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v204])); svfloat32_t v240 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v239])); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v844, v247, 0), v844, v247, 90); @@ -8803,264 +7879,164 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs18(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v889), v909)); svfloat32_t v900 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v898), v909)); - svfloat32_t zero38; - asm 
volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v754, v37, 0), v754, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v763, v72, 0), v763, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v772, v79, 0), v772, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v781, v114, 0), v781, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v790, v121, 0), v790, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v799, v156, 0), v799, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v808, v163, 0), v808, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v817, v198, 0), v817, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v826, v205, 0), v826, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : 
"=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v835, v240, 0), v835, v240, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v855, v282, 0), v855, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v864, v289, 0), v864, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v873, v324, 0), v873, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v882, v331, 0), v882, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v891, v366, 0), v891, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero374, v900, v373, 0), v900, v373, 90); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v910), "w"(v38)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v910), "w"(v38)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v73), "w"(v80)); - svfloat32_t v385; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v73), "w"(v80)); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v115), "w"(v122)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v115), 
"w"(v122)); - svfloat32_t v388; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v157), "w"(v164)); - svfloat32_t v389; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v157), "w"(v164)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v199), "w"(v206)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v199), "w"(v206)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v241), "w"(v248)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v241), "w"(v248)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v283), "w"(v290)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v283), "w"(v290)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v325), "w"(v332)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v325), "w"(v332)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v367), "w"(v374)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v367), "w"(v374)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v384), "w"(v398)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v384), "w"(v398)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v396), "w"(v386)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v396), "w"(v386)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v388), "w"(v394)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v388), "w"(v394)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v390), "w"(v392)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v390), "w"(v392)); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v385), "w"(v399)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v385), "w"(v399)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v512) : "w"(v397), "w"(v387)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v397), "w"(v387)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v389), "w"(v395)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v389), "w"(v395)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v391), "w"(v393)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v391), "w"(v393)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v400), "w"(v402)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v401), "w"(v403)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v400), "w"(v402)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v402), "w"(v406)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v406), "w"(v400)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v401), "w"(v403)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v403), "w"(v407)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v407), "w"(v401)); - svfloat32_t zero448; - asm volatile("mov %0.s, #0" : "=w"(zero448)); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v910, v38); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v910, v38); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v394 = 
svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v406); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v406, v400); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v403, v407); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v407, v401); + svfloat32_t zero448 = svdup_n_f32(0); svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v926, v405, 90); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v510), 
"w"(v512)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v511), "w"(v513)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v510), "w"(v512)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v512), "w"(v516)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v516), "w"(v510)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v511), "w"(v513)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v513), "w"(v517)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v517), "w"(v511)); - svfloat32_t zero558; - asm volatile("mov %0.s, #0" : "=w"(zero558)); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v512, v516); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v516, v510); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v513, v517); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v517, v511); + svfloat32_t zero558 = svdup_n_f32(0); svfloat32_t v558 = svcmla_f32_x(pred_full, zero558, v926, v515, 90); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v406)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v407)); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v406); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v407); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v930, v417, 90); - svfloat32_t zero477; - asm volatile("mov %0.s, #0" : "=w"(zero477)); + svfloat32_t zero477 = svdup_n_f32(0); svfloat32_t v477 = svcmla_f32_x(pred_full, zero477, v931, v418, 90); - svfloat32_t zero484; - asm volatile("mov 
%0.s, #0" : "=w"(zero484)); + svfloat32_t zero484 = svdup_n_f32(0); svfloat32_t v484 = svcmla_f32_x(pred_full, zero484, v932, v419, 90); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v518), "w"(v516)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v522), "w"(v517)); - svfloat32_t zero580; - asm volatile("mov %0.s, #0" : "=w"(zero580)); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v518, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v522, v517); + svfloat32_t zero580 = svdup_n_f32(0); svfloat32_t v580 = svcmla_f32_x(pred_full, zero580, v930, v527, 90); - svfloat32_t zero587; - asm volatile("mov %0.s, #0" : "=w"(zero587)); + svfloat32_t zero587 = svdup_n_f32(0); svfloat32_t v587 = svcmla_f32_x(pred_full, zero587, v931, v528, 90); - svfloat32_t zero594; - asm volatile("mov %0.s, #0" : "=w"(zero594)); + svfloat32_t zero594 = svdup_n_f32(0); svfloat32_t v594 = svcmla_f32_x(pred_full, zero594, v932, v529, 90); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v409), "w"(v404)); - svfloat32_t v429; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v409), "w"(v923)); - svfloat32_t zero436; - asm volatile("mov %0.s, #0" : "=w"(zero436)); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v409, v404); + svfloat32_t v429 = svmul_f32_x(svptrue_b32(), v409, v923); + svfloat32_t zero436 = svdup_n_f32(0); svfloat32_t v436 = svcmla_f32_x(pred_full, zero436, v926, v413, 90); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v448), "w"(v470)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v448), "w"(v477)); - svfloat32_t v502; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v448), "w"(v470)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v519), "w"(v514)); - svfloat32_t v539; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v519), "w"(v923)); - svfloat32_t zero546; - asm volatile("mov %0.s, #0" : "=w"(zero546)); + svfloat32_t v498 = 
svadd_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v448, v477); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v514); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v519, v923); + svfloat32_t zero546 = svdup_n_f32(0); svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v926, v523, 90); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v558), "w"(v580)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v558), "w"(v587)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v558), "w"(v580)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v410), "w"(v382)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v429), "w"(v429)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v477)); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v484)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v484)); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v383)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v539), "w"(v539)); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v587)); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v610), "w"(v594)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v612), "w"(v594)); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v558, v587); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v410, v382); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v429, v429); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v477); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v484); + svfloat32_t v503 = 
svsub_f32_x(svptrue_b32(), v502, v484); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v383); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v539, v539); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v587); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v610, v594); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v612, v594); svfloat32_t v486 = svmla_f32_x(pred_full, v485, v409, v923); svfloat32_t v490 = svmla_f32_x(pred_full, v411, v404, v925); svfloat32_t v596 = svmla_f32_x(pred_full, v595, v519, v923); @@ -9069,25 +8045,17 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs18(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v411)); svst1_scatter_s64index_f64(pred_full, (double *)(v949), v1094, svreinterpret_f64_f32(v521)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v411), "w"(v486)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v490), "w"(v485)); - svfloat32_t v597; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v521), "w"(v596)); - svfloat32_t v601; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v600), "w"(v595)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v436)); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v487), "w"(v436)); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v411, v486); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v490, v485); + svfloat32_t v597 = svadd_f32_x(svptrue_b32(), v521, v596); + svfloat32_t v601 = svadd_f32_x(svptrue_b32(), v600, v595); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v487, v436); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v487, v436); svfloat32_t v492 = svmla_f32_x(pred_full, v491, v414, v927); svfloat32_t v494 = svmls_f32_x(pred_full, v491, v415, v928); svfloat32_t v496 = svmls_f32_x(pred_full, v491, v414, v927); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v597), "w"(v546)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v597), 
"w"(v546)); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v597, v546); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v597, v546); svfloat32_t v602 = svmla_f32_x(pred_full, v601, v524, v927); svfloat32_t v604 = svmls_f32_x(pred_full, v601, v525, v928); svfloat32_t v606 = svmls_f32_x(pred_full, v601, v524, v927); @@ -9105,30 +8073,18 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs18(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v488)); svst1_scatter_s64index_f64(pred_full, (double *)(v1057), v1094, svreinterpret_f64_f32(v598)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v493), "w"(v499)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v493), "w"(v499)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v495), "w"(v501)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v495), "w"(v501)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v497), "w"(v503)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v497), "w"(v503)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v603), "w"(v609)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v603), "w"(v609)); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v605), "w"(v611)); - svfloat32_t v617; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v605), "w"(v611)); - svfloat32_t v618; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v607), "w"(v613)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v607), "w"(v613)); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v614 = 
svadd_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v605, v611); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v605, v611); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v607, v613); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v613); svst1_scatter_s64index_f64(pred_full, (double *)(v958), v1094, svreinterpret_f64_f32(v505)); svst1_scatter_s64index_f64(pred_full, (double *)(v967), v1094, @@ -9945,8 +8901,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs19(const armral_cmplx_f32_t *restrict x, float32x2_t *v1319 = &v6[v933]; float32x2_t *v1328 = &v6[v941]; float32x2_t *v1337 = &v6[v949]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v963, v51, 0), v963, v51, 90); @@ -10018,455 +8973,274 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs19(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1107), v1127)); svfloat32_t v1118 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1116), v1127)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v973, v58, 0), v973, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v992, v93, 0), v992, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v983, v100, 0), v983, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x( 
pred_full, svcmla_f32_x(pred_full, zero136, v1001, v135, 0), v1001, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1010, v142, 0), v1010, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero178, v1028, v177, 0), v1028, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1019, v184, 0), v1019, v184, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero220, v1037, v219, 0), v1037, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1046, v226, 0), v1046, v226, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, v1064, v261, 0), v1064, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1055, v268, 0), v1055, v268, 90); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero304, v1073, v303, 0), v1073, v303, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = 
svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero311, v1082, v310, 0), v1082, v310, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero346, v1100, v345, 0), v1100, v345, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v1091, v352, 0), v1091, v352, 90); - svfloat32_t zero388; - asm volatile("mov %0.s, #0" : "=w"(zero388)); + svfloat32_t zero388 = svdup_n_f32(0); svfloat32_t v388 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero388, v1109, v387, 0), v1109, v387, 90); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero395, v1118, v394, 0), v1118, v394, 90); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v52), "w"(v59)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v52), "w"(v59)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v101), "w"(v94)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v94), "w"(v101)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v136), "w"(v143)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v136), "w"(v143)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v185), "w"(v178)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v178), "w"(v185)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v220), "w"(v227)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v220), "w"(v227)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v269), "w"(v262)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v407) : "w"(v262), "w"(v269)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v304), "w"(v311)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v304), "w"(v311)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v353), "w"(v346)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v346), "w"(v353)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v388), "w"(v395)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v388), "w"(v395)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v396), "w"(v408)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v398), "w"(v410)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v400), "w"(v412)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v402), "w"(v408)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v404), "w"(v410)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v406), "w"(v412)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v396), "w"(v402)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v398), "w"(v404)); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v400), "w"(v406)); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v397), "w"(v409)); - svfloat32_t v455; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v399), "w"(v411)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v401), "w"(v413)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v403), "w"(v409)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v405), "w"(v411)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v407), "w"(v413)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v397), "w"(v403)); - svfloat32_t v462; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v399), "w"(v405)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v401), "w"(v407)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v408)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v422), "w"(v410)); - svfloat32_t v425; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v424), "w"(v412)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v414), "w"(v416)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v417), "w"(v419)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v414), "w"(v417)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v416), "w"(v419)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v460), "w"(v409)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v462), "w"(v411)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v464), "w"(v413)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v454), "w"(v456)); - svfloat32_t v467; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v457), "w"(v459)); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v454), "w"(v457)); - svfloat32_t v477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v456), "w"(v459)); - svfloat32_t zero641; - asm volatile("mov %0.s, #0" : "=w"(zero641)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v101, v94); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v185, v178); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v220, v227); 
+ svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v269, v262); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v353, v346); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v396, v408); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v410); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v400, v412); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v402, v408); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v404, v410); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v406, v412); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v396, v402); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v400, v406); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v397, v409); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v399, v411); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v401, v413); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v403, v409); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v405, v411); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v407, v413); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v399, v405); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v401, v407); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v420, v408); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v424, v412); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v414, v416); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v417, v419); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), 
v414, v417); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v416, v419); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v460, v409); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v411); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v464, v413); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v467 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v454, v457); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v456, v459); + svfloat32_t zero641 = svdup_n_f32(0); svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1156, v457, 90); - svfloat32_t zero662; - asm volatile("mov %0.s, #0" : "=w"(zero662)); + svfloat32_t zero662 = svdup_n_f32(0); svfloat32_t v662 = svcmla_f32_x(pred_full, zero662, v1159, v459, 90); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v421), "w"(v423)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v427), "w"(v418)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v426), "w"(v415)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v427), "w"(v418)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v426), "w"(v415)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v414), "w"(v445)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v444), "w"(v419)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v421), "w"(v425)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v423), "w"(v425)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v461), "w"(v463)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v467), "w"(v458)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v466), "w"(v455)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v467), "w"(v458)); - svfloat32_t v474; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v474) 
: "w"(v466), "w"(v455)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v454), "w"(v477)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v476), "w"(v459)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v461), "w"(v465)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v463), "w"(v465)); - svfloat32_t v429; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v428), "w"(v425)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v439), "w"(v438)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v442), "w"(v441)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v446), "w"(v418)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v448), "w"(v415)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v451), "w"(v452)); - svfloat32_t v469; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v468), "w"(v465)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v471), "w"(v470)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v474), "w"(v473)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v458)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v455)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v483), "w"(v484)); - svfloat32_t v505; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v439), "w"(v1132)); - svfloat32_t v520; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v442), "w"(v1135)); - svfloat32_t zero599; - asm volatile("mov %0.s, #0" : "=w"(zero599)); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v421, v423); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v426, v415); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v426, v415); + 
svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v414, v445); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v444, v419); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v421, v425); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v461, v463); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v454, v477); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v476, v459); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v461, v465); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v465); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v428, v425); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v439, v438); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v442, v441); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v446, v418); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v448, v415); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v451, v452); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v468, v465); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v471, v470); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v474, v473); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v478, v458); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v455); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v483, v484); + svfloat32_t v505 = svmul_f32_x(svptrue_b32(), v439, v1132); + svfloat32_t v520 = svmul_f32_x(svptrue_b32(), v442, v1135); + svfloat32_t zero599 = svdup_n_f32(0); svfloat32_t v599 = svcmla_f32_x(pred_full, zero599, v1150, v470, 90); - svfloat32_t zero620; - asm volatile("mov %0.s, #0" : "=w"(zero620)); + svfloat32_t zero620 = svdup_n_f32(0); svfloat32_t v620 = svcmla_f32_x(pred_full, zero620, v1153, v473, 90); - svfloat32_t zero704; - asm volatile("mov %0.s, #0" : 
"=w"(zero704)); + svfloat32_t zero704 = svdup_n_f32(0); svfloat32_t v704 = svcmla_f32_x(pred_full, zero704, v1165, v483, 90); - svfloat32_t zero711; - asm volatile("mov %0.s, #0" : "=w"(zero711)); + svfloat32_t zero711 = svdup_n_f32(0); svfloat32_t v711 = svcmla_f32_x(pred_full, zero711, v1166, v484, 90); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v1128), "w"(v429)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v447), "w"(v449)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v479), "w"(v481)); - svfloat32_t v510; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v440), "w"(v1133)); - svfloat32_t v525; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v443), "w"(v1136)); - svfloat32_t v585; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v453), "w"(v1148)); - svfloat32_t zero592; - asm volatile("mov %0.s, #0" : "=w"(zero592)); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v1128, v429); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v479, v481); + svfloat32_t v510 = svmul_f32_x(svptrue_b32(), v440, v1133); + svfloat32_t v525 = svmul_f32_x(svptrue_b32(), v443, v1136); + svfloat32_t v585 = svmul_f32_x(svptrue_b32(), v453, v1148); + svfloat32_t zero592 = svdup_n_f32(0); svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1149, v469, 90); - svfloat32_t zero718; - asm volatile("mov %0.s, #0" : "=w"(zero718)); + svfloat32_t zero718 = svdup_n_f32(0); svfloat32_t v718 = svcmla_f32_x(pred_full, zero718, v1167, v485, 90); svfloat32_t v719 = svmla_f32_x(pred_full, v505, v438, v1131); svfloat32_t v720 = svmla_f32_x(pred_full, v520, v441, v1134); svfloat32_t v750 = svcmla_f32_x(pred_full, v599, v1151, v471, 90); svfloat32_t v751 = svcmla_f32_x(pred_full, v620, v1154, v474, 90); - svfloat32_t v570; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v450), "w"(v1145)); - svfloat32_t zero697; - asm volatile("mov %0.s, #0" : "=w"(zero697)); + 
svfloat32_t v570 = svmul_f32_x(svptrue_b32(), v450, v1145); + svfloat32_t zero697 = svdup_n_f32(0); svfloat32_t v697 = svcmla_f32_x(pred_full, zero697, v1164, v482, 90); - svfloat32_t v722; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v719), "w"(v720)); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v719, v720); svfloat32_t v723 = svmla_f32_x(pred_full, v510, v438, v1131); svfloat32_t v724 = svmla_f32_x(pred_full, v525, v441, v1134); - svfloat32_t v741; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v741) : "w"(v719), "w"(v720)); + svfloat32_t v741 = svsub_f32_x(svptrue_b32(), v719, v720); svfloat32_t v743 = svnmls_f32_x(pred_full, v585, v451, v1146); svfloat32_t v744 = svnmls_f32_x(pred_full, v585, v452, v1147); svfloat32_t v745 = svmla_f32_x(pred_full, v437, v429, v1130); - svfloat32_t v753; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v753) : "w"(v750), "w"(v751)); + svfloat32_t v753 = svadd_f32_x(svptrue_b32(), v750, v751); svfloat32_t v754 = svcmla_f32_x(pred_full, v599, v1152, v472, 90); svfloat32_t v755 = svcmla_f32_x(pred_full, v620, v1155, v475, 90); - svfloat32_t v772; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v772) : "w"(v750), "w"(v751)); - svfloat32_t v774; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v774) : "w"(v704), "w"(v718)); - svfloat32_t v775; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v711), "w"(v718)); + svfloat32_t v772 = svsub_f32_x(svptrue_b32(), v750, v751); + svfloat32_t v774 = svsub_f32_x(svptrue_b32(), v704, v718); + svfloat32_t v775 = svsub_f32_x(svptrue_b32(), v711, v718); svst1_scatter_s64index_f64(pred_full, (double *)(v1175), v1338, svreinterpret_f64_f32(v437)); svfloat32_t v721 = svmla_f32_x(pred_full, v570, v449, v1144); svfloat32_t v725 = svmla_f32_x(pred_full, v570, v447, v1143); svfloat32_t v726 = svnmls_f32_x(pred_full, v722, v417, v1137); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v723), "w"(v724)); - svfloat32_t v733; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v733) : "w"(v723), "w"(v724)); + svfloat32_t v727 = 
svadd_f32_x(svptrue_b32(), v723, v724); + svfloat32_t v733 = svsub_f32_x(svptrue_b32(), v723, v724); svfloat32_t v738 = svmla_f32_x(pred_full, v722, v416, v1142); - svfloat32_t v746; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v746) : "w"(v745), "w"(v743)); - svfloat32_t v747; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v745), "w"(v743)); - svfloat32_t v749; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v749) : "w"(v745), "w"(v744)); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v745, v744); svfloat32_t v752 = svcmla_f32_x(pred_full, v697, v1163, v481, 90); svfloat32_t v756 = svcmla_f32_x(pred_full, v697, v1162, v479, 90); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v641), "w"(v753)); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v754), "w"(v755)); - svfloat32_t v764; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v764) : "w"(v754), "w"(v755)); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v641, v753); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v754, v755); + svfloat32_t v764 = svsub_f32_x(svptrue_b32(), v754, v755); svfloat32_t v769 = svcmla_f32_x(pred_full, v753, v1161, v456, 90); - svfloat32_t v776; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v592), "w"(v774)); - svfloat32_t v777; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v777) : "w"(v592), "w"(v774)); - svfloat32_t v779; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v779) : "w"(v592), "w"(v775)); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v779 = svadd_f32_x(svptrue_b32(), v592, v775); svfloat32_t v728 = svnmls_f32_x(pred_full, v725, v419, v1140); svfloat32_t v729 = svmla_f32_x(pred_full, v721, v444, v1138); svfloat32_t v731 = svmla_f32_x(pred_full, v727, v445, v1141); - svfloat32_t v734; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v734) : "w"(v733), "w"(v721)); - svfloat32_t v735; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v735) : "w"(v726), "w"(v727)); - svfloat32_t v742; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v742) : "w"(v741), "w"(v725)); - svfloat32_t v748; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v748) : "w"(v747), "w"(v744)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v662), "w"(v756)); + svfloat32_t v734 = svadd_f32_x(svptrue_b32(), v733, v721); + svfloat32_t v735 = svadd_f32_x(svptrue_b32(), v726, v727); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v741, v725); + svfloat32_t v748 = svsub_f32_x(svptrue_b32(), v747, v744); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v662, v756); svfloat32_t v760 = svcmla_f32_x(pred_full, v752, v1157, v476, 90); svfloat32_t v762 = svcmla_f32_x(pred_full, v758, v1160, v477, 90); - svfloat32_t v765; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v764), "w"(v752)); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v757), "w"(v758)); - svfloat32_t v773; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v773) : "w"(v772), "w"(v756)); - svfloat32_t v778; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v778) : "w"(v777), "w"(v775)); - svfloat32_t v730; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v729), "w"(v726)); - svfloat32_t v732; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v731), "w"(v728)); + svfloat32_t v765 = svadd_f32_x(svptrue_b32(), v764, v752); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v757, v758); + svfloat32_t v773 = svadd_f32_x(svptrue_b32(), v772, v756); + svfloat32_t v778 = svsub_f32_x(svptrue_b32(), v777, v775); + svfloat32_t v730 = svadd_f32_x(svptrue_b32(), v729, v726); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v731, v728); svfloat32_t v736 = svmla_f32_x(pred_full, v735, v414, v1139); - svfloat32_t v739; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v738), "w"(v728)); - svfloat32_t v761; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v761) : "w"(v760), "w"(v757)); - svfloat32_t v763; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v762), "w"(v759)); + 
svfloat32_t v739 = svadd_f32_x(svptrue_b32(), v738, v728); + svfloat32_t v761 = svadd_f32_x(svptrue_b32(), v760, v757); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v762, v759); svfloat32_t v767 = svcmla_f32_x(pred_full, v766, v1158, v454, 90); - svfloat32_t v770; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v770) : "w"(v769), "w"(v759)); - svfloat32_t v784; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v784) : "w"(v742), "w"(v734)); - svfloat32_t v788; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v788) : "w"(v749), "w"(v742)); - svfloat32_t v791; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v791) : "w"(v734), "w"(v749)); - svfloat32_t v796; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v796) : "w"(v773), "w"(v765)); - svfloat32_t v800; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v800) : "w"(v773), "w"(v779)); - svfloat32_t v803; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v803) : "w"(v765), "w"(v779)); - svfloat32_t v737; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v737) : "w"(v736), "w"(v725)); - svfloat32_t v740; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v740) : "w"(v739), "w"(v721)); - svfloat32_t v768; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v767), "w"(v756)); - svfloat32_t v771; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v770), "w"(v752)); - svfloat32_t v785; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v784), "w"(v749)); - svfloat32_t v789; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v789) : "w"(v730), "w"(v746)); - svfloat32_t v790; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v790) : "w"(v732), "w"(v748)); - svfloat32_t v797; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v797) : "w"(v796), "w"(v779)); - svfloat32_t v801; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v761), "w"(v776)); - svfloat32_t v802; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v763), "w"(v778)); - svfloat32_t v827; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v827) : "w"(v791), "w"(v803)); - svfloat32_t v835; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v791), "w"(v803)); - svfloat32_t v843; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v843) : "w"(v788), 
"w"(v800)); - svfloat32_t v851; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v851) : "w"(v788), "w"(v800)); - svfloat32_t v780; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v780) : "w"(v737), "w"(v730)); - svfloat32_t v782; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v782) : "w"(v740), "w"(v732)); - svfloat32_t v786; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v746), "w"(v737)); - svfloat32_t v787; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v787) : "w"(v748), "w"(v740)); - svfloat32_t v792; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v792) : "w"(v768), "w"(v761)); - svfloat32_t v794; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v771), "w"(v763)); - svfloat32_t v798; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v798) : "w"(v776), "w"(v768)); - svfloat32_t v799; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v778), "w"(v771)); - svfloat32_t v859; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v859) : "w"(v790), "w"(v802)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v790), "w"(v802)); - svfloat32_t v875; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v875) : "w"(v785), "w"(v797)); - svfloat32_t v883; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v883) : "w"(v785), "w"(v797)); - svfloat32_t v923; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v923) : "w"(v789), "w"(v801)); - svfloat32_t v931; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v931) : "w"(v789), "w"(v801)); + svfloat32_t v770 = svadd_f32_x(svptrue_b32(), v769, v759); + svfloat32_t v784 = svsub_f32_x(svptrue_b32(), v742, v734); + svfloat32_t v788 = svsub_f32_x(svptrue_b32(), v749, v742); + svfloat32_t v791 = svadd_f32_x(svptrue_b32(), v734, v749); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v773, v765); + svfloat32_t v800 = svsub_f32_x(svptrue_b32(), v773, v779); + svfloat32_t v803 = svadd_f32_x(svptrue_b32(), v765, v779); + svfloat32_t v737 = svadd_f32_x(svptrue_b32(), v736, v725); + svfloat32_t v740 = svadd_f32_x(svptrue_b32(), v739, v721); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v767, v756); + svfloat32_t v771 = svadd_f32_x(svptrue_b32(), v770, 
v752); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v784, v749); + svfloat32_t v789 = svadd_f32_x(svptrue_b32(), v730, v746); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v732, v748); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v796, v779); + svfloat32_t v801 = svadd_f32_x(svptrue_b32(), v761, v776); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v763, v778); + svfloat32_t v827 = svsub_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v835 = svadd_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v843 = svadd_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v851 = svsub_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v780 = svsub_f32_x(svptrue_b32(), v737, v730); + svfloat32_t v782 = svsub_f32_x(svptrue_b32(), v740, v732); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v746, v737); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v748, v740); + svfloat32_t v792 = svsub_f32_x(svptrue_b32(), v768, v761); + svfloat32_t v794 = svsub_f32_x(svptrue_b32(), v771, v763); + svfloat32_t v798 = svsub_f32_x(svptrue_b32(), v776, v768); + svfloat32_t v799 = svsub_f32_x(svptrue_b32(), v778, v771); + svfloat32_t v859 = svadd_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v875 = svadd_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v883 = svsub_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v923 = svsub_f32_x(svptrue_b32(), v789, v801); + svfloat32_t v931 = svadd_f32_x(svptrue_b32(), v789, v801); svst1_scatter_s64index_f64(pred_full, (double *)(v1202), v1338, svreinterpret_f64_f32(v827)); svst1_scatter_s64index_f64(pred_full, (double *)(v1211), v1338, @@ -10475,22 +9249,14 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs19(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v843)); svst1_scatter_s64index_f64(pred_full, (double *)(v1229), v1338, svreinterpret_f64_f32(v851)); - svfloat32_t v781; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v780), "w"(v746)); - svfloat32_t v783; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v783) : "w"(v782), "w"(v748)); - svfloat32_t v793; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v793) : "w"(v792), "w"(v776)); - svfloat32_t v795; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v794), "w"(v778)); - svfloat32_t v891; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v891) : "w"(v787), "w"(v799)); - svfloat32_t v899; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v899) : "w"(v787), "w"(v799)); - svfloat32_t v907; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v907) : "w"(v786), "w"(v798)); - svfloat32_t v915; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v915) : "w"(v786), "w"(v798)); + svfloat32_t v781 = svadd_f32_x(svptrue_b32(), v780, v746); + svfloat32_t v783 = svadd_f32_x(svptrue_b32(), v782, v748); + svfloat32_t v793 = svadd_f32_x(svptrue_b32(), v792, v776); + svfloat32_t v795 = svadd_f32_x(svptrue_b32(), v794, v778); + svfloat32_t v891 = svadd_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v899 = svsub_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v907 = svadd_f32_x(svptrue_b32(), v786, v798); + svfloat32_t v915 = svsub_f32_x(svptrue_b32(), v786, v798); svst1_scatter_s64index_f64(pred_full, (double *)(v1238), v1338, svreinterpret_f64_f32(v859)); svst1_scatter_s64index_f64(pred_full, (double *)(v1247), v1338, @@ -10503,14 +9269,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs19(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v923)); svst1_scatter_s64index_f64(pred_full, (double *)(v1319), v1338, svreinterpret_f64_f32(v931)); - svfloat32_t v811; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v811) : "w"(v781), "w"(v793)); - svfloat32_t v819; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v819) : "w"(v781), "w"(v793)); - svfloat32_t v939; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v939) : "w"(v783), "w"(v795)); - svfloat32_t v947; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v947) : "w"(v783), "w"(v795)); + svfloat32_t v811 = svadd_f32_x(svptrue_b32(), v781, v793); + svfloat32_t v819 = svsub_f32_x(svptrue_b32(), v781, v793); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v783, v795); + svfloat32_t v947 = 
svsub_f32_x(svptrue_b32(), v783, v795); svst1_scatter_s64index_f64(pred_full, (double *)(v1274), v1338, svreinterpret_f64_f32(v891)); svst1_scatter_s64index_f64(pred_full, (double *)(v1283), v1338, @@ -11114,8 +9876,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs20(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v365])); svfloat32_t v373 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v372])); - svfloat32_t zero409; - asm volatile("mov %0.s, #0" : "=w"(zero409)); + svfloat32_t zero409 = svdup_n_f32(0); svfloat32_t v409 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero409, v977, v408, 0), v977, v408, 90); @@ -11157,269 +9918,174 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs20(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v966), v997)); svfloat32_t v987 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v985), v997)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v824, v37, 0), v824, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v833, v72, 0), v833, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v842, v79, 0), v842, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v851, v114, 0), v851, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, 
v860, v121, 0), v860, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v869, v156, 0), v869, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v878, v163, 0), v878, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v887, v198, 0), v887, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v896, v205, 0), v896, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v905, v240, 0), v905, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v914, v247, 0), v914, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v923, v282, 0), v923, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v932, v289, 0), v932, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v941, v324, 0), 
v941, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v950, v331, 0), v950, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v959, v366, 0), v959, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero374, v968, v373, 0), v968, v373, 90); - svfloat32_t zero416; - asm volatile("mov %0.s, #0" : "=w"(zero416)); + svfloat32_t zero416 = svdup_n_f32(0); svfloat32_t v416 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero416, v987, v415, 0), v987, v415, 90); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v998), "w"(v38)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v998), "w"(v38)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v73), "w"(v80)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v73), "w"(v80)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v115), "w"(v122)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v115), "w"(v122)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v157), "w"(v164)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v157), "w"(v164)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v199), "w"(v206)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v199), "w"(v206)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v241), "w"(v248)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v241), "w"(v248)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v442) : "w"(v283), "w"(v290)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v283), "w"(v290)); - svfloat32_t v444; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v325), "w"(v332)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v325), "w"(v332)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v367), "w"(v374)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v367), "w"(v374)); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v409), "w"(v416)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v409), "w"(v416)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v424), "w"(v426)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v424), "w"(v426)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v430), "w"(v432)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v430), "w"(v432)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v436), "w"(v438)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v436), "w"(v438)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v442), "w"(v444)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v442), "w"(v444)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v448), "w"(v450)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v448), "w"(v450)); - svfloat32_t v560; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v431), "w"(v449)); - svfloat32_t v561; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v431), "w"(v449)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v443), "w"(v437)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v443), "w"(v437)); - svfloat32_t v613; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v433), "w"(v451)); - svfloat32_t v614; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v433), "w"(v451)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v445), "w"(v439)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v445), "w"(v439)); - svfloat32_t v454; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v434), "w"(v452)); - svfloat32_t v455; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v434), "w"(v452)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v446), "w"(v440)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v446), "w"(v440)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v435), "w"(v453)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v435), "w"(v453)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v447), "w"(v441)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v447), "w"(v441)); - svfloat32_t v564; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v560), "w"(v562)); - svfloat32_t v565; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v560), "w"(v562)); - svfloat32_t v566; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v561), "w"(v563)); - svfloat32_t zero589; - asm volatile("mov %0.s, #0" : "=w"(zero589)); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v998, v38); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v998, v38); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v241, 
v248); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v509 = 
svadd_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v561, v563); + svfloat32_t zero589 = svdup_n_f32(0); svfloat32_t v589 = svcmla_f32_x(pred_full, zero589, v1014, v561, 90); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v613), "w"(v615)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v613), "w"(v615)); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v614), "w"(v616)); - svfloat32_t v656; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v656) : "w"(v616), "w"(v1022)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v454), "w"(v456)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v454), "w"(v456)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v455), "w"(v457)); - svfloat32_t zero483; - asm volatile("mov %0.s, #0" : "=w"(zero483)); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v614, v616); + svfloat32_t v656 = svmul_f32_x(svptrue_b32(), v616, v1022); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v455, v457); + svfloat32_t zero483 = svdup_n_f32(0); svfloat32_t v483 = svcmla_f32_x(pred_full, zero483, v1014, v455, 90); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v507), "w"(v509)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v507), "w"(v509)); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v508), "w"(v510)); - svfloat32_t zero536; - asm volatile("mov %0.s, #0" : "=w"(zero536)); + svfloat32_t v511 = 
svadd_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v508, v510); + svfloat32_t zero536 = svdup_n_f32(0); svfloat32_t v536 = svcmla_f32_x(pred_full, zero536, v1014, v508, 90); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v564), "w"(v425)); - svfloat32_t zero596; - asm volatile("mov %0.s, #0" : "=w"(zero596)); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v564, v425); + svfloat32_t zero596 = svdup_n_f32(0); svfloat32_t v596 = svcmla_f32_x(pred_full, zero596, v1015, v566, 90); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v617), "w"(v427)); - svfloat32_t zero641; - asm volatile("mov %0.s, #0" : "=w"(zero641)); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v617, v427); + svfloat32_t zero641 = svdup_n_f32(0); svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1019, v618, 90); - svfloat32_t v651; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v651) : "w"(v619), "w"(v1021)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v458), "w"(v428)); - svfloat32_t zero490; - asm volatile("mov %0.s, #0" : "=w"(zero490)); + svfloat32_t v651 = svmul_f32_x(svptrue_b32(), v619, v1021); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v458, v428); + svfloat32_t zero490 = svdup_n_f32(0); svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v1015, v460, 90); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v511), "w"(v429)); - svfloat32_t zero543; - asm volatile("mov %0.s, #0" : "=w"(zero543)); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v511, v429); + svfloat32_t zero543 = svdup_n_f32(0); svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1015, v513, 90); svfloat32_t v604 = svmla_f32_x(pred_full, v567, v564, v1012); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v589), "w"(v596)); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v589, v596); svfloat32_t v608 = svcmla_f32_x(pred_full, 
v596, v1016, v563, 90); - svfloat32_t zero627; - asm volatile("mov %0.s, #0" : "=w"(zero627)); + svfloat32_t zero627 = svdup_n_f32(0); svfloat32_t v627 = svcmla_f32_x(pred_full, zero627, v1017, v620, 90); svfloat32_t v660 = svnmls_f32_x(pred_full, v651, v614, v1020); svfloat32_t v661 = svmla_f32_x(pred_full, v656, v619, v1021); svfloat32_t v498 = svmla_f32_x(pred_full, v461, v458, v1012); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v483), "w"(v490)); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v483, v490); svfloat32_t v502 = svcmla_f32_x(pred_full, v490, v1016, v457, 90); svfloat32_t v551 = svmla_f32_x(pred_full, v514, v511, v1012); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v536), "w"(v543)); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v536, v543); svfloat32_t v555 = svcmla_f32_x(pred_full, v543, v1016, v510, 90); svfloat32_t v605 = svmla_f32_x(pred_full, v604, v565, v1013); svfloat32_t v606 = svmls_f32_x(pred_full, v604, v565, v1013); svfloat32_t v657 = svcmla_f32_x(pred_full, v627, v1018, v617, 90); - svfloat32_t v666; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v567), "w"(v627)); - svfloat32_t v667; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v567), "w"(v627)); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v567, v627); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v567, v627); svst1_scatter_s64index_f64(pred_full, (double *)(v1030), v1202, svreinterpret_f64_f32(v461)); svst1_scatter_s64index_f64(pred_full, (double *)(v1048), v1202, @@ -11428,62 +10094,36 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs20(const armral_cmplx_f32_t *restrict x, svfloat32_t v500 = svmls_f32_x(pred_full, v498, v459, v1013); svfloat32_t v552 = svmla_f32_x(pred_full, v551, v512, v1013); svfloat32_t v553 = svmls_f32_x(pred_full, v551, v512, v1013); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v605), "w"(v607)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v605), "w"(v607)); 
- svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v606), "w"(v608)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v606), "w"(v608)); - svfloat32_t v658; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v657), "w"(v641)); - svfloat32_t v659; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v657), "w"(v641)); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v657, v641); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v657, v641); svst1_scatter_s64index_f64(pred_full, (double *)(v1039), v1202, svreinterpret_f64_f32(v667)); svst1_scatter_s64index_f64(pred_full, (double *)(v1057), v1202, svreinterpret_f64_f32(v666)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v499), "w"(v501)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v499), "w"(v501)); - svfloat32_t v505; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v500), "w"(v502)); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v500), "w"(v502)); - svfloat32_t v556; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v552), "w"(v554)); - svfloat32_t v557; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v552), "w"(v554)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v553), "w"(v555)); - svfloat32_t v559; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v553), "w"(v555)); - svfloat32_t v662; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v658), "w"(v660)); - svfloat32_t v663; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v658), "w"(v660)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v659), "w"(v661)); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v659), "w"(v661)); - svfloat32_t v696; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v696) : "w"(v610), "w"(v663)); - svfloat32_t v697; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v697) : "w"(v610), "w"(v663)); - svfloat32_t v726; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v726) : "w"(v612), "w"(v665)); - svfloat32_t v727; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v612), "w"(v665)); - svfloat32_t v756; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v756) : "w"(v611), "w"(v664)); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v611), "w"(v664)); - svfloat32_t v786; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v609), "w"(v662)); - svfloat32_t v787; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v787) : "w"(v609), "w"(v662)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v663 = svsub_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v659, v661); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v659, v661); + svfloat32_t v696 = svadd_f32_x(svptrue_b32(), v610, v663); + svfloat32_t v697 = svsub_f32_x(svptrue_b32(), v610, v663); + svfloat32_t v726 = svadd_f32_x(svptrue_b32(), v612, v665); + svfloat32_t v727 = svsub_f32_x(svptrue_b32(), v612, v665); + svfloat32_t v756 = svadd_f32_x(svptrue_b32(), v611, v664); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v611, v664); + svfloat32_t v786 = svadd_f32_x(svptrue_b32(), v609, v662); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v609, v662); svst1_scatter_s64index_f64(pred_full, (double *)(v1066), v1202, svreinterpret_f64_f32(v504)); 
svst1_scatter_s64index_f64(pred_full, (double *)(v1084), v1202, @@ -12250,8 +10890,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs21(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v267])); svfloat32_t v282 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v281])); - svfloat32_t zero318; - asm volatile("mov %0.s, #0" : "=w"(zero318)); + svfloat32_t zero318 = svdup_n_f32(0); svfloat32_t v318 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero318, v995, v317, 0), v995, v317, 90); @@ -12303,101 +10942,73 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs21(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1031), v1051)); svfloat32_t v1042 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1040), v1051)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v869, v51, 0), v869, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v878, v58, 0), v878, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v887, v93, 0), v887, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v896, v100, 0), v896, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v914, v149, 0), v914, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = 
svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v923, v156, 0), v923, v156, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v941, v205, 0), v941, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v950, v212, 0), v950, v212, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v968, v261, 0), v968, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v977, v268, 0), v977, v268, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1005, v324, 0), v1005, v324, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1024, v373, 0), v1024, v373, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero381, v1033, v380, 0), v1033, v380, 90); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v52), "w"(v59)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v52), "w"(v59)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v94), "w"(v101)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : 
"w"(v94), "w"(v101)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v150), "w"(v157)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v150), "w"(v157)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v206), "w"(v213)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v206), "w"(v213)); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v262), "w"(v269)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v262), "w"(v269)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v318), "w"(v325)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v318), "w"(v325)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v374), "w"(v381)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v374), "w"(v381)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v396), "w"(v1052)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v1052); svfloat32_t v408 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v406, v905, v114, 0), v905, 
v114, 90); @@ -12416,173 +11027,98 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs21(const armral_cmplx_f32_t *restrict x, svfloat32_t v423 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v421, v1042, v394, 0), v1042, v394, 90); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v406), "w"(v421)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v406), "w"(v421)); - svfloat32_t v515; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v415), "w"(v412)); - svfloat32_t v516; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v415), "w"(v412)); - svfloat32_t v517; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v409), "w"(v418)); - svfloat32_t v518; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v409), "w"(v418)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v407), "w"(v422)); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v407), "w"(v422)); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v416), "w"(v413)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v416), "w"(v413)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v410), "w"(v419)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v410), "w"(v419)); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v408), "w"(v423)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v408), "w"(v423)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v417), "w"(v414)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v417), "w"(v414)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v411), "w"(v420)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v411), "w"(v420)); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v513), "w"(v515)); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v513), "w"(v515)); - svfloat32_t 
v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v515), "w"(v517)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v517), "w"(v513)); - svfloat32_t v525; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v514), "w"(v516)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v514), "w"(v516)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v516), "w"(v518)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v518), "w"(v514)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v602), "w"(v604)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v602), "w"(v604)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v604), "w"(v606)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v606), "w"(v602)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v603), "w"(v605)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v603), "w"(v605)); - svfloat32_t v617; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v605), "w"(v607)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v607), "w"(v603)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v424), "w"(v426)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v424), "w"(v426)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v426), "w"(v428)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v428), "w"(v424)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v425), "w"(v427)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v425), "w"(v427)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v427), "w"(v429)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v429), "w"(v425)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v519), 
"w"(v517)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v525), "w"(v518)); - svfloat32_t zero568; - asm volatile("mov %0.s, #0" : "=w"(zero568)); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v515, v517); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v517, v513); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v516, v518); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v518, v514); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v604, v606); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v606, 
v602); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v607, v603); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v428, v424); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v429, v425); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v517); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v525, v518); + svfloat32_t zero568 = svdup_n_f32(0); svfloat32_t v568 = svcmla_f32_x(pred_full, zero568, v1068, v527, 90); - svfloat32_t zero575; - asm volatile("mov %0.s, #0" : "=w"(zero575)); + svfloat32_t zero575 = svdup_n_f32(0); svfloat32_t v575 = svcmla_f32_x(pred_full, zero575, v1069, v528, 90); - svfloat32_t zero582; - asm volatile("mov %0.s, #0" : "=w"(zero582)); + svfloat32_t zero582 = svdup_n_f32(0); svfloat32_t v582 = svcmla_f32_x(pred_full, zero582, v1070, v529, 90); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v606)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v614), "w"(v607)); - svfloat32_t zero639; - asm volatile("mov %0.s, #0" : "=w"(zero639)); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v606); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v614, v607); + svfloat32_t zero639 = svdup_n_f32(0); svfloat32_t v639 = svcmla_f32_x(pred_full, zero639, v1073, v611, 90); - svfloat32_t zero646; - asm volatile("mov %0.s, #0" : "=w"(zero646)); + svfloat32_t zero646 = svdup_n_f32(0); svfloat32_t v646 = svcmla_f32_x(pred_full, zero646, v1074, v612, 90); - 
svfloat32_t zero653; - asm volatile("mov %0.s, #0" : "=w"(zero653)); + svfloat32_t zero653 = svdup_n_f32(0); svfloat32_t v653 = svcmla_f32_x(pred_full, zero653, v1075, v613, 90); - svfloat32_t v663; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v616), "w"(v1077)); - svfloat32_t v668; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v617), "w"(v1078)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v428)); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v436), "w"(v429)); - svfloat32_t zero479; - asm volatile("mov %0.s, #0" : "=w"(zero479)); + svfloat32_t v663 = svmul_f32_x(svptrue_b32(), v616, v1077); + svfloat32_t v668 = svmul_f32_x(svptrue_b32(), v617, v1078); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v428); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v436, v429); + svfloat32_t zero479 = svdup_n_f32(0); svfloat32_t v479 = svcmla_f32_x(pred_full, zero479, v1059, v438, 90); - svfloat32_t zero486; - asm volatile("mov %0.s, #0" : "=w"(zero486)); + svfloat32_t zero486 = svdup_n_f32(0); svfloat32_t v486 = svcmla_f32_x(pred_full, zero486, v1060, v439, 90); - svfloat32_t zero493; - asm volatile("mov %0.s, #0" : "=w"(zero493)); + svfloat32_t zero493 = svdup_n_f32(0); svfloat32_t v493 = svcmla_f32_x(pred_full, zero493, v1061, v440, 90); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v396)); - svfloat32_t v539; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v520), "w"(v1063)); - svfloat32_t zero561; - asm volatile("mov %0.s, #0" : "=w"(zero561)); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v396); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v520, v1063); + svfloat32_t zero561 = svdup_n_f32(0); svfloat32_t v561 = svcmla_f32_x(pred_full, zero561, v1067, v526, 90); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v609), "w"(v397)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v431), "w"(v405)); - svfloat32_t 
zero472; - asm volatile("mov %0.s, #0" : "=w"(zero472)); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v609, v397); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v405); + svfloat32_t zero472 = svdup_n_f32(0); svfloat32_t v472 = svcmla_f32_x(pred_full, zero472, v1058, v437, 90); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v561), "w"(v568)); - svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v561), "w"(v568)); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v561), "w"(v575)); - svfloat32_t zero625; - asm volatile("mov %0.s, #0" : "=w"(zero625)); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v561, v575); + svfloat32_t zero625 = svdup_n_f32(0); svfloat32_t v625 = svcmla_f32_x(pred_full, zero625, v1071, v610, 90); svfloat32_t v681 = svmla_f32_x(pred_full, v663, v615, v1076); svfloat32_t v683 = svnmls_f32_x(pred_full, v663, v615, v1076); svfloat32_t v685 = svnmls_f32_x(pred_full, v668, v615, v1076); svfloat32_t v494 = svmla_f32_x(pred_full, v432, v431, v1054); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v472), "w"(v479)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v472), "w"(v479)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v472), "w"(v486)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v472, v486); svfloat32_t v583 = svmla_f32_x(pred_full, v539, v521, v1062); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v590), "w"(v575)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v592), "w"(v582)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v594), "w"(v582)); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, 
v575); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v582); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v582); svfloat32_t v674 = svcmla_f32_x(pred_full, v625, v1072, v609, 90); svfloat32_t v682 = svmla_f32_x(pred_full, v681, v617, v1078); svfloat32_t v684 = svmls_f32_x(pred_full, v683, v618, v1079); @@ -12593,89 +11129,54 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs21(const armral_cmplx_f32_t *restrict x, svfloat32_t v495 = svmla_f32_x(pred_full, v494, v433, v1055); svfloat32_t v497 = svmls_f32_x(pred_full, v494, v433, v1055); svfloat32_t v499 = svmls_f32_x(pred_full, v494, v434, v1056); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v501), "w"(v486)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v493)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v493)); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v501, v486); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v503, v493); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v505, v493); svfloat32_t v584 = svmla_f32_x(pred_full, v583, v522, v1064); svfloat32_t v586 = svmls_f32_x(pred_full, v583, v522, v1064); svfloat32_t v588 = svmls_f32_x(pred_full, v583, v523, v1065); - svfloat32_t v675; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v674), "w"(v639)); - svfloat32_t v677; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v674), "w"(v639)); - svfloat32_t v679; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v674), "w"(v646)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v693), "w"(v625)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v693), "w"(v625)); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v679 = svsub_f32_x(svptrue_b32(), v674, v646); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v693, v625); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v693, v625); 
svfloat32_t v496 = svmla_f32_x(pred_full, v495, v434, v1056); svfloat32_t v498 = svmls_f32_x(pred_full, v497, v435, v1057); svfloat32_t v500 = svmla_f32_x(pred_full, v499, v435, v1057); svfloat32_t v585 = svmla_f32_x(pred_full, v584, v523, v1065); svfloat32_t v587 = svmls_f32_x(pred_full, v586, v524, v1066); svfloat32_t v589 = svmla_f32_x(pred_full, v588, v524, v1066); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v675), "w"(v646)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v677), "w"(v653)); - svfloat32_t v680; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v679), "w"(v653)); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v646); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v677, v653); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v653); svst1_scatter_s64index_f64(pred_full, (double *)(v1096), v1268, svreinterpret_f64_f32(v695)); svst1_scatter_s64index_f64(pred_full, (double *)(v1105), v1268, svreinterpret_f64_f32(v694)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v496), "w"(v502)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v496), "w"(v502)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v498), "w"(v504)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v498), "w"(v504)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v500), "w"(v506)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v500), "w"(v506)); - svfloat32_t v596; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v585), "w"(v591)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v585), "w"(v591)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v587), "w"(v593)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v587), "w"(v593)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v589), "w"(v595)); - svfloat32_t v601; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v589), "w"(v595)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v676), "w"(v682)); - svfloat32_t v688; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v676), "w"(v682)); - svfloat32_t v689; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v678), "w"(v684)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v684)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v680), "w"(v686)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v680), "w"(v686)); - svfloat32_t v717; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v508), "w"(v597)); - svfloat32_t v741; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v741) : "w"(v510), "w"(v599)); - svfloat32_t v765; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v511), "w"(v600)); - svfloat32_t v789; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v789) : "w"(v512), "w"(v601)); - svfloat32_t v813; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v813) : "w"(v509), "w"(v598)); - svfloat32_t v837; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v837) : "w"(v507), "w"(v596)); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v676, v682); + svfloat32_t v688 = svsub_f32_x(svptrue_b32(), v676, v682); + svfloat32_t v689 = 
svadd_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v680, v686); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v680, v686); + svfloat32_t v717 = svadd_f32_x(svptrue_b32(), v508, v597); + svfloat32_t v741 = svadd_f32_x(svptrue_b32(), v510, v599); + svfloat32_t v765 = svadd_f32_x(svptrue_b32(), v511, v600); + svfloat32_t v789 = svadd_f32_x(svptrue_b32(), v512, v601); + svfloat32_t v813 = svadd_f32_x(svptrue_b32(), v509, v598); + svfloat32_t v837 = svadd_f32_x(svptrue_b32(), v507, v596); svst1_scatter_s64index_f64(pred_full, (double *)(v1114), v1268, svreinterpret_f64_f32(v508)); svst1_scatter_s64index_f64(pred_full, (double *)(v1141), v1268, @@ -12688,30 +11189,18 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs21(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v509)); svst1_scatter_s64index_f64(pred_full, (double *)(v1249), v1268, svreinterpret_f64_f32(v507)); - svfloat32_t v718; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v717), "w"(v688)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v717), "w"(v688)); - svfloat32_t v742; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v742) : "w"(v741), "w"(v690)); - svfloat32_t v743; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v743) : "w"(v741), "w"(v690)); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v765), "w"(v691)); - svfloat32_t v767; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v765), "w"(v691)); - svfloat32_t v790; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v790) : "w"(v789), "w"(v692)); - svfloat32_t v791; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v791) : "w"(v789), "w"(v692)); - svfloat32_t v814; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v814) : "w"(v813), "w"(v689)); - svfloat32_t v815; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v815) : "w"(v813), "w"(v689)); - svfloat32_t v838; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v837), "w"(v687)); - svfloat32_t v839; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v839) : "w"(v837), "w"(v687)); + svfloat32_t v718 = svadd_f32_x(svptrue_b32(), v717, v688); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v717, v688); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v741, v690); + svfloat32_t v743 = svsub_f32_x(svptrue_b32(), v741, v690); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v765, v691); + svfloat32_t v767 = svsub_f32_x(svptrue_b32(), v765, v691); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v789, v692); + svfloat32_t v791 = svsub_f32_x(svptrue_b32(), v789, v692); + svfloat32_t v814 = svadd_f32_x(svptrue_b32(), v813, v689); + svfloat32_t v815 = svsub_f32_x(svptrue_b32(), v813, v689); + svfloat32_t v838 = svadd_f32_x(svptrue_b32(), v837, v687); + svfloat32_t v839 = svsub_f32_x(svptrue_b32(), v837, v687); svst1_scatter_s64index_f64(pred_full, (double *)(v1123), v1268, svreinterpret_f64_f32(v719)); svst1_scatter_s64index_f64(pred_full, (double *)(v1132), v1268, @@ -13510,8 +11999,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs22(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v246])); svfloat32_t v282 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v281])); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero290, v1176, v289, 0), v1176, v289, 90); @@ -13571,340 +12059,208 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs22(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1239), v1259)); svfloat32_t v1250 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1248), v1259)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1068, v37, 0), v1068, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = 
svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1077, v72, 0), v1077, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v1086, v79, 0), v1086, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero115, v1095, v114, 0), v1095, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero122, v1104, v121, 0), v1104, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero157, v1113, v156, 0), v1113, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero164, v1122, v163, 0), v1122, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero199, v1131, v198, 0), v1131, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero206, v1140, v205, 0), v1140, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero241, v1149, v240, 0), v1149, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = 
svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero248, v1158, v247, 0), v1158, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero283, v1167, v282, 0), v1167, v282, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1187, v324, 0), v1187, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero332, v1196, v331, 0), v1196, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero367, v1205, v366, 0), v1205, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1214, v373, 0), v1214, v373, 90); - svfloat32_t zero409; - asm volatile("mov %0.s, #0" : "=w"(zero409)); + svfloat32_t zero409 = svdup_n_f32(0); svfloat32_t v409 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero409, v1223, v408, 0), v1223, v408, 90); - svfloat32_t zero416; - asm volatile("mov %0.s, #0" : "=w"(zero416)); + svfloat32_t zero416 = svdup_n_f32(0); svfloat32_t v416 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero416, v1232, v415, 0), v1232, v415, 90); - svfloat32_t zero451; - asm volatile("mov %0.s, #0" : "=w"(zero451)); + svfloat32_t zero451 = svdup_n_f32(0); svfloat32_t v451 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero451, v1241, v450, 0), v1241, v450, 90); - svfloat32_t zero458; - asm volatile("mov %0.s, #0" : "=w"(zero458)); + svfloat32_t 
zero458 = svdup_n_f32(0); svfloat32_t v458 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero458, v1250, v457, 0), v1250, v457, 90); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v1260), "w"(v38)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v1260), "w"(v38)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v73), "w"(v80)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v73), "w"(v80)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v115), "w"(v122)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v115), "w"(v122)); - svfloat32_t v472; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v157), "w"(v164)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v157), "w"(v164)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v199), "w"(v206)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v199), "w"(v206)); - svfloat32_t v476; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v241), "w"(v248)); - svfloat32_t v477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v241), "w"(v248)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v283), "w"(v290)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v283), "w"(v290)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v325), "w"(v332)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v325), "w"(v332)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v367), "w"(v374)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v367), "w"(v374)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v409), "w"(v416)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v409), "w"(v416)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v451), "w"(v458)); - svfloat32_t 
v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v451), "w"(v458)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v468), "w"(v486)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v470), "w"(v484)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v472), "w"(v482)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v474), "w"(v480)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v476), "w"(v478)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v468), "w"(v486)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v470), "w"(v484)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v472), "w"(v482)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v474), "w"(v480)); - svfloat32_t v497; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v476), "w"(v478)); - svfloat32_t v697; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v697) : "w"(v469), "w"(v487)); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v471), "w"(v485)); - svfloat32_t v699; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v473), "w"(v483)); - svfloat32_t v700; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v475), "w"(v481)); - svfloat32_t v701; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v477), "w"(v479)); - svfloat32_t v702; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v469), "w"(v487)); - svfloat32_t v703; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v471), "w"(v485)); - svfloat32_t v704; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v473), "w"(v483)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v475), "w"(v481)); - svfloat32_t v706; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v477), "w"(v479)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v488), "w"(v489)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v490), 
"w"(v492)); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v494), "w"(v495)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v493), "w"(v497)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v489), "w"(v491)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v488), "w"(v491)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v489), "w"(v488)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v492), "w"(v491)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v490), "w"(v491)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v492), "w"(v490)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v489), "w"(v492)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v488), "w"(v490)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v494), "w"(v496)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v493), "w"(v496)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v493), "w"(v494)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v496), "w"(v497)); - svfloat32_t v520; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v495), "w"(v496)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v495), "w"(v497)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v494), "w"(v497)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v493), "w"(v495)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v697), "w"(v698)); - svfloat32_t v708; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v699), "w"(v701)); - svfloat32_t v710; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v710) : "w"(v703), "w"(v704)); - svfloat32_t v711; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v702), "w"(v706)); - svfloat32_t v716; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v716) : "w"(v698), "w"(v700)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v697), "w"(v700)); - svfloat32_t v718; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v698), "w"(v697)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v701), "w"(v700)); - svfloat32_t v720; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v720) : "w"(v699), "w"(v700)); - svfloat32_t v721; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v701), "w"(v699)); - svfloat32_t v722; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v698), "w"(v701)); - svfloat32_t v723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v697), "w"(v699)); - svfloat32_t v725; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v725) : "w"(v703), "w"(v705)); - svfloat32_t v726; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v726) : "w"(v702), "w"(v705)); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v702), "w"(v703)); - svfloat32_t v728; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v728) : "w"(v705), "w"(v706)); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v704), "w"(v705)); - svfloat32_t v730; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v704), "w"(v706)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v703), "w"(v706)); - svfloat32_t v732; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v702), "w"(v704)); - svfloat32_t v500; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v491), "w"(v498)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v501), "w"(v502)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v499), "w"(v498)); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v501), "w"(v502)); - svfloat32_t v551; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v508), "w"(v1286)); - svfloat32_t v556; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v509), "w"(v1287)); - svfloat32_t v566; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v511), "w"(v1289)); - svfloat32_t v571; - 
asm("fmul %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v512), "w"(v1290)); - svfloat32_t zero593; - asm volatile("mov %0.s, #0" : "=w"(zero593)); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v1260, v38); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v1260, v38); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v496 = 
svsub_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v497 = svsub_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v697 = svadd_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v703 = svsub_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v706 = svsub_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v490, v492); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v494, v495); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v493, v497); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v489, v491); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v488, v491); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v489, v488); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v492, v491); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v492, v490); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v489, v492); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v494, v496); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v493, v496); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v496, v497); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v495, v496); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v495, v497); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v494, v497); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v493, v495); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v697, v698); + 
svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v710 = svsub_f32_x(svptrue_b32(), v703, v704); + svfloat32_t v711 = svadd_f32_x(svptrue_b32(), v702, v706); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v697, v700); + svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v698, v697); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v701, v700); + svfloat32_t v720 = svsub_f32_x(svptrue_b32(), v699, v700); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v701, v699); + svfloat32_t v722 = svsub_f32_x(svptrue_b32(), v698, v701); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v725 = svadd_f32_x(svptrue_b32(), v703, v705); + svfloat32_t v726 = svsub_f32_x(svptrue_b32(), v702, v705); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v702, v703); + svfloat32_t v728 = svsub_f32_x(svptrue_b32(), v705, v706); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v704, v705); + svfloat32_t v730 = svsub_f32_x(svptrue_b32(), v704, v706); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v703, v706); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v702, v704); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v491, v498); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v499, v498); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v551 = svmul_f32_x(svptrue_b32(), v508, v1286); + svfloat32_t v556 = svmul_f32_x(svptrue_b32(), v509, v1287); + svfloat32_t v566 = svmul_f32_x(svptrue_b32(), v511, v1289); + svfloat32_t v571 = svmul_f32_x(svptrue_b32(), v512, v1290); + svfloat32_t zero593 = svdup_n_f32(0); svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v1294, v516, 90); - svfloat32_t zero607; - asm volatile("mov %0.s, #0" : "=w"(zero607)); + svfloat32_t zero607 = svdup_n_f32(0); svfloat32_t v607 = svcmla_f32_x(pred_full, zero607, v1296, v518, 90); - svfloat32_t zero614; - asm volatile("mov %0.s, #0" : 
"=w"(zero614)); + svfloat32_t zero614 = svdup_n_f32(0); svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1297, v519, 90); - svfloat32_t zero628; - asm volatile("mov %0.s, #0" : "=w"(zero628)); + svfloat32_t zero628 = svdup_n_f32(0); svfloat32_t v628 = svcmla_f32_x(pred_full, zero628, v1299, v521, 90); - svfloat32_t zero635; - asm volatile("mov %0.s, #0" : "=w"(zero635)); + svfloat32_t zero635 = svdup_n_f32(0); svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1300, v522, 90); - svfloat32_t v709; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v700), "w"(v707)); - svfloat32_t v714; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v710), "w"(v711)); - svfloat32_t v724; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v724) : "w"(v708), "w"(v707)); - svfloat32_t v733; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v733) : "w"(v710), "w"(v711)); - svfloat32_t v760; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v717), "w"(v1286)); - svfloat32_t v765; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v718), "w"(v1287)); - svfloat32_t v775; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v720), "w"(v1289)); - svfloat32_t v780; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v780) : "w"(v721), "w"(v1290)); - svfloat32_t zero802; - asm volatile("mov %0.s, #0" : "=w"(zero802)); + svfloat32_t v709 = svadd_f32_x(svptrue_b32(), v700, v707); + svfloat32_t v714 = svsub_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v724 = svsub_f32_x(svptrue_b32(), v708, v707); + svfloat32_t v733 = svadd_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v760 = svmul_f32_x(svptrue_b32(), v717, v1286); + svfloat32_t v765 = svmul_f32_x(svptrue_b32(), v718, v1287); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v720, v1289); + svfloat32_t v780 = svmul_f32_x(svptrue_b32(), v721, v1290); + svfloat32_t zero802 = svdup_n_f32(0); svfloat32_t v802 = svcmla_f32_x(pred_full, zero802, v1294, v725, 90); - svfloat32_t zero816; - asm volatile("mov %0.s, #0" : "=w"(zero816)); + svfloat32_t zero816 = svdup_n_f32(0); svfloat32_t v816 = 
svcmla_f32_x(pred_full, zero816, v1296, v727, 90); - svfloat32_t zero823; - asm volatile("mov %0.s, #0" : "=w"(zero823)); + svfloat32_t zero823 = svdup_n_f32(0); svfloat32_t v823 = svcmla_f32_x(pred_full, zero823, v1297, v728, 90); - svfloat32_t zero837; - asm volatile("mov %0.s, #0" : "=w"(zero837)); + svfloat32_t zero837 = svdup_n_f32(0); svfloat32_t v837 = svcmla_f32_x(pred_full, zero837, v1299, v730, 90); - svfloat32_t zero844; - asm volatile("mov %0.s, #0" : "=w"(zero844)); + svfloat32_t zero844 = svdup_n_f32(0); svfloat32_t v844 = svcmla_f32_x(pred_full, zero844, v1300, v731, 90); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v500), "w"(v499)); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v496)); - svfloat32_t v586; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v515), "w"(v1293)); - svfloat32_t zero649; - asm volatile("mov %0.s, #0" : "=w"(zero649)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v500, v499); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v505, v496); + svfloat32_t v586 = svmul_f32_x(svptrue_b32(), v515, v1293); + svfloat32_t zero649 = svdup_n_f32(0); svfloat32_t v649 = svcmla_f32_x(pred_full, zero649, v1302, v524, 90); svfloat32_t v651 = svmla_f32_x(pred_full, v551, v507, v1285); svfloat32_t v652 = svmla_f32_x(pred_full, v556, v508, v1286); @@ -13913,19 +12269,13 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs22(const armral_cmplx_f32_t *restrict x, svfloat32_t v655 = svmla_f32_x(pred_full, v571, v511, v1289); svfloat32_t v656 = svnmls_f32_x(pred_full, v571, v510, v1288); svfloat32_t v659 = svcmla_f32_x(pred_full, v607, v1295, v517, 90); - svfloat32_t v660; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v593), "w"(v607)); + svfloat32_t v660 = svsub_f32_x(svptrue_b32(), v593, v607); svfloat32_t v661 = svcmla_f32_x(pred_full, v628, v1298, v520, 90); - svfloat32_t v662; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v614), "w"(v628)); - svfloat32_t v712; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v712) : "w"(v709), "w"(v708)); - svfloat32_t v715; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v715) : "w"(v714), "w"(v705)); - svfloat32_t v795; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v724), "w"(v1293)); - svfloat32_t zero858; - asm volatile("mov %0.s, #0" : "=w"(zero858)); + svfloat32_t v662 = svsub_f32_x(svptrue_b32(), v614, v628); + svfloat32_t v712 = svadd_f32_x(svptrue_b32(), v709, v708); + svfloat32_t v715 = svsub_f32_x(svptrue_b32(), v714, v705); + svfloat32_t v795 = svmul_f32_x(svptrue_b32(), v724, v1293); + svfloat32_t zero858 = svdup_n_f32(0); svfloat32_t v858 = svcmla_f32_x(pred_full, zero858, v1302, v733, 90); svfloat32_t v860 = svmla_f32_x(pred_full, v760, v716, v1285); svfloat32_t v861 = svmla_f32_x(pred_full, v765, v717, v1286); @@ -13934,165 +12284,93 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs22(const armral_cmplx_f32_t *restrict x, svfloat32_t v864 = svmla_f32_x(pred_full, v780, v720, v1289); svfloat32_t v865 = svnmls_f32_x(pred_full, v780, v719, v1288); svfloat32_t v868 = svcmla_f32_x(pred_full, v816, v1295, v726, 90); - svfloat32_t v869; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v869) : "w"(v802), "w"(v816)); + svfloat32_t v869 = svsub_f32_x(svptrue_b32(), v802, v816); svfloat32_t v870 = svcmla_f32_x(pred_full, v837, v1298, v729, 90); - svfloat32_t v871; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v871) : "w"(v823), "w"(v837)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v466), "w"(v503)); - svfloat32_t zero541; - asm volatile("mov %0.s, #0" : "=w"(zero541)); + svfloat32_t v871 = svsub_f32_x(svptrue_b32(), v823, v837); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v466, v503); + svfloat32_t zero541 = svdup_n_f32(0); svfloat32_t v541 = svcmla_f32_x(pred_full, zero541, v1284, v506, 90); svfloat32_t v657 = svmla_f32_x(pred_full, v586, v514, v1292); svfloat32_t v658 = svmla_f32_x(pred_full, v586, v513, v1291); svfloat32_t v663 = svcmla_f32_x(pred_full, v649, v1301, v523, 90); - svfloat32_t v664; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v664) : "w"(v635), "w"(v649)); - svfloat32_t v683; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v659), "w"(v660)); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v467), "w"(v712)); - svfloat32_t zero750; - asm volatile("mov %0.s, #0" : "=w"(zero750)); + svfloat32_t v664 = svsub_f32_x(svptrue_b32(), v635, v649); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v659, v660); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v467, v712); + svfloat32_t zero750 = svdup_n_f32(0); svfloat32_t v750 = svcmla_f32_x(pred_full, zero750, v1284, v715, 90); svfloat32_t v866 = svmla_f32_x(pred_full, v795, v723, v1292); svfloat32_t v867 = svmla_f32_x(pred_full, v795, v722, v1291); svfloat32_t v872 = svcmla_f32_x(pred_full, v858, v1301, v732, 90); - svfloat32_t v873; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v844), "w"(v858)); - svfloat32_t v892; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v892) : "w"(v868), "w"(v869)); + svfloat32_t v873 = svsub_f32_x(svptrue_b32(), v844, v858); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v868, v869); svfloat32_t v650 = svmls_f32_x(pred_full, v504, v503, v1283); - svfloat32_t v665; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v655), "w"(v657)); - svfloat32_t v675; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v541), "w"(v661)); - svfloat32_t v677; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v663), "w"(v659)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v541), "w"(v664)); - svfloat32_t v681; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v664), "w"(v660)); - svfloat32_t v684; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v684) : "w"(v683), "w"(v661)); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v655, v657); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v541, v661); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v663, v659); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v541, v664); + svfloat32_t v681 = svsub_f32_x(svptrue_b32(), v664, v660); + svfloat32_t v684 = 
svadd_f32_x(svptrue_b32(), v683, v661); svfloat32_t v859 = svmls_f32_x(pred_full, v713, v712, v1283); - svfloat32_t v874; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v874) : "w"(v864), "w"(v866)); - svfloat32_t v884; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v884) : "w"(v750), "w"(v870)); - svfloat32_t v886; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v886) : "w"(v872), "w"(v868)); - svfloat32_t v888; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v888) : "w"(v750), "w"(v873)); - svfloat32_t v890; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v890) : "w"(v873), "w"(v869)); - svfloat32_t v893; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v892), "w"(v870)); + svfloat32_t v874 = svadd_f32_x(svptrue_b32(), v864, v866); + svfloat32_t v884 = svadd_f32_x(svptrue_b32(), v750, v870); + svfloat32_t v886 = svsub_f32_x(svptrue_b32(), v872, v868); + svfloat32_t v888 = svadd_f32_x(svptrue_b32(), v750, v873); + svfloat32_t v890 = svsub_f32_x(svptrue_b32(), v873, v869); + svfloat32_t v893 = svadd_f32_x(svptrue_b32(), v892, v870); svst1_scatter_s64index_f64(pred_full, (double *)(v1310), v1500, svreinterpret_f64_f32(v504)); svst1_scatter_s64index_f64(pred_full, (double *)(v1319), v1500, svreinterpret_f64_f32(v713)); - svfloat32_t v666; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v665), "w"(v650)); - svfloat32_t v667; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v650), "w"(v652)); - svfloat32_t v669; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v650), "w"(v656)); - svfloat32_t v671; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v650), "w"(v653)); - svfloat32_t v673; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v650), "w"(v651)); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v675), "w"(v663)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v677), "w"(v541)); - svfloat32_t v680; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v679), "w"(v662)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v681), "w"(v541)); - svfloat32_t v685; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v685) : "w"(v684), "w"(v662)); - svfloat32_t v875; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v875) : "w"(v874), "w"(v859)); - svfloat32_t v876; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v876) : "w"(v859), "w"(v861)); - svfloat32_t v878; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v878) : "w"(v859), "w"(v865)); - svfloat32_t v880; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v880) : "w"(v859), "w"(v862)); - svfloat32_t v882; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v882) : "w"(v859), "w"(v860)); - svfloat32_t v885; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v885) : "w"(v884), "w"(v872)); - svfloat32_t v887; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v887) : "w"(v886), "w"(v750)); - svfloat32_t v889; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v889) : "w"(v888), "w"(v871)); - svfloat32_t v891; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v891) : "w"(v890), "w"(v750)); - svfloat32_t v894; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v893), "w"(v871)); - svfloat32_t v668; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v667), "w"(v657)); - svfloat32_t v670; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v670) : "w"(v669), "w"(v658)); - svfloat32_t v672; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v671), "w"(v658)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v673), "w"(v654)); - svfloat32_t v686; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v686) : "w"(v685), "w"(v541)); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v666), "w"(v676)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v666), "w"(v676)); - svfloat32_t v877; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v877) : "w"(v876), "w"(v866)); - svfloat32_t v879; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v879) : "w"(v878), "w"(v867)); - svfloat32_t v881; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v881) : "w"(v880), "w"(v867)); - svfloat32_t v883; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v883) : "w"(v882), "w"(v863)); - svfloat32_t v895; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v894), 
"w"(v750)); - svfloat32_t v897; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v897) : "w"(v875), "w"(v885)); - svfloat32_t v904; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v904) : "w"(v875), "w"(v885)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v674), "w"(v686)); - svfloat32_t v689; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v668), "w"(v678)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v670), "w"(v680)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v672), "w"(v682)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v672), "w"(v682)); - svfloat32_t v693; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v670), "w"(v680)); - svfloat32_t v694; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v668), "w"(v678)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v674), "w"(v686)); - svfloat32_t v896; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v896) : "w"(v883), "w"(v895)); - svfloat32_t v898; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v898) : "w"(v877), "w"(v887)); - svfloat32_t v899; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v899) : "w"(v879), "w"(v889)); - svfloat32_t v900; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v900) : "w"(v881), "w"(v891)); - svfloat32_t v901; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v901) : "w"(v881), "w"(v891)); - svfloat32_t v902; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v902) : "w"(v879), "w"(v889)); - svfloat32_t v903; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v903) : "w"(v877), "w"(v887)); - svfloat32_t v905; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v905) : "w"(v883), "w"(v895)); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v665, v650); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v650, v652); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v650, v656); + svfloat32_t v671 = svsub_f32_x(svptrue_b32(), v650, v653); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v650, v651); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v663); + svfloat32_t v678 = 
svsub_f32_x(svptrue_b32(), v677, v541); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v662); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v681, v541); + svfloat32_t v685 = svadd_f32_x(svptrue_b32(), v684, v662); + svfloat32_t v875 = svadd_f32_x(svptrue_b32(), v874, v859); + svfloat32_t v876 = svsub_f32_x(svptrue_b32(), v859, v861); + svfloat32_t v878 = svadd_f32_x(svptrue_b32(), v859, v865); + svfloat32_t v880 = svsub_f32_x(svptrue_b32(), v859, v862); + svfloat32_t v882 = svadd_f32_x(svptrue_b32(), v859, v860); + svfloat32_t v885 = svadd_f32_x(svptrue_b32(), v884, v872); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v886, v750); + svfloat32_t v889 = svadd_f32_x(svptrue_b32(), v888, v871); + svfloat32_t v891 = svsub_f32_x(svptrue_b32(), v890, v750); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v893, v871); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v667, v657); + svfloat32_t v670 = svadd_f32_x(svptrue_b32(), v669, v658); + svfloat32_t v672 = svsub_f32_x(svptrue_b32(), v671, v658); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v673, v654); + svfloat32_t v686 = svsub_f32_x(svptrue_b32(), v685, v541); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v877 = svsub_f32_x(svptrue_b32(), v876, v866); + svfloat32_t v879 = svadd_f32_x(svptrue_b32(), v878, v867); + svfloat32_t v881 = svsub_f32_x(svptrue_b32(), v880, v867); + svfloat32_t v883 = svsub_f32_x(svptrue_b32(), v882, v863); + svfloat32_t v895 = svsub_f32_x(svptrue_b32(), v894, v750); + svfloat32_t v897 = svadd_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v904 = svsub_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v672, v682); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v672, v682); + 
svfloat32_t v693 = svadd_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v694 = svsub_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v896 = svadd_f32_x(svptrue_b32(), v883, v895); + svfloat32_t v898 = svadd_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v899 = svsub_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v900 = svadd_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v901 = svsub_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v902 = svadd_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v903 = svsub_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v883, v895); svst1_scatter_s64index_f64(pred_full, (double *)(v1346), v1500, svreinterpret_f64_f32(v695)); svst1_scatter_s64index_f64(pred_full, (double *)(v1355), v1500, @@ -14785,8 +13063,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs24(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v169])); svfloat32_t v205 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v204])); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v984, v212, 0), v984, v212, 90); @@ -14860,115 +13137,83 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs24(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1092), v1112)); svfloat32_t v1103 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1101), v1112)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v903, v51, 0), v903, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v912, v58, 0), 
v912, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v921, v93, 0), v921, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v930, v100, 0), v930, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v948, v149, 0), v948, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v957, v156, 0), v957, v156, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v975, v205, 0), v975, v205, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, v1004, v261, 0), v1004, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1013, v268, 0), v1013, v268, 90); - svfloat32_t zero318; - asm volatile("mov %0.s, #0" : "=w"(zero318)); + svfloat32_t zero318 = svdup_n_f32(0); svfloat32_t v318 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero318, v1031, v317, 0), v1031, v317, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1040, v324, 0), v1040, v324, 
90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1058, v373, 0), v1058, v373, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero381, v1067, v380, 0), v1067, v380, 90); - svfloat32_t zero430; - asm volatile("mov %0.s, #0" : "=w"(zero430)); + svfloat32_t zero430 = svdup_n_f32(0); svfloat32_t v430 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero430, v1085, v429, 0), v1085, v429, 90); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero437, v1094, v436, 0), v1094, v436, 90); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v52), "w"(v59)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v52), "w"(v59)); - svfloat32_t v462; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v94), "w"(v101)); - svfloat32_t v463; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v94), "w"(v101)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v150), "w"(v157)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v150), "w"(v157)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v206), "w"(v213)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v206), "w"(v213)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v262), "w"(v269)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v262), "w"(v269)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v318), "w"(v325)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v318), "w"(v325)); - svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v477) : "w"(v374), "w"(v381)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v374), "w"(v381)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v430), "w"(v437)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v430), "w"(v437)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v452), "w"(v1113)); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v452, v1113); svfloat32_t v464 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v462, v939, v114, 0), v939, v114, 90); @@ -14990,227 +13235,132 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs24(const armral_cmplx_f32_t *restrict x, svfloat32_t v482 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v480, v1103, v450, 0), v1103, v450, 90); - svfloat32_t v555; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v452), "w"(v471)); - svfloat32_t v556; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v452), "w"(v471)); - svfloat32_t v557; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v465), 
"w"(v477)); - svfloat32_t v558; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v465), "w"(v477)); - svfloat32_t v559; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v462), "w"(v474)); - svfloat32_t v560; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v462), "w"(v474)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v468), "w"(v480)); - svfloat32_t v562; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v468), "w"(v480)); - svfloat32_t v627; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v453), "w"(v472)); - svfloat32_t v628; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v628) : "w"(v453), "w"(v472)); - svfloat32_t v629; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v629) : "w"(v466), "w"(v478)); - svfloat32_t v630; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v630) : "w"(v466), "w"(v478)); - svfloat32_t v631; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v463), "w"(v475)); - svfloat32_t v632; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v632) : "w"(v463), "w"(v475)); - svfloat32_t v633; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v633) : "w"(v469), "w"(v481)); - svfloat32_t v634; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v469), "w"(v481)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v461), "w"(v473)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v461), "w"(v473)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v467), "w"(v479)); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v467), "w"(v479)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v464), "w"(v476)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v464), "w"(v476)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v470), "w"(v482)); - svfloat32_t v490; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v470), "w"(v482)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v555), "w"(v557)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v564) : "w"(v555), "w"(v557)); - svfloat32_t v565; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v559), "w"(v561)); - svfloat32_t v566; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v559), "w"(v561)); - svfloat32_t v569; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v560), "w"(v562)); - svfloat32_t v570; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v560), "w"(v562)); - svfloat32_t zero604; - asm volatile("mov %0.s, #0" : "=w"(zero604)); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v558 = svsub_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v462, v474); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v462, v474); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v630 = svsub_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v631 = svadd_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v634 = svsub_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v555, v557); + svfloat32_t 
v564 = svsub_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v565 = svadd_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t zero604 = svdup_n_f32(0); svfloat32_t v604 = svcmla_f32_x(pred_full, zero604, v1127, v558, 90); - svfloat32_t v635; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v627), "w"(v629)); - svfloat32_t v636; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v627), "w"(v629)); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v631), "w"(v633)); - svfloat32_t v638; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v638) : "w"(v631), "w"(v633)); - svfloat32_t v641; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v641) : "w"(v632), "w"(v634)); - svfloat32_t v642; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v632), "w"(v634)); - svfloat32_t zero675; - asm volatile("mov %0.s, #0" : "=w"(zero675)); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v638 = svsub_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v641 = svadd_f32_x(svptrue_b32(), v632, v634); + svfloat32_t v642 = svsub_f32_x(svptrue_b32(), v632, v634); + svfloat32_t zero675 = svdup_n_f32(0); svfloat32_t v675 = svcmla_f32_x(pred_full, zero675, v1134, v628, 90); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v483), "w"(v485)); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v483), "w"(v485)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v487), "w"(v489)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v487), "w"(v489)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v488), "w"(v490)); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v488), 
"w"(v490)); - svfloat32_t zero532; - asm volatile("mov %0.s, #0" : "=w"(zero532)); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t zero532 = svdup_n_f32(0); svfloat32_t v532 = svcmla_f32_x(pred_full, zero532, v1119, v486, 90); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v563), "w"(v565)); - svfloat32_t v568; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v563), "w"(v565)); - svfloat32_t zero592; - asm volatile("mov %0.s, #0" : "=w"(zero592)); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v563, v565); + svfloat32_t v568 = svsub_f32_x(svptrue_b32(), v563, v565); + svfloat32_t zero592 = svdup_n_f32(0); svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1127, v566, 90); - svfloat32_t zero611; - asm volatile("mov %0.s, #0" : "=w"(zero611)); + svfloat32_t zero611 = svdup_n_f32(0); svfloat32_t v611 = svcmla_f32_x(pred_full, zero611, v1128, v569, 90); - svfloat32_t v616; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v570), "w"(v1129)); - svfloat32_t v639; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v635), "w"(v637)); - svfloat32_t v640; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v640) : "w"(v635), "w"(v637)); - svfloat32_t zero663; - asm volatile("mov %0.s, #0" : "=w"(zero663)); + svfloat32_t v616 = svmul_f32_x(svptrue_b32(), v570, v1129); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v635, v637); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v635, v637); + svfloat32_t zero663 = svdup_n_f32(0); svfloat32_t v663 = svcmla_f32_x(pred_full, zero663, v1134, v636, 90); - svfloat32_t v685; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v685) : "w"(v641), "w"(v1136)); - svfloat32_t zero692; - asm volatile("mov %0.s, #0" : 
"=w"(zero692)); + svfloat32_t v685 = svmul_f32_x(svptrue_b32(), v641, v1136); + svfloat32_t zero692 = svdup_n_f32(0); svfloat32_t v692 = svcmla_f32_x(pred_full, zero692, v1137, v642, 90); - svfloat32_t v495; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v491), "w"(v493)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v491), "w"(v493)); - svfloat32_t zero520; - asm volatile("mov %0.s, #0" : "=w"(zero520)); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v491, v493); + svfloat32_t zero520 = svdup_n_f32(0); svfloat32_t v520 = svcmla_f32_x(pred_full, zero520, v1119, v494, 90); - svfloat32_t zero539; - asm volatile("mov %0.s, #0" : "=w"(zero539)); + svfloat32_t zero539 = svdup_n_f32(0); svfloat32_t v539 = svcmla_f32_x(pred_full, zero539, v1120, v497, 90); svfloat32_t v617 = svmla_f32_x(pred_full, v592, v564, v1126); svfloat32_t v618 = svnmls_f32_x(pred_full, v592, v564, v1126); svfloat32_t v619 = svmla_f32_x(pred_full, v616, v556, v1126); svfloat32_t v620 = svnmls_f32_x(pred_full, v616, v556, v1126); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v604), "w"(v611)); - svfloat32_t v622; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v604), "w"(v611)); - svfloat32_t zero649; - asm volatile("mov %0.s, #0" : "=w"(zero649)); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v604, v611); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v604, v611); + svfloat32_t zero649 = svdup_n_f32(0); svfloat32_t v649 = svcmla_f32_x(pred_full, zero649, v1134, v639, 90); - svfloat32_t zero656; - asm volatile("mov %0.s, #0" : "=w"(zero656)); + svfloat32_t zero656 = svdup_n_f32(0); svfloat32_t v656 = svcmla_f32_x(pred_full, zero656, v1134, v640, 90); svfloat32_t v693 = svmla_f32_x(pred_full, v663, v638, v1135); svfloat32_t v694 = svmls_f32_x(pred_full, v663, v638, v1135); - svfloat32_t v695; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v675), "w"(v692)); - svfloat32_t v696; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v675), "w"(v692)); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v675, v692); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v675, v692); svfloat32_t v697 = svmla_f32_x(pred_full, v685, v630, v1135); svfloat32_t v698 = svnmls_f32_x(pred_full, v685, v630, v1135); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v492), "w"(v520)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v492), "w"(v520)); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v492, v520); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v492, v520); svfloat32_t v547 = svmla_f32_x(pred_full, v484, v498, v1121); svfloat32_t v548 = svmls_f32_x(pred_full, v484, v498, v1121); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v532), "w"(v539)); - svfloat32_t v550; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v532), "w"(v539)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v619), "w"(v621)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v619), "w"(v621)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v620), "w"(v622)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v620), "w"(v622)); - svfloat32_t v699; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v695), "w"(v697)); - svfloat32_t v700; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v695), "w"(v697)); - svfloat32_t v701; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v696), "w"(v698)); - svfloat32_t v702; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v696), "w"(v698)); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v699 
= svadd_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v700 = svsub_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v696, v698); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v696, v698); svfloat32_t v703 = svmla_f32_x(pred_full, v495, v567, v1126); svfloat32_t v799 = svmla_f32_x(pred_full, v496, v568, v1126); svst1_scatter_s64index_f64(pred_full, (double *)(v1145), v1353, svreinterpret_f64_f32(v495)); svst1_scatter_s64index_f64(pred_full, (double *)(v1253), v1353, svreinterpret_f64_f32(v496)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v547), "w"(v549)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v547), "w"(v549)); - svfloat32_t v553; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v548), "w"(v550)); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v548), "w"(v550)); - svfloat32_t v704; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v703), "w"(v649)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v703), "w"(v649)); - svfloat32_t v751; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v751) : "w"(v546), "w"(v618)); - svfloat32_t v800; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v800) : "w"(v799), "w"(v656)); - svfloat32_t v801; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v799), "w"(v656)); - svfloat32_t v847; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v847) : "w"(v545), "w"(v617)); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v751 = svadd_f32_x(svptrue_b32(), v546, v618); + svfloat32_t v800 = svadd_f32_x(svptrue_b32(), v799, v656); + svfloat32_t v801 = svsub_f32_x(svptrue_b32(), v799, v656); + svfloat32_t v847 = 
svadd_f32_x(svptrue_b32(), v545, v617); svst1_scatter_s64index_f64(pred_full, (double *)(v1199), v1353, svreinterpret_f64_f32(v546)); svst1_scatter_s64index_f64(pred_full, (double *)(v1307), v1353, svreinterpret_f64_f32(v545)); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v552), "w"(v624)); - svfloat32_t v752; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v752) : "w"(v751), "w"(v694)); - svfloat32_t v753; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v753) : "w"(v751), "w"(v694)); - svfloat32_t v775; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v553), "w"(v625)); - svfloat32_t v823; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v823) : "w"(v554), "w"(v626)); - svfloat32_t v848; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v847), "w"(v693)); - svfloat32_t v849; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v849) : "w"(v847), "w"(v693)); - svfloat32_t v871; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v871) : "w"(v551), "w"(v623)); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v552, v624); + svfloat32_t v752 = svadd_f32_x(svptrue_b32(), v751, v694); + svfloat32_t v753 = svsub_f32_x(svptrue_b32(), v751, v694); + svfloat32_t v775 = svadd_f32_x(svptrue_b32(), v553, v625); + svfloat32_t v823 = svadd_f32_x(svptrue_b32(), v554, v626); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v847, v693); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v847, v693); + svfloat32_t v871 = svadd_f32_x(svptrue_b32(), v551, v623); svst1_scatter_s64index_f64(pred_full, (double *)(v1154), v1353, svreinterpret_f64_f32(v705)); svst1_scatter_s64index_f64(pred_full, (double *)(v1163), v1353, @@ -15227,22 +13377,14 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs24(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v554)); svst1_scatter_s64index_f64(pred_full, (double *)(v1334), v1353, svreinterpret_f64_f32(v551)); - svfloat32_t v728; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v728) : "w"(v727), "w"(v700)); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v727), "w"(v700)); - svfloat32_t 
v776; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v775), "w"(v701)); - svfloat32_t v777; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v777) : "w"(v775), "w"(v701)); - svfloat32_t v824; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v823), "w"(v702)); - svfloat32_t v825; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v825) : "w"(v823), "w"(v702)); - svfloat32_t v872; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v872) : "w"(v871), "w"(v699)); - svfloat32_t v873; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v871), "w"(v699)); + svfloat32_t v728 = svadd_f32_x(svptrue_b32(), v727, v700); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v727, v700); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v775, v701); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v775, v701); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v823, v702); + svfloat32_t v825 = svsub_f32_x(svptrue_b32(), v823, v702); + svfloat32_t v872 = svadd_f32_x(svptrue_b32(), v871, v699); + svfloat32_t v873 = svsub_f32_x(svptrue_b32(), v871, v699); svst1_scatter_s64index_f64(pred_full, (double *)(v1208), v1353, svreinterpret_f64_f32(v753)); svst1_scatter_s64index_f64(pred_full, (double *)(v1217), v1353, @@ -15286,7 +13428,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v7 = (const float32x2_t *)w; for (int j = 0; j < howmany; j += 1) { float32x2_t v92 = v5[istride]; - float v1070 = 0.0000000000000000e+00F; float v1163 = 9.6858316112863108e-01F; float v1166 = -2.4868988716485479e-01F; float v1167 = 2.4868988716485479e-01F; @@ -15321,7 +13462,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v98 = vtrn1_f32(v92, v92); float32x2_t v99 = vtrn2_f32(v92, v92); float32x2_t v452 = v5[0]; - float v1073 = dir * v1070; float32x2_t v1164 = (float32x2_t){v1163, v1163}; float32x2_t v1168 = (float32x2_t){v1166, v1167}; float32x2_t v1303 = (float32x2_t){v1302, v1302}; @@ -15392,7 +13532,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const 
armral_cmplx_f32_t *restrict x, int64_t v420 = 36 + j * 48; float32x2_t v434 = v5[istride * 24]; int64_t v438 = 46 + j * 48; - float32x2_t v1071 = (float32x2_t){v1070, v1073}; float32x2_t v1170 = vmul_f32(v1688, v1168); float32x2_t v1309 = vmul_f32(v1688, v1307); float32x2_t v1448 = vmul_f32(v1688, v1446); @@ -15567,86 +13706,26 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v411 = vfma_f32(v409, v405, v408); float32x2_t v429 = vfma_f32(v427, v423, v426); float32x2_t v447 = vfma_f32(v445, v441, v444); - float32x2_t v462 = vrev64_f32(v33); - float32x2_t v474 = vrev64_f32(v51); - float32x2_t v486 = vrev64_f32(v87); - float32x2_t v504 = vrev64_f32(v69); - float32x2_t v576 = vrev64_f32(v123); - float32x2_t v588 = vrev64_f32(v141); - float32x2_t v600 = vrev64_f32(v177); - float32x2_t v618 = vrev64_f32(v159); - float32x2_t v690 = vrev64_f32(v213); - float32x2_t v702 = vrev64_f32(v231); - float32x2_t v714 = vrev64_f32(v267); - float32x2_t v732 = vrev64_f32(v249); - float32x2_t v804 = vrev64_f32(v303); - float32x2_t v816 = vrev64_f32(v321); - float32x2_t v828 = vrev64_f32(v357); - float32x2_t v846 = vrev64_f32(v339); - float32x2_t v918 = vrev64_f32(v393); - float32x2_t v930 = vrev64_f32(v411); - float32x2_t v942 = vrev64_f32(v447); - float32x2_t v960 = vrev64_f32(v429); - float32x2_t v463 = vmul_f32(v462, v1071); - float32x2_t v475 = vmul_f32(v474, v1071); - float32x2_t v487 = vmul_f32(v486, v1071); - float32x2_t v505 = vmul_f32(v504, v1071); - float32x2_t v577 = vmul_f32(v576, v1071); - float32x2_t v589 = vmul_f32(v588, v1071); - float32x2_t v601 = vmul_f32(v600, v1071); - float32x2_t v619 = vmul_f32(v618, v1071); - float32x2_t v691 = vmul_f32(v690, v1071); - float32x2_t v703 = vmul_f32(v702, v1071); - float32x2_t v715 = vmul_f32(v714, v1071); - float32x2_t v733 = vmul_f32(v732, v1071); - float32x2_t v805 = vmul_f32(v804, v1071); - float32x2_t v817 = vmul_f32(v816, v1071); - float32x2_t v829 = vmul_f32(v828, v1071); - 
float32x2_t v847 = vmul_f32(v846, v1071); - float32x2_t v919 = vmul_f32(v918, v1071); - float32x2_t v931 = vmul_f32(v930, v1071); - float32x2_t v943 = vmul_f32(v942, v1071); - float32x2_t v961 = vmul_f32(v960, v1071); - float32x2_t v464 = vadd_f32(v463, v33); - float32x2_t v476 = vadd_f32(v475, v51); - float32x2_t v488 = vadd_f32(v487, v87); - float32x2_t v506 = vadd_f32(v505, v69); - float32x2_t v578 = vadd_f32(v577, v123); - float32x2_t v590 = vadd_f32(v589, v141); - float32x2_t v602 = vadd_f32(v601, v177); - float32x2_t v620 = vadd_f32(v619, v159); - float32x2_t v692 = vadd_f32(v691, v213); - float32x2_t v704 = vadd_f32(v703, v231); - float32x2_t v716 = vadd_f32(v715, v267); - float32x2_t v734 = vadd_f32(v733, v249); - float32x2_t v806 = vadd_f32(v805, v303); - float32x2_t v818 = vadd_f32(v817, v321); - float32x2_t v830 = vadd_f32(v829, v357); - float32x2_t v848 = vadd_f32(v847, v339); - float32x2_t v920 = vadd_f32(v919, v393); - float32x2_t v932 = vadd_f32(v931, v411); - float32x2_t v944 = vadd_f32(v943, v447); - float32x2_t v962 = vadd_f32(v961, v429); - float32x2_t v489 = vsub_f32(v464, v488); - float32x2_t v493 = vmul_f32(v464, v1710); - float32x2_t v507 = vsub_f32(v476, v506); - float32x2_t v511 = vmul_f32(v476, v1710); - float32x2_t v603 = vsub_f32(v578, v602); - float32x2_t v607 = vmul_f32(v578, v1710); - float32x2_t v621 = vsub_f32(v590, v620); - float32x2_t v625 = vmul_f32(v590, v1710); - float32x2_t v717 = vsub_f32(v692, v716); - float32x2_t v721 = vmul_f32(v692, v1710); - float32x2_t v735 = vsub_f32(v704, v734); - float32x2_t v739 = vmul_f32(v704, v1710); - float32x2_t v831 = vsub_f32(v806, v830); - float32x2_t v835 = vmul_f32(v806, v1710); - float32x2_t v849 = vsub_f32(v818, v848); - float32x2_t v853 = vmul_f32(v818, v1710); - float32x2_t v945 = vsub_f32(v920, v944); - float32x2_t v949 = vmul_f32(v920, v1710); - float32x2_t v963 = vsub_f32(v932, v962); - float32x2_t v967 = vmul_f32(v932, v1710); + float32x2_t v489 = vsub_f32(v33, v87); + float32x2_t 
v493 = vmul_f32(v33, v1710); + float32x2_t v507 = vsub_f32(v51, v69); + float32x2_t v511 = vmul_f32(v51, v1710); + float32x2_t v603 = vsub_f32(v123, v177); + float32x2_t v607 = vmul_f32(v123, v1710); + float32x2_t v621 = vsub_f32(v141, v159); + float32x2_t v625 = vmul_f32(v141, v1710); + float32x2_t v717 = vsub_f32(v213, v267); + float32x2_t v721 = vmul_f32(v213, v1710); + float32x2_t v735 = vsub_f32(v231, v249); + float32x2_t v739 = vmul_f32(v231, v1710); + float32x2_t v831 = vsub_f32(v303, v357); + float32x2_t v835 = vmul_f32(v303, v1710); + float32x2_t v849 = vsub_f32(v321, v339); + float32x2_t v853 = vmul_f32(v321, v1710); + float32x2_t v945 = vsub_f32(v393, v447); + float32x2_t v949 = vmul_f32(v393, v1710); + float32x2_t v963 = vsub_f32(v411, v429); + float32x2_t v967 = vmul_f32(v411, v1710); float32x2_t v494 = vsub_f32(v493, v489); float32x2_t v512 = vsub_f32(v511, v507); float32x2_t v523 = vmul_f32(v507, v1663); @@ -15727,10 +13806,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v975 = vsub_f32(v375, v974); float32x2_t v1003 = vmul_f32(v1002, v1689); float32x2_t v1011 = vmul_f32(v1010, v1689); - float32x2_t v1032 = vrev64_f32(v654); - float32x2_t v1044 = vrev64_f32(v768); - float32x2_t v1056 = vrev64_f32(v996); - float32x2_t v1074 = vrev64_f32(v882); + float32x2_t v1059 = vsub_f32(v654, v996); + float32x2_t v1063 = vmul_f32(v654, v1710); + float32x2_t v1077 = vsub_f32(v768, v882); + float32x2_t v1081 = vmul_f32(v768, v1710); float32x2_t v529 = vsub_f32(v519, v528); float32x2_t v533 = vmul_f32(v519, v1710); float32x2_t v643 = vsub_f32(v633, v642); @@ -15741,10 +13820,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v875 = vmul_f32(v861, v1710); float32x2_t v985 = vsub_f32(v975, v984); float32x2_t v989 = vmul_f32(v975, v1710); - float32x2_t v1033 = vmul_f32(v1032, v1071); - float32x2_t v1045 = vmul_f32(v1044, v1071); - float32x2_t v1057 = vmul_f32(v1056, v1071); - 
float32x2_t v1075 = vmul_f32(v1074, v1071); + float32x2_t v1064 = vsub_f32(v1063, v1059); + float32x2_t v1082 = vsub_f32(v1081, v1077); + float32x2_t v1093 = vmul_f32(v1077, v1663); + float32x2_t v1108 = vmul_f32(v1059, v1663); float32x2_t v534 = vsub_f32(v533, v529); float32x2_t v556 = vsub_f32(v529, v555); float32x2_t v560 = vmul_f32(v529, v1710); @@ -15760,10 +13839,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v990 = vsub_f32(v989, v985); float32x2_t v1012 = vsub_f32(v985, v1011); float32x2_t v1016 = vmul_f32(v985, v1710); - float32x2_t v1034 = vadd_f32(v1033, v654); - float32x2_t v1046 = vadd_f32(v1045, v768); - float32x2_t v1058 = vadd_f32(v1057, v996); - float32x2_t v1076 = vadd_f32(v1075, v882); + float32x2_t v1083 = vadd_f32(v1064, v1082); + float32x2_t v1084 = vsub_f32(v1064, v1082); + float32x2_t v1094 = vadd_f32(v1059, v1093); + float32x2_t v1109 = vsub_f32(v1108, v1077); float32x2_t v548 = vsub_f32(v534, v547); float32x2_t v561 = vsub_f32(v560, v556); float32x2_t v565 = vmul_f32(v534, v1710); @@ -15779,10 +13858,11 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1004 = vsub_f32(v990, v1003); float32x2_t v1017 = vsub_f32(v1016, v1012); float32x2_t v1021 = vmul_f32(v990, v1710); - float32x2_t v1059 = vsub_f32(v1034, v1058); - float32x2_t v1063 = vmul_f32(v1034, v1710); - float32x2_t v1077 = vsub_f32(v1046, v1076); - float32x2_t v1081 = vmul_f32(v1046, v1710); + float32x2_t v1088 = vmul_f32(v1083, v1643); + float32x2_t v1098 = vmul_f32(v1084, v1653); + float32x2_t v1110 = vadd_f32(v540, v1083); + float32x2_t v1121 = vrev64_f32(v1094); + float32x2_t v1134 = vrev64_f32(v1109); float32x2_t v1310 = vrev64_f32(v670); float32x2_t v1322 = vrev64_f32(v784); float32x2_t v1334 = vrev64_f32(v1012); @@ -15792,10 +13872,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v794 = vsub_f32(v793, v776); float32x2_t v908 = 
vsub_f32(v907, v890); float32x2_t v1022 = vsub_f32(v1021, v1004); - float32x2_t v1064 = vsub_f32(v1063, v1059); - float32x2_t v1082 = vsub_f32(v1081, v1077); - float32x2_t v1093 = vmul_f32(v1077, v1663); - float32x2_t v1108 = vmul_f32(v1059, v1663); + float32x2_t v1089 = vsub_f32(v540, v1088); + v6[0] = v1110; + float32x2_t v1122 = vmul_f32(v1121, v1689); + float32x2_t v1135 = vmul_f32(v1134, v1689); float32x2_t v1171 = vrev64_f32(v662); float32x2_t v1183 = vrev64_f32(v776); float32x2_t v1195 = vrev64_f32(v1004); @@ -15808,10 +13888,8 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1461 = vrev64_f32(v789); float32x2_t v1473 = vrev64_f32(v1017); float32x2_t v1491 = vrev64_f32(v903); - float32x2_t v1083 = vadd_f32(v1064, v1082); - float32x2_t v1084 = vsub_f32(v1064, v1082); - float32x2_t v1094 = vadd_f32(v1059, v1093); - float32x2_t v1109 = vsub_f32(v1108, v1077); + float32x2_t v1099 = vsub_f32(v1089, v1098); + float32x2_t v1103 = vmul_f32(v1089, v1710); float32x2_t v1172 = vmul_f32(v1171, v1170); float32x2_t v1184 = vmul_f32(v1183, v1309); float32x2_t v1196 = vmul_f32(v1195, v1587); @@ -15828,11 +13906,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1600 = vrev64_f32(v794); float32x2_t v1612 = vrev64_f32(v1022); float32x2_t v1630 = vrev64_f32(v908); - float32x2_t v1088 = vmul_f32(v1083, v1643); - float32x2_t v1098 = vmul_f32(v1084, v1653); - float32x2_t v1110 = vadd_f32(v540, v1083); - float32x2_t v1121 = vrev64_f32(v1094); - float32x2_t v1134 = vrev64_f32(v1109); + float32x2_t v1104 = vsub_f32(v1103, v1099); + float32x2_t v1136 = vsub_f32(v1099, v1135); + float32x2_t v1145 = vmul_f32(v1099, v1710); float32x2_t v1173 = vfma_f32(v1172, v662, v1164); float32x2_t v1185 = vfma_f32(v1184, v776, v1303); float32x2_t v1197 = vfma_f32(v1196, v1004, v1581); @@ -15849,10 +13925,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1601 = 
vmul_f32(v1600, v1599); float32x2_t v1613 = vmul_f32(v1612, v1611); float32x2_t v1631 = vmul_f32(v1630, v1629); - float32x2_t v1089 = vsub_f32(v540, v1088); - v6[0] = v1110; - float32x2_t v1122 = vmul_f32(v1121, v1689); - float32x2_t v1135 = vmul_f32(v1134, v1689); + float32x2_t v1123 = vsub_f32(v1104, v1122); + v6[ostride * 10] = v1136; + float32x2_t v1146 = vsub_f32(v1145, v1136); + float32x2_t v1155 = vmul_f32(v1104, v1710); float32x2_t v1198 = vsub_f32(v1173, v1197); float32x2_t v1202 = vmul_f32(v1173, v1710); float32x2_t v1216 = vsub_f32(v1185, v1215); @@ -15869,8 +13945,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1602 = vfma_f32(v1601, v794, v1593); float32x2_t v1614 = vfma_f32(v1613, v1022, v1605); float32x2_t v1632 = vfma_f32(v1631, v908, v1623); - float32x2_t v1099 = vsub_f32(v1089, v1098); - float32x2_t v1103 = vmul_f32(v1089, v1710); + v6[ostride * 5] = v1123; + v6[ostride * 15] = v1146; + float32x2_t v1156 = vsub_f32(v1155, v1123); float32x2_t v1203 = vsub_f32(v1202, v1198); float32x2_t v1221 = vsub_f32(v1220, v1216); float32x2_t v1232 = vmul_f32(v1216, v1663); @@ -15887,9 +13964,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1619 = vmul_f32(v1590, v1710); float32x2_t v1633 = vsub_f32(v1602, v1632); float32x2_t v1637 = vmul_f32(v1602, v1710); - float32x2_t v1104 = vsub_f32(v1103, v1099); - float32x2_t v1136 = vsub_f32(v1099, v1135); - float32x2_t v1145 = vmul_f32(v1099, v1710); + v6[ostride * 20] = v1156; float32x2_t v1222 = vadd_f32(v1203, v1221); float32x2_t v1223 = vsub_f32(v1203, v1221); float32x2_t v1233 = vadd_f32(v1198, v1232); @@ -15907,10 +13982,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1638 = vsub_f32(v1637, v1633); float32x2_t v1649 = vmul_f32(v1633, v1663); float32x2_t v1664 = vmul_f32(v1615, v1663); - float32x2_t v1123 = vsub_f32(v1104, v1122); - v6[ostride * 10] = v1136; - float32x2_t 
v1146 = vsub_f32(v1145, v1136); - float32x2_t v1155 = vmul_f32(v1104, v1710); float32x2_t v1227 = vmul_f32(v1222, v1643); float32x2_t v1237 = vmul_f32(v1223, v1653); float32x2_t v1249 = vadd_f32(v548, v1222); @@ -15929,9 +14000,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1640 = vsub_f32(v1620, v1638); float32x2_t v1650 = vadd_f32(v1615, v1649); float32x2_t v1665 = vsub_f32(v1664, v1633); - v6[ostride * 5] = v1123; - v6[ostride * 15] = v1146; - float32x2_t v1156 = vsub_f32(v1155, v1123); float32x2_t v1228 = vsub_f32(v548, v1227); v6[ostride] = v1249; float32x2_t v1261 = vmul_f32(v1260, v1689); @@ -15947,7 +14015,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float32x2_t v1666 = vadd_f32(v566, v1639); float32x2_t v1677 = vrev64_f32(v1650); float32x2_t v1690 = vrev64_f32(v1665); - v6[ostride * 20] = v1156; float32x2_t v1238 = vsub_f32(v1228, v1237); float32x2_t v1242 = vmul_f32(v1228, v1710); float32x2_t v1382 = vsub_f32(v1381, v1377); @@ -16050,7 +14117,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, float v1713 = 2.5000000000000000e-01F; float v1725 = 5.5901699437494745e-01F; float v1737 = 6.1803398874989490e-01F; - float v1765 = 0.0000000000000000e+00F; float v1766 = -9.5105651629515353e-01F; float v1794 = 2.0000000000000000e+00F; const float32x2_t *v1848 = &v5[v0]; @@ -16101,7 +14167,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, int64_t v341 = v0 * 24; int64_t v349 = v10 * 23; int64_t v350 = v13 * 24; - float v1051 = v4 * v1765; int64_t v1111 = v2 * 5; int64_t v1126 = v2 * 10; int64_t v1139 = v2 * 15; @@ -16137,6 +14202,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, int64_t v1800 = v2 * 24; const float32x2_t *v2030 = &v5[0]; svint64_t v2031 = svindex_s64(0, v1); + svfloat32_t v2136 = svdup_n_f32(0); float32x2_t *v2150 = &v6[0]; svfloat32_t v2193 = svdup_n_f32(v1159); 
svfloat32_t v2257 = svdup_n_f32(v1321); @@ -16203,7 +14269,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v2020 = &v5[v341]; svfloat32_t v2032 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v2030), v2031)); - svfloat32_t v2136 = svdup_n_f32(v1051); float32x2_t *v2160 = &v6[v1111]; float32x2_t *v2170 = &v6[v1126]; float32x2_t *v2180 = &v6[v1139]; @@ -16245,8 +14310,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); svfloat32_t v72 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); - svfloat32_t zero87; - asm volatile("mov %0.s, #0" : "=w"(zero87)); + svfloat32_t zero87 = svdup_n_f32(0); svfloat32_t v87 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero87, v1850, v86, 0), v1850, v86, 90); @@ -16334,118 +14398,95 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v2011), v2031)); svfloat32_t v2022 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v2020), v2031)); - svfloat32_t zero31; - asm volatile("mov %0.s, #0" : "=w"(zero31)); + svfloat32_t zero31 = svdup_n_f32(0); svfloat32_t v31 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero31, v1814, v30, 0), v1814, v30, 90); - svfloat32_t zero45; - asm volatile("mov %0.s, #0" : "=w"(zero45)); + svfloat32_t zero45 = svdup_n_f32(0); svfloat32_t v45 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero45, v1823, v44, 0), v1823, v44, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v1832, v58, 0), v1832, v58, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1841, v72, 0), v1841, v72, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero101, v1860, v100, 0), v1860, v100, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero115, v1869, v114, 0), v1869, v114, 90); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero129, v1878, v128, 0), v1878, v128, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1887, v142, 0), v1887, v142, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero157, v1896, v156, 0), v1896, v156, 90); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero171, v1905, v170, 0), v1905, v170, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1914, v184, 0), v1914, v184, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero199, v1923, v198, 0), v1923, v198, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t 
v213 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero213, v1932, v212, 0), v1932, v212, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1941, v226, 0), v1941, v226, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero241, v1950, v240, 0), v1950, v240, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero255, v1959, v254, 0), v1959, v254, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1968, v268, 0), v1968, v268, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero283, v1977, v282, 0), v1977, v282, 90); - svfloat32_t zero297; - asm volatile("mov %0.s, #0" : "=w"(zero297)); + svfloat32_t zero297 = svdup_n_f32(0); svfloat32_t v297 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero297, v1986, v296, 0), v1986, v296, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero311, v1995, v310, 0), v1995, v310, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v2004, v324, 0), v2004, v324, 90); - svfloat32_t zero339; - asm volatile("mov %0.s, #0" : "=w"(zero339)); + svfloat32_t zero339 = svdup_n_f32(0); 
svfloat32_t v339 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero339, v2013, v338, 0), v2013, v338, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v2022, v352, 0), v2022, v352, 90); @@ -16469,26 +14510,16 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svfloat32_t v894 = svcmla_f32_x(pred_full, v325, v2136, v325, 90); svfloat32_t v907 = svcmla_f32_x(pred_full, v353, v2136, v353, 90); svfloat32_t v927 = svcmla_f32_x(pred_full, v339, v2136, v339, 90); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v373), "w"(v399)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v386), "w"(v419)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v500), "w"(v526)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v513), "w"(v546)); - svfloat32_t v654; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v654) : "w"(v627), "w"(v653)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v640), "w"(v673)); - svfloat32_t v781; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v754), "w"(v780)); - svfloat32_t v801; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v767), "w"(v800)); - svfloat32_t v908; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v908) : "w"(v881), "w"(v907)); - svfloat32_t v928; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v928) : "w"(v894), "w"(v927)); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v373, v399); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v386, v419); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v500, v526); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v513, v546); + svfloat32_t v654 = svsub_f32_x(svptrue_b32(), v627, v653); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v640, v673); + svfloat32_t v781 = svsub_f32_x(svptrue_b32(), v754, v780); + svfloat32_t v801 = svsub_f32_x(svptrue_b32(), 
v767, v800); + svfloat32_t v908 = svsub_f32_x(svptrue_b32(), v881, v907); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v894, v927); svfloat32_t v406 = svnmls_f32_x(pred_full, v400, v373, v2439); svfloat32_t v426 = svnmls_f32_x(pred_full, v420, v386, v2439); svfloat32_t v533 = svnmls_f32_x(pred_full, v527, v500, v2439); @@ -16499,75 +14530,50 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svfloat32_t v807 = svnmls_f32_x(pred_full, v801, v767, v2439); svfloat32_t v914 = svnmls_f32_x(pred_full, v908, v881, v2439); svfloat32_t v934 = svnmls_f32_x(pred_full, v928, v894, v2439); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v406), "w"(v426)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v406), "w"(v426)); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v406, v426); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v406, v426); svfloat32_t v440 = svmla_f32_x(pred_full, v400, v420, v2399); svfloat32_t v458 = svnmls_f32_x(pred_full, v420, v400, v2399); - svfloat32_t v554; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v533), "w"(v553)); - svfloat32_t v555; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v533), "w"(v553)); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v533, v553); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v533, v553); svfloat32_t v567 = svmla_f32_x(pred_full, v527, v547, v2399); svfloat32_t v585 = svnmls_f32_x(pred_full, v547, v527, v2399); - svfloat32_t v681; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v660), "w"(v680)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v660), "w"(v680)); + svfloat32_t v681 = svadd_f32_x(svptrue_b32(), v660, v680); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v660, v680); svfloat32_t v694 = svmla_f32_x(pred_full, v654, v674, v2399); svfloat32_t v712 = svnmls_f32_x(pred_full, v674, v654, v2399); - svfloat32_t v808; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v808) : "w"(v787), "w"(v807)); - svfloat32_t v809; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v809) : "w"(v787), "w"(v807)); + svfloat32_t v808 = svadd_f32_x(svptrue_b32(), v787, v807); + svfloat32_t v809 = svsub_f32_x(svptrue_b32(), v787, v807); svfloat32_t v821 = svmla_f32_x(pred_full, v781, v801, v2399); svfloat32_t v839 = svnmls_f32_x(pred_full, v801, v781, v2399); - svfloat32_t v935; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v935) : "w"(v914), "w"(v934)); - svfloat32_t v936; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v936) : "w"(v914), "w"(v934)); + svfloat32_t v935 = svadd_f32_x(svptrue_b32(), v914, v934); + svfloat32_t v936 = svsub_f32_x(svptrue_b32(), v914, v934); svfloat32_t v948 = svmla_f32_x(pred_full, v908, v928, v2399); svfloat32_t v966 = svnmls_f32_x(pred_full, v928, v908, v2399); - svfloat32_t v459; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v2032), "w"(v427)); - svfloat32_t zero466; - asm volatile("mov %0.s, #0" : "=w"(zero466)); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v2032, v427); + svfloat32_t zero466 = svdup_n_f32(0); svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v2419, v440, 90); - svfloat32_t zero474; - asm volatile("mov %0.s, #0" : "=w"(zero474)); + svfloat32_t zero474 = svdup_n_f32(0); svfloat32_t v474 = svcmla_f32_x(pred_full, zero474, v2419, v458, 90); - svfloat32_t v586; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v87), "w"(v554)); - svfloat32_t zero593; - asm volatile("mov %0.s, #0" : "=w"(zero593)); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v87, v554); + svfloat32_t zero593 = svdup_n_f32(0); svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v2419, v567, 90); - svfloat32_t zero601; - asm volatile("mov %0.s, #0" : "=w"(zero601)); + svfloat32_t zero601 = svdup_n_f32(0); svfloat32_t v601 = svcmla_f32_x(pred_full, zero601, v2419, v585, 90); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v157), "w"(v681)); - svfloat32_t zero720; - asm volatile("mov %0.s, #0" : "=w"(zero720)); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v157, v681); + svfloat32_t zero720 
= svdup_n_f32(0); svfloat32_t v720 = svcmla_f32_x(pred_full, zero720, v2419, v694, 90); - svfloat32_t zero728; - asm volatile("mov %0.s, #0" : "=w"(zero728)); + svfloat32_t zero728 = svdup_n_f32(0); svfloat32_t v728 = svcmla_f32_x(pred_full, zero728, v2419, v712, 90); - svfloat32_t v840; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v227), "w"(v808)); - svfloat32_t zero847; - asm volatile("mov %0.s, #0" : "=w"(zero847)); + svfloat32_t v840 = svadd_f32_x(svptrue_b32(), v227, v808); + svfloat32_t zero847 = svdup_n_f32(0); svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v2419, v821, 90); - svfloat32_t zero855; - asm volatile("mov %0.s, #0" : "=w"(zero855)); + svfloat32_t zero855 = svdup_n_f32(0); svfloat32_t v855 = svcmla_f32_x(pred_full, zero855, v2419, v839, 90); - svfloat32_t v967; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v297), "w"(v935)); - svfloat32_t zero974; - asm volatile("mov %0.s, #0" : "=w"(zero974)); + svfloat32_t v967 = svadd_f32_x(svptrue_b32(), v297, v935); + svfloat32_t zero974 = svdup_n_f32(0); svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2419, v948, 90); - svfloat32_t zero982; - asm volatile("mov %0.s, #0" : "=w"(zero982)); + svfloat32_t zero982 = svdup_n_f32(0); svfloat32_t v982 = svcmla_f32_x(pred_full, zero982, v2419, v966, 90); svfloat32_t v434 = svmls_f32_x(pred_full, v2032, v427, v2395); svfloat32_t v561 = svmls_f32_x(pred_full, v87, v554, v2395); @@ -16584,119 +14590,80 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1034 = svcmla_f32_x(pred_full, v967, v2136, v967, 90); svfloat32_t v1054 = svcmla_f32_x(pred_full, v840, v2136, v840, 90); svfloat32_t v452 = svnmls_f32_x(pred_full, v446, v434, v2439); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v446), "w"(v474)); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v446, v474); svfloat32_t v579 = svnmls_f32_x(pred_full, v573, v561, v2439); - svfloat32_t v602; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v602) : 
"w"(v573), "w"(v601)); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v573, v601); svfloat32_t v706 = svnmls_f32_x(pred_full, v700, v688, v2439); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v700), "w"(v728)); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v700, v728); svfloat32_t v833 = svnmls_f32_x(pred_full, v827, v815, v2439); - svfloat32_t v856; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v827), "w"(v855)); + svfloat32_t v856 = svsub_f32_x(svptrue_b32(), v827, v855); svfloat32_t v960 = svnmls_f32_x(pred_full, v954, v942, v2439); - svfloat32_t v983; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v983) : "w"(v954), "w"(v982)); - svfloat32_t v1035; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1035) : "w"(v1008), "w"(v1034)); - svfloat32_t v1055; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1055) : "w"(v1021), "w"(v1054)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v452), "w"(v466)); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v954, v982); + svfloat32_t v1035 = svsub_f32_x(svptrue_b32(), v1008, v1034); + svfloat32_t v1055 = svsub_f32_x(svptrue_b32(), v1021, v1054); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v452, v466); svfloat32_t v481 = svnmls_f32_x(pred_full, v475, v446, v2439); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v579), "w"(v593)); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v579, v593); svfloat32_t v608 = svnmls_f32_x(pred_full, v602, v573, v2439); - svfloat32_t v721; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v706), "w"(v720)); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v706, v720); svfloat32_t v735 = svnmls_f32_x(pred_full, v729, v700, v2439); - svfloat32_t v848; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v833), "w"(v847)); + svfloat32_t v848 = svsub_f32_x(svptrue_b32(), v833, v847); svfloat32_t v862 = svnmls_f32_x(pred_full, v856, v827, v2439); - svfloat32_t v975; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v975) : "w"(v960), "w"(v974)); + svfloat32_t v975 = 
svsub_f32_x(svptrue_b32(), v960, v974); svfloat32_t v989 = svnmls_f32_x(pred_full, v983, v954, v2439); svfloat32_t v1041 = svnmls_f32_x(pred_full, v1035, v1008, v2439); svfloat32_t v1061 = svnmls_f32_x(pred_full, v1055, v1021, v2439); - svfloat32_t v1324; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1324) : "w"(v602), "w"(v2257)); - svfloat32_t v1337; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1337) : "w"(v729), "w"(v2385)); - svfloat32_t v1350; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1350) : "w"(v983), "w"(v2387)); - svfloat32_t v1370; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1370) : "w"(v856), "w"(v2323)); + svfloat32_t v1324 = svmul_f32_x(svptrue_b32(), v602, v2257); + svfloat32_t v1337 = svmul_f32_x(svptrue_b32(), v729, v2385); + svfloat32_t v1350 = svmul_f32_x(svptrue_b32(), v983, v2387); + svfloat32_t v1370 = svmul_f32_x(svptrue_b32(), v856, v2323); svfloat32_t v487 = svnmls_f32_x(pred_full, v467, v452, v2439); svfloat32_t v614 = svnmls_f32_x(pred_full, v594, v579, v2439); svfloat32_t v741 = svnmls_f32_x(pred_full, v721, v706, v2439); svfloat32_t v868 = svnmls_f32_x(pred_full, v848, v833, v2439); svfloat32_t v995 = svnmls_f32_x(pred_full, v975, v960, v2439); - svfloat32_t v1062; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1062) : "w"(v1041), "w"(v1061)); - svfloat32_t v1063; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1063) : "w"(v1041), "w"(v1061)); + svfloat32_t v1062 = svadd_f32_x(svptrue_b32(), v1041, v1061); + svfloat32_t v1063 = svsub_f32_x(svptrue_b32(), v1041, v1061); svfloat32_t v1075 = svmla_f32_x(pred_full, v1035, v1055, v2399); svfloat32_t v1093 = svnmls_f32_x(pred_full, v1055, v1035, v2399); - svfloat32_t v1162; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1162) : "w"(v594), "w"(v2193)); - svfloat32_t v1175; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1175) : "w"(v721), "w"(v2257)); - svfloat32_t v1188; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1188) : "w"(v975), "w"(v2385)); - svfloat32_t v1208; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1208) : "w"(v848), "w"(v2321)); + svfloat32_t v1162 = 
svmul_f32_x(svptrue_b32(), v594, v2193); + svfloat32_t v1175 = svmul_f32_x(svptrue_b32(), v721, v2257); + svfloat32_t v1188 = svmul_f32_x(svptrue_b32(), v975, v2385); + svfloat32_t v1208 = svmul_f32_x(svptrue_b32(), v848, v2321); svfloat32_t v1332 = svcmla_f32_x(pred_full, v1324, v2258, v602, 90); svfloat32_t v1345 = svcmla_f32_x(pred_full, v1337, v2386, v729, 90); svfloat32_t v1358 = svcmla_f32_x(pred_full, v1350, v2388, v983, 90); svfloat32_t v1378 = svcmla_f32_x(pred_full, v1370, v2324, v856, 90); - svfloat32_t v1486; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1486) : "w"(v608), "w"(v2321)); - svfloat32_t v1499; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1499) : "w"(v735), "w"(v2323)); - svfloat32_t v1512; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1512) : "w"(v989), "w"(v2392)); - svfloat32_t v1532; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1532) : "w"(v862), "w"(v2389)); - svfloat32_t v1094; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1094) : "w"(v459), "w"(v1062)); - svfloat32_t zero1108; - asm volatile("mov %0.s, #0" : "=w"(zero1108)); + svfloat32_t v1486 = svmul_f32_x(svptrue_b32(), v608, v2321); + svfloat32_t v1499 = svmul_f32_x(svptrue_b32(), v735, v2323); + svfloat32_t v1512 = svmul_f32_x(svptrue_b32(), v989, v2392); + svfloat32_t v1532 = svmul_f32_x(svptrue_b32(), v862, v2389); + svfloat32_t v1094 = svadd_f32_x(svptrue_b32(), v459, v1062); + svfloat32_t zero1108 = svdup_n_f32(0); svfloat32_t v1108 = svcmla_f32_x(pred_full, zero1108, v2419, v1075, 90); - svfloat32_t zero1123; - asm volatile("mov %0.s, #0" : "=w"(zero1123)); + svfloat32_t zero1123 = svdup_n_f32(0); svfloat32_t v1123 = svcmla_f32_x(pred_full, zero1123, v2419, v1093, 90); svfloat32_t v1170 = svcmla_f32_x(pred_full, v1162, v2194, v594, 90); svfloat32_t v1183 = svcmla_f32_x(pred_full, v1175, v2258, v721, 90); svfloat32_t v1196 = svcmla_f32_x(pred_full, v1188, v2386, v975, 90); svfloat32_t v1216 = svcmla_f32_x(pred_full, v1208, v2322, v848, 90); - svfloat32_t v1359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1359) : 
"w"(v1332), "w"(v1358)); - svfloat32_t v1379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1379) : "w"(v1345), "w"(v1378)); + svfloat32_t v1359 = svsub_f32_x(svptrue_b32(), v1332, v1358); + svfloat32_t v1379 = svsub_f32_x(svptrue_b32(), v1345, v1378); svfloat32_t v1494 = svcmla_f32_x(pred_full, v1486, v2322, v608, 90); svfloat32_t v1507 = svcmla_f32_x(pred_full, v1499, v2324, v735, 90); svfloat32_t v1520 = svcmla_f32_x(pred_full, v1512, v2393, v989, 90); svfloat32_t v1540 = svcmla_f32_x(pred_full, v1532, v2329, v862, 90); - svfloat32_t v1648; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1648) : "w"(v614), "w"(v2385)); - svfloat32_t v1661; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1661) : "w"(v741), "w"(v2387)); - svfloat32_t v1674; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1674) : "w"(v995), "w"(v2389)); - svfloat32_t v1694; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1694) : "w"(v868), "w"(v2392)); + svfloat32_t v1648 = svmul_f32_x(svptrue_b32(), v614, v2385); + svfloat32_t v1661 = svmul_f32_x(svptrue_b32(), v741, v2387); + svfloat32_t v1674 = svmul_f32_x(svptrue_b32(), v995, v2389); + svfloat32_t v1694 = svmul_f32_x(svptrue_b32(), v868, v2392); svfloat32_t v1069 = svmls_f32_x(pred_full, v459, v1062, v2395); - svfloat32_t v1197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1197) : "w"(v1170), "w"(v1196)); - svfloat32_t v1217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1217) : "w"(v1183), "w"(v1216)); + svfloat32_t v1197 = svsub_f32_x(svptrue_b32(), v1170, v1196); + svfloat32_t v1217 = svsub_f32_x(svptrue_b32(), v1183, v1216); svfloat32_t v1365 = svnmls_f32_x(pred_full, v1359, v1332, v2439); svfloat32_t v1385 = svnmls_f32_x(pred_full, v1379, v1345, v2439); - svfloat32_t v1521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1521) : "w"(v1494), "w"(v1520)); - svfloat32_t v1541; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1541) : "w"(v1507), "w"(v1540)); + svfloat32_t v1521 = svsub_f32_x(svptrue_b32(), v1494, v1520); + svfloat32_t v1541 = svsub_f32_x(svptrue_b32(), v1507, v1540); svfloat32_t v1656 = 
svcmla_f32_x(pred_full, v1648, v2386, v614, 90); svfloat32_t v1669 = svcmla_f32_x(pred_full, v1661, v2388, v741, 90); svfloat32_t v1682 = svcmla_f32_x(pred_full, v1674, v2390, v995, 90); @@ -16706,67 +14673,46 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1081 = svmls_f32_x(pred_full, v1069, v1063, v2397); svfloat32_t v1203 = svnmls_f32_x(pred_full, v1197, v1170, v2439); svfloat32_t v1223 = svnmls_f32_x(pred_full, v1217, v1183, v2439); - svfloat32_t v1386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1386) : "w"(v1365), "w"(v1385)); - svfloat32_t v1387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1387) : "w"(v1365), "w"(v1385)); + svfloat32_t v1386 = svadd_f32_x(svptrue_b32(), v1365, v1385); + svfloat32_t v1387 = svsub_f32_x(svptrue_b32(), v1365, v1385); svfloat32_t v1399 = svmla_f32_x(pred_full, v1359, v1379, v2399); svfloat32_t v1417 = svnmls_f32_x(pred_full, v1379, v1359, v2399); svfloat32_t v1527 = svnmls_f32_x(pred_full, v1521, v1494, v2439); svfloat32_t v1547 = svnmls_f32_x(pred_full, v1541, v1507, v2439); - svfloat32_t v1683; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1683) : "w"(v1656), "w"(v1682)); - svfloat32_t v1703; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1703) : "w"(v1669), "w"(v1702)); + svfloat32_t v1683 = svsub_f32_x(svptrue_b32(), v1656, v1682); + svfloat32_t v1703 = svsub_f32_x(svptrue_b32(), v1669, v1702); svfloat32_t v1087 = svnmls_f32_x(pred_full, v1081, v1069, v2439); - svfloat32_t v1124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1124) : "w"(v1081), "w"(v1123)); - svfloat32_t v1224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1224) : "w"(v1203), "w"(v1223)); - svfloat32_t v1225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1225) : "w"(v1203), "w"(v1223)); + svfloat32_t v1124 = svsub_f32_x(svptrue_b32(), v1081, v1123); + svfloat32_t v1224 = svadd_f32_x(svptrue_b32(), v1203, v1223); + svfloat32_t v1225 = svsub_f32_x(svptrue_b32(), v1203, v1223); svfloat32_t v1237 = svmla_f32_x(pred_full, v1197, v1217, v2399); svfloat32_t v1255 = 
svnmls_f32_x(pred_full, v1217, v1197, v2399); - svfloat32_t v1418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1418) : "w"(v475), "w"(v1386)); - svfloat32_t zero1432; - asm volatile("mov %0.s, #0" : "=w"(zero1432)); + svfloat32_t v1418 = svadd_f32_x(svptrue_b32(), v475, v1386); + svfloat32_t zero1432 = svdup_n_f32(0); svfloat32_t v1432 = svcmla_f32_x(pred_full, zero1432, v2419, v1399, 90); - svfloat32_t zero1447; - asm volatile("mov %0.s, #0" : "=w"(zero1447)); + svfloat32_t zero1447 = svdup_n_f32(0); svfloat32_t v1447 = svcmla_f32_x(pred_full, zero1447, v2419, v1417, 90); - svfloat32_t v1548; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1548) : "w"(v1527), "w"(v1547)); - svfloat32_t v1549; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1549) : "w"(v1527), "w"(v1547)); + svfloat32_t v1548 = svadd_f32_x(svptrue_b32(), v1527, v1547); + svfloat32_t v1549 = svsub_f32_x(svptrue_b32(), v1527, v1547); svfloat32_t v1561 = svmla_f32_x(pred_full, v1521, v1541, v2399); svfloat32_t v1579 = svnmls_f32_x(pred_full, v1541, v1521, v2399); svfloat32_t v1689 = svnmls_f32_x(pred_full, v1683, v1656, v2439); svfloat32_t v1709 = svnmls_f32_x(pred_full, v1703, v1669, v2439); - svfloat32_t v1109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1109) : "w"(v1087), "w"(v1108)); + svfloat32_t v1109 = svsub_f32_x(svptrue_b32(), v1087, v1108); svfloat32_t v1137 = svnmls_f32_x(pred_full, v1124, v1081, v2439); - svfloat32_t v1256; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1256) : "w"(v467), "w"(v1224)); - svfloat32_t zero1270; - asm volatile("mov %0.s, #0" : "=w"(zero1270)); + svfloat32_t v1256 = svadd_f32_x(svptrue_b32(), v467, v1224); + svfloat32_t zero1270 = svdup_n_f32(0); svfloat32_t v1270 = svcmla_f32_x(pred_full, zero1270, v2419, v1237, 90); - svfloat32_t zero1285; - asm volatile("mov %0.s, #0" : "=w"(zero1285)); + svfloat32_t zero1285 = svdup_n_f32(0); svfloat32_t v1285 = svcmla_f32_x(pred_full, zero1285, v2419, v1255, 90); svfloat32_t v1393 = svmls_f32_x(pred_full, v475, v1386, v2395); - svfloat32_t v1580; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v1580) : "w"(v481), "w"(v1548)); - svfloat32_t zero1594; - asm volatile("mov %0.s, #0" : "=w"(zero1594)); + svfloat32_t v1580 = svadd_f32_x(svptrue_b32(), v481, v1548); + svfloat32_t zero1594 = svdup_n_f32(0); svfloat32_t v1594 = svcmla_f32_x(pred_full, zero1594, v2419, v1561, 90); - svfloat32_t zero1609; - asm volatile("mov %0.s, #0" : "=w"(zero1609)); + svfloat32_t zero1609 = svdup_n_f32(0); svfloat32_t v1609 = svcmla_f32_x(pred_full, zero1609, v2419, v1579, 90); - svfloat32_t v1710; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1710) : "w"(v1689), "w"(v1709)); - svfloat32_t v1711; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1711) : "w"(v1689), "w"(v1709)); + svfloat32_t v1710 = svadd_f32_x(svptrue_b32(), v1689, v1709); + svfloat32_t v1711 = svsub_f32_x(svptrue_b32(), v1689, v1709); svfloat32_t v1723 = svmla_f32_x(pred_full, v1683, v1703, v2399); svfloat32_t v1741 = svnmls_f32_x(pred_full, v1703, v1683, v2399); svst1_scatter_s64index_f64(pred_full, (double *)(v2170), v2447, @@ -16777,13 +14723,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1231 = svmls_f32_x(pred_full, v467, v1224, v2395); svfloat32_t v1405 = svmls_f32_x(pred_full, v1393, v1387, v2397); svfloat32_t v1555 = svmls_f32_x(pred_full, v481, v1548, v2395); - svfloat32_t v1742; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1742) : "w"(v487), "w"(v1710)); - svfloat32_t zero1756; - asm volatile("mov %0.s, #0" : "=w"(zero1756)); + svfloat32_t v1742 = svadd_f32_x(svptrue_b32(), v487, v1710); + svfloat32_t zero1756 = svdup_n_f32(0); svfloat32_t v1756 = svcmla_f32_x(pred_full, zero1756, v2419, v1723, 90); - svfloat32_t zero1771; - asm volatile("mov %0.s, #0" : "=w"(zero1771)); + svfloat32_t zero1771 = svdup_n_f32(0); svfloat32_t v1771 = svcmla_f32_x(pred_full, zero1771, v2419, v1741, 90); svst1_scatter_s64index_f64(pred_full, (double *)(v2160), v2447, svreinterpret_f64_f32(v1109)); @@ -16795,8 +14738,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const 
armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v1580)); svfloat32_t v1243 = svmls_f32_x(pred_full, v1231, v1225, v2397); svfloat32_t v1411 = svnmls_f32_x(pred_full, v1405, v1393, v2439); - svfloat32_t v1448; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1448) : "w"(v1405), "w"(v1447)); + svfloat32_t v1448 = svsub_f32_x(svptrue_b32(), v1405, v1447); svfloat32_t v1567 = svmls_f32_x(pred_full, v1555, v1549, v2397); svfloat32_t v1717 = svmls_f32_x(pred_full, v487, v1710, v2395); svst1_scatter_s64index_f64(pred_full, (double *)(v2190), v2447, @@ -16804,27 +14746,21 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svst1_scatter_s64index_f64(pred_full, (double *)(v2406), v2447, svreinterpret_f64_f32(v1742)); svfloat32_t v1249 = svnmls_f32_x(pred_full, v1243, v1231, v2439); - svfloat32_t v1286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1286) : "w"(v1243), "w"(v1285)); - svfloat32_t v1433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1433) : "w"(v1411), "w"(v1432)); + svfloat32_t v1286 = svsub_f32_x(svptrue_b32(), v1243, v1285); + svfloat32_t v1433 = svsub_f32_x(svptrue_b32(), v1411, v1432); svfloat32_t v1461 = svnmls_f32_x(pred_full, v1448, v1405, v2439); svfloat32_t v1573 = svnmls_f32_x(pred_full, v1567, v1555, v2439); - svfloat32_t v1610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1610) : "w"(v1567), "w"(v1609)); + svfloat32_t v1610 = svsub_f32_x(svptrue_b32(), v1567, v1609); svfloat32_t v1729 = svmls_f32_x(pred_full, v1717, v1711, v2397); svst1_scatter_s64index_f64(pred_full, (double *)(v2298), v2447, svreinterpret_f64_f32(v1448)); - svfloat32_t v1271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1271) : "w"(v1249), "w"(v1270)); + svfloat32_t v1271 = svsub_f32_x(svptrue_b32(), v1249, v1270); svfloat32_t v1299 = svnmls_f32_x(pred_full, v1286, v1243, v2439); svfloat32_t v1474 = svnmls_f32_x(pred_full, v1433, v1411, v2439); - svfloat32_t v1595; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1595) : "w"(v1573), "w"(v1594)); + svfloat32_t v1595 = svsub_f32_x(svptrue_b32(), 
v1573, v1594); svfloat32_t v1623 = svnmls_f32_x(pred_full, v1610, v1567, v2439); svfloat32_t v1735 = svnmls_f32_x(pred_full, v1729, v1717, v2439); - svfloat32_t v1772; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1772) : "w"(v1729), "w"(v1771)); + svfloat32_t v1772 = svsub_f32_x(svptrue_b32(), v1729, v1771); svst1_scatter_s64index_f64(pred_full, (double *)(v2234), v2447, svreinterpret_f64_f32(v1286)); svst1_scatter_s64index_f64(pred_full, (double *)(v2288), v2447, @@ -16835,8 +14771,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, svreinterpret_f64_f32(v1610)); svfloat32_t v1312 = svnmls_f32_x(pred_full, v1271, v1249, v2439); svfloat32_t v1636 = svnmls_f32_x(pred_full, v1595, v1573, v2439); - svfloat32_t v1757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1757) : "w"(v1735), "w"(v1756)); + svfloat32_t v1757 = svsub_f32_x(svptrue_b32(), v1735, v1756); svfloat32_t v1785 = svnmls_f32_x(pred_full, v1772, v1729, v2439); svst1_scatter_s64index_f64(pred_full, (double *)(v2224), v2447, svreinterpret_f64_f32(v1271)); @@ -16866,1749 +14801,3 @@ void armral_fft_cf32_cf32_cf32_ab_t_gs25(const armral_cmplx_f32_t *restrict x, } } #endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ab_t_gs32(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, int idist, int odist, - float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v407 = v5[istride]; - float v1418 = 7.0710678118654757e-01F; - float v1429 = -7.0710678118654746e-01F; - float v1475 = 5.5557023301960229e-01F; - float v1489 = -1.9509032201612861e-01F; - float v1536 = 9.2387953251128674e-01F; - float v1543 = -9.2387953251128685e-01F; - float v1546 = 3.8268343236508967e-01F; - float v1547 = -3.8268343236508967e-01F; - float v1589 
= 1.9509032201612833e-01F; - float v1592 = -9.8078528040323043e-01F; - float v1593 = 9.8078528040323043e-01F; - float v1600 = -5.5557023301960218e-01F; - float v1603 = 8.3146961230254524e-01F; - float v1604 = -8.3146961230254524e-01F; - float v1614 = -1.0000000000000000e+00F; - float v1615 = 1.0000000000000000e+00F; - float32x2_t v1617 = (float32x2_t){v4, v4}; - float32x2_t v444 = vtrn1_f32(v407, v407); - float32x2_t v445 = vtrn2_f32(v407, v407); - float32x2_t v851 = v5[0]; - float32x2_t v1248 = (float32x2_t){v1593, v1593}; - float32x2_t v1305 = (float32x2_t){v1536, v1536}; - float32x2_t v1309 = (float32x2_t){v1547, v1546}; - float32x2_t v1362 = (float32x2_t){v1603, v1603}; - float32x2_t v1366 = (float32x2_t){v1600, v1475}; - float32x2_t v1373 = (float32x2_t){v1489, v1489}; - float32x2_t v1419 = (float32x2_t){v1418, v1418}; - float32x2_t v1430 = (float32x2_t){v1429, v1429}; - float32x2_t v1434 = (float32x2_t){v1615, v1614}; - float32x2_t v1476 = (float32x2_t){v1475, v1475}; - float32x2_t v1480 = (float32x2_t){v1604, v1603}; - float32x2_t v1487 = (float32x2_t){v1592, v1592}; - float32x2_t v1491 = (float32x2_t){v1489, v1589}; - float32x2_t v1533 = (float32x2_t){v1546, v1546}; - float32x2_t v1537 = (float32x2_t){v1543, v1536}; - float32x2_t v1544 = (float32x2_t){v1543, v1543}; - float32x2_t v1548 = (float32x2_t){v1546, v1547}; - float32x2_t v1590 = (float32x2_t){v1589, v1589}; - float32x2_t v1594 = (float32x2_t){v1592, v1593}; - float32x2_t v1601 = (float32x2_t){v1600, v1600}; - float32x2_t v1605 = (float32x2_t){v1603, v1604}; - float32x2_t v1616 = (float32x2_t){v1614, v1615}; - float32x2_t v20 = v5[istride * 16]; - int64_t v37 = 30 + j * 62; - float32x2_t v51 = v5[istride * 8]; - int64_t v55 = 14 + j * 62; - float32x2_t v69 = v5[istride * 24]; - int64_t v73 = 46 + j * 62; - float32x2_t v87 = v5[istride * 4]; - float32x2_t v105 = v5[istride * 20]; - int64_t v122 = 6 + j * 62; - int64_t v135 = 38 + j * 62; - float32x2_t v149 = v5[istride * 12]; - float32x2_t v167 = 
v5[istride * 28]; - int64_t v184 = 22 + j * 62; - int64_t v197 = 54 + j * 62; - float32x2_t v211 = v5[istride * 2]; - float32x2_t v229 = v5[istride * 18]; - int64_t v246 = 2 + j * 62; - int64_t v259 = 34 + j * 62; - float32x2_t v273 = v5[istride * 10]; - int64_t v277 = 18 + j * 62; - float32x2_t v291 = v5[istride * 26]; - int64_t v295 = 50 + j * 62; - float32x2_t v309 = v5[istride * 6]; - float32x2_t v327 = v5[istride * 22]; - int64_t v344 = 10 + j * 62; - int64_t v357 = 42 + j * 62; - float32x2_t v371 = v5[istride * 14]; - int64_t v375 = 26 + j * 62; - float32x2_t v389 = v5[istride * 30]; - int64_t v393 = 58 + j * 62; - float32x2_t v425 = v5[istride * 17]; - float32x2_t v443 = v7[j * 62]; - int64_t v447 = j * 62 + 1; - int64_t v455 = 32 + j * 62; - float32x2_t v469 = v5[istride * 9]; - int64_t v473 = 16 + j * 62; - float32x2_t v487 = v5[istride * 25]; - int64_t v491 = 48 + j * 62; - float32x2_t v505 = v5[istride * 5]; - float32x2_t v523 = v5[istride * 21]; - int64_t v540 = 8 + j * 62; - int64_t v553 = 40 + j * 62; - float32x2_t v567 = v5[istride * 13]; - float32x2_t v585 = v5[istride * 29]; - int64_t v602 = 24 + j * 62; - int64_t v615 = 56 + j * 62; - float32x2_t v629 = v5[istride * 3]; - float32x2_t v647 = v5[istride * 19]; - int64_t v664 = 4 + j * 62; - int64_t v677 = 36 + j * 62; - float32x2_t v691 = v5[istride * 11]; - int64_t v695 = 20 + j * 62; - float32x2_t v709 = v5[istride * 27]; - int64_t v713 = 52 + j * 62; - float32x2_t v727 = v5[istride * 7]; - float32x2_t v745 = v5[istride * 23]; - int64_t v762 = 12 + j * 62; - int64_t v775 = 44 + j * 62; - float32x2_t v789 = v5[istride * 15]; - float32x2_t v807 = v5[istride * 31]; - int64_t v824 = 28 + j * 62; - int64_t v837 = 60 + j * 62; - float32x2_t v1311 = vmul_f32(v1617, v1309); - float32x2_t v1368 = vmul_f32(v1617, v1366); - float32x2_t v1436 = vmul_f32(v1617, v1434); - float32x2_t v1482 = vmul_f32(v1617, v1480); - float32x2_t v1493 = vmul_f32(v1617, v1491); - float32x2_t v1539 = vmul_f32(v1617, v1537); - 
float32x2_t v1550 = vmul_f32(v1617, v1548); - float32x2_t v1596 = vmul_f32(v1617, v1594); - float32x2_t v1607 = vmul_f32(v1617, v1605); - float32x2_t v1618 = vmul_f32(v1617, v1616); - float32x2_t v38 = v7[v37]; - float32x2_t v39 = vtrn1_f32(v20, v20); - float32x2_t v40 = vtrn2_f32(v20, v20); - int64_t v42 = v37 + 1; - float32x2_t v56 = v7[v55]; - float32x2_t v57 = vtrn1_f32(v51, v51); - float32x2_t v58 = vtrn2_f32(v51, v51); - int64_t v60 = v55 + 1; - float32x2_t v74 = v7[v73]; - float32x2_t v75 = vtrn1_f32(v69, v69); - float32x2_t v76 = vtrn2_f32(v69, v69); - int64_t v78 = v73 + 1; - float32x2_t v123 = v7[v122]; - float32x2_t v124 = vtrn1_f32(v87, v87); - float32x2_t v125 = vtrn2_f32(v87, v87); - int64_t v127 = v122 + 1; - float32x2_t v136 = v7[v135]; - float32x2_t v137 = vtrn1_f32(v105, v105); - float32x2_t v138 = vtrn2_f32(v105, v105); - int64_t v140 = v135 + 1; - float32x2_t v185 = v7[v184]; - float32x2_t v186 = vtrn1_f32(v149, v149); - float32x2_t v187 = vtrn2_f32(v149, v149); - int64_t v189 = v184 + 1; - float32x2_t v198 = v7[v197]; - float32x2_t v199 = vtrn1_f32(v167, v167); - float32x2_t v200 = vtrn2_f32(v167, v167); - int64_t v202 = v197 + 1; - float32x2_t v247 = v7[v246]; - float32x2_t v248 = vtrn1_f32(v211, v211); - float32x2_t v249 = vtrn2_f32(v211, v211); - int64_t v251 = v246 + 1; - float32x2_t v260 = v7[v259]; - float32x2_t v261 = vtrn1_f32(v229, v229); - float32x2_t v262 = vtrn2_f32(v229, v229); - int64_t v264 = v259 + 1; - float32x2_t v278 = v7[v277]; - float32x2_t v279 = vtrn1_f32(v273, v273); - float32x2_t v280 = vtrn2_f32(v273, v273); - int64_t v282 = v277 + 1; - float32x2_t v296 = v7[v295]; - float32x2_t v297 = vtrn1_f32(v291, v291); - float32x2_t v298 = vtrn2_f32(v291, v291); - int64_t v300 = v295 + 1; - float32x2_t v345 = v7[v344]; - float32x2_t v346 = vtrn1_f32(v309, v309); - float32x2_t v347 = vtrn2_f32(v309, v309); - int64_t v349 = v344 + 1; - float32x2_t v358 = v7[v357]; - float32x2_t v359 = vtrn1_f32(v327, v327); - float32x2_t v360 = 
vtrn2_f32(v327, v327); - int64_t v362 = v357 + 1; - float32x2_t v376 = v7[v375]; - float32x2_t v377 = vtrn1_f32(v371, v371); - float32x2_t v378 = vtrn2_f32(v371, v371); - int64_t v380 = v375 + 1; - float32x2_t v394 = v7[v393]; - float32x2_t v395 = vtrn1_f32(v389, v389); - float32x2_t v396 = vtrn2_f32(v389, v389); - int64_t v398 = v393 + 1; - float32x2_t v448 = v7[v447]; - float32x2_t v449 = vmul_f32(v444, v443); - float32x2_t v456 = v7[v455]; - float32x2_t v457 = vtrn1_f32(v425, v425); - float32x2_t v458 = vtrn2_f32(v425, v425); - int64_t v460 = v455 + 1; - float32x2_t v474 = v7[v473]; - float32x2_t v475 = vtrn1_f32(v469, v469); - float32x2_t v476 = vtrn2_f32(v469, v469); - int64_t v478 = v473 + 1; - float32x2_t v492 = v7[v491]; - float32x2_t v493 = vtrn1_f32(v487, v487); - float32x2_t v494 = vtrn2_f32(v487, v487); - int64_t v496 = v491 + 1; - float32x2_t v541 = v7[v540]; - float32x2_t v542 = vtrn1_f32(v505, v505); - float32x2_t v543 = vtrn2_f32(v505, v505); - int64_t v545 = v540 + 1; - float32x2_t v554 = v7[v553]; - float32x2_t v555 = vtrn1_f32(v523, v523); - float32x2_t v556 = vtrn2_f32(v523, v523); - int64_t v558 = v553 + 1; - float32x2_t v603 = v7[v602]; - float32x2_t v604 = vtrn1_f32(v567, v567); - float32x2_t v605 = vtrn2_f32(v567, v567); - int64_t v607 = v602 + 1; - float32x2_t v616 = v7[v615]; - float32x2_t v617 = vtrn1_f32(v585, v585); - float32x2_t v618 = vtrn2_f32(v585, v585); - int64_t v620 = v615 + 1; - float32x2_t v665 = v7[v664]; - float32x2_t v666 = vtrn1_f32(v629, v629); - float32x2_t v667 = vtrn2_f32(v629, v629); - int64_t v669 = v664 + 1; - float32x2_t v678 = v7[v677]; - float32x2_t v679 = vtrn1_f32(v647, v647); - float32x2_t v680 = vtrn2_f32(v647, v647); - int64_t v682 = v677 + 1; - float32x2_t v696 = v7[v695]; - float32x2_t v697 = vtrn1_f32(v691, v691); - float32x2_t v698 = vtrn2_f32(v691, v691); - int64_t v700 = v695 + 1; - float32x2_t v714 = v7[v713]; - float32x2_t v715 = vtrn1_f32(v709, v709); - float32x2_t v716 = vtrn2_f32(v709, v709); - 
int64_t v718 = v713 + 1; - float32x2_t v763 = v7[v762]; - float32x2_t v764 = vtrn1_f32(v727, v727); - float32x2_t v765 = vtrn2_f32(v727, v727); - int64_t v767 = v762 + 1; - float32x2_t v776 = v7[v775]; - float32x2_t v777 = vtrn1_f32(v745, v745); - float32x2_t v778 = vtrn2_f32(v745, v745); - int64_t v780 = v775 + 1; - float32x2_t v825 = v7[v824]; - float32x2_t v826 = vtrn1_f32(v789, v789); - float32x2_t v827 = vtrn2_f32(v789, v789); - int64_t v829 = v824 + 1; - float32x2_t v838 = v7[v837]; - float32x2_t v839 = vtrn1_f32(v807, v807); - float32x2_t v840 = vtrn2_f32(v807, v807); - int64_t v842 = v837 + 1; - float32x2_t v43 = v7[v42]; - float32x2_t v44 = vmul_f32(v39, v38); - float32x2_t v61 = v7[v60]; - float32x2_t v62 = vmul_f32(v57, v56); - float32x2_t v79 = v7[v78]; - float32x2_t v80 = vmul_f32(v75, v74); - float32x2_t v128 = v7[v127]; - float32x2_t v129 = vmul_f32(v124, v123); - float32x2_t v141 = v7[v140]; - float32x2_t v142 = vmul_f32(v137, v136); - float32x2_t v190 = v7[v189]; - float32x2_t v191 = vmul_f32(v186, v185); - float32x2_t v203 = v7[v202]; - float32x2_t v204 = vmul_f32(v199, v198); - float32x2_t v252 = v7[v251]; - float32x2_t v253 = vmul_f32(v248, v247); - float32x2_t v265 = v7[v264]; - float32x2_t v266 = vmul_f32(v261, v260); - float32x2_t v283 = v7[v282]; - float32x2_t v284 = vmul_f32(v279, v278); - float32x2_t v301 = v7[v300]; - float32x2_t v302 = vmul_f32(v297, v296); - float32x2_t v350 = v7[v349]; - float32x2_t v351 = vmul_f32(v346, v345); - float32x2_t v363 = v7[v362]; - float32x2_t v364 = vmul_f32(v359, v358); - float32x2_t v381 = v7[v380]; - float32x2_t v382 = vmul_f32(v377, v376); - float32x2_t v399 = v7[v398]; - float32x2_t v400 = vmul_f32(v395, v394); - float32x2_t v461 = v7[v460]; - float32x2_t v462 = vmul_f32(v457, v456); - float32x2_t v479 = v7[v478]; - float32x2_t v480 = vmul_f32(v475, v474); - float32x2_t v497 = v7[v496]; - float32x2_t v498 = vmul_f32(v493, v492); - float32x2_t v546 = v7[v545]; - float32x2_t v547 = vmul_f32(v542, v541); 
- float32x2_t v559 = v7[v558]; - float32x2_t v560 = vmul_f32(v555, v554); - float32x2_t v608 = v7[v607]; - float32x2_t v609 = vmul_f32(v604, v603); - float32x2_t v621 = v7[v620]; - float32x2_t v622 = vmul_f32(v617, v616); - float32x2_t v670 = v7[v669]; - float32x2_t v671 = vmul_f32(v666, v665); - float32x2_t v683 = v7[v682]; - float32x2_t v684 = vmul_f32(v679, v678); - float32x2_t v701 = v7[v700]; - float32x2_t v702 = vmul_f32(v697, v696); - float32x2_t v719 = v7[v718]; - float32x2_t v720 = vmul_f32(v715, v714); - float32x2_t v768 = v7[v767]; - float32x2_t v769 = vmul_f32(v764, v763); - float32x2_t v781 = v7[v780]; - float32x2_t v782 = vmul_f32(v777, v776); - float32x2_t v830 = v7[v829]; - float32x2_t v831 = vmul_f32(v826, v825); - float32x2_t v843 = v7[v842]; - float32x2_t v844 = vmul_f32(v839, v838); - float32x2_t v451 = vfma_f32(v449, v445, v448); - float32x2_t v46 = vfma_f32(v44, v40, v43); - float32x2_t v64 = vfma_f32(v62, v58, v61); - float32x2_t v82 = vfma_f32(v80, v76, v79); - float32x2_t v131 = vfma_f32(v129, v125, v128); - float32x2_t v144 = vfma_f32(v142, v138, v141); - float32x2_t v193 = vfma_f32(v191, v187, v190); - float32x2_t v206 = vfma_f32(v204, v200, v203); - float32x2_t v255 = vfma_f32(v253, v249, v252); - float32x2_t v268 = vfma_f32(v266, v262, v265); - float32x2_t v286 = vfma_f32(v284, v280, v283); - float32x2_t v304 = vfma_f32(v302, v298, v301); - float32x2_t v353 = vfma_f32(v351, v347, v350); - float32x2_t v366 = vfma_f32(v364, v360, v363); - float32x2_t v384 = vfma_f32(v382, v378, v381); - float32x2_t v402 = vfma_f32(v400, v396, v399); - float32x2_t v464 = vfma_f32(v462, v458, v461); - float32x2_t v482 = vfma_f32(v480, v476, v479); - float32x2_t v500 = vfma_f32(v498, v494, v497); - float32x2_t v549 = vfma_f32(v547, v543, v546); - float32x2_t v562 = vfma_f32(v560, v556, v559); - float32x2_t v611 = vfma_f32(v609, v605, v608); - float32x2_t v624 = vfma_f32(v622, v618, v621); - float32x2_t v673 = vfma_f32(v671, v667, v670); - float32x2_t v686 = 
vfma_f32(v684, v680, v683); - float32x2_t v704 = vfma_f32(v702, v698, v701); - float32x2_t v722 = vfma_f32(v720, v716, v719); - float32x2_t v771 = vfma_f32(v769, v765, v768); - float32x2_t v784 = vfma_f32(v782, v778, v781); - float32x2_t v833 = vfma_f32(v831, v827, v830); - float32x2_t v846 = vfma_f32(v844, v840, v843); - float32x2_t v852 = vadd_f32(v851, v46); - float32x2_t v853 = vsub_f32(v851, v46); - float32x2_t v854 = vadd_f32(v64, v82); - float32x2_t v855 = vsub_f32(v64, v82); - float32x2_t v867 = vadd_f32(v131, v144); - float32x2_t v868 = vsub_f32(v131, v144); - float32x2_t v869 = vadd_f32(v193, v206); - float32x2_t v870 = vsub_f32(v193, v206); - float32x2_t v921 = vadd_f32(v255, v268); - float32x2_t v922 = vsub_f32(v255, v268); - float32x2_t v923 = vadd_f32(v286, v304); - float32x2_t v924 = vsub_f32(v286, v304); - float32x2_t v936 = vadd_f32(v353, v366); - float32x2_t v937 = vsub_f32(v353, v366); - float32x2_t v938 = vadd_f32(v384, v402); - float32x2_t v939 = vsub_f32(v384, v402); - float32x2_t v1075 = vadd_f32(v451, v464); - float32x2_t v1076 = vsub_f32(v451, v464); - float32x2_t v1077 = vadd_f32(v482, v500); - float32x2_t v1078 = vsub_f32(v482, v500); - float32x2_t v1090 = vadd_f32(v549, v562); - float32x2_t v1091 = vsub_f32(v549, v562); - float32x2_t v1092 = vadd_f32(v611, v624); - float32x2_t v1093 = vsub_f32(v611, v624); - float32x2_t v1144 = vadd_f32(v673, v686); - float32x2_t v1145 = vsub_f32(v673, v686); - float32x2_t v1146 = vadd_f32(v704, v722); - float32x2_t v1147 = vsub_f32(v704, v722); - float32x2_t v1159 = vadd_f32(v771, v784); - float32x2_t v1160 = vsub_f32(v771, v784); - float32x2_t v1161 = vadd_f32(v833, v846); - float32x2_t v1162 = vsub_f32(v833, v846); - float32x2_t v861 = vrev64_f32(v855); - float32x2_t v863 = vadd_f32(v852, v854); - float32x2_t v864 = vsub_f32(v852, v854); - float32x2_t v871 = vadd_f32(v867, v869); - float32x2_t v872 = vsub_f32(v867, v869); - float32x2_t v887 = vmul_f32(v868, v1419); - float32x2_t v898 = vmul_f32(v870, 
v1430); - float32x2_t v930 = vrev64_f32(v924); - float32x2_t v932 = vadd_f32(v921, v923); - float32x2_t v933 = vsub_f32(v921, v923); - float32x2_t v945 = vrev64_f32(v939); - float32x2_t v947 = vadd_f32(v936, v938); - float32x2_t v948 = vsub_f32(v936, v938); - float32x2_t v1084 = vrev64_f32(v1078); - float32x2_t v1086 = vadd_f32(v1075, v1077); - float32x2_t v1087 = vsub_f32(v1075, v1077); - float32x2_t v1094 = vadd_f32(v1090, v1092); - float32x2_t v1095 = vsub_f32(v1090, v1092); - float32x2_t v1110 = vmul_f32(v1091, v1419); - float32x2_t v1121 = vmul_f32(v1093, v1430); - float32x2_t v1153 = vrev64_f32(v1147); - float32x2_t v1155 = vadd_f32(v1144, v1146); - float32x2_t v1156 = vsub_f32(v1144, v1146); - float32x2_t v1163 = vadd_f32(v1159, v1161); - float32x2_t v1164 = vsub_f32(v1159, v1161); - float32x2_t v1179 = vmul_f32(v1160, v1419); - float32x2_t v1190 = vmul_f32(v1162, v1430); - float32x2_t v862 = vmul_f32(v861, v1436); - float32x2_t v878 = vrev64_f32(v872); - float32x2_t v880 = vadd_f32(v863, v871); - float32x2_t v881 = vsub_f32(v863, v871); - float32x2_t v893 = vrev64_f32(v887); - float32x2_t v904 = vrev64_f32(v898); - float32x2_t v931 = vmul_f32(v930, v1436); - float32x2_t v946 = vmul_f32(v945, v1436); - float32x2_t v951 = vadd_f32(v932, v947); - float32x2_t v952 = vsub_f32(v932, v947); - float32x2_t v1004 = vmul_f32(v933, v1419); - float32x2_t v1015 = vmul_f32(v948, v1430); - float32x2_t v1085 = vmul_f32(v1084, v1436); - float32x2_t v1101 = vrev64_f32(v1095); - float32x2_t v1103 = vadd_f32(v1086, v1094); - float32x2_t v1104 = vsub_f32(v1086, v1094); - float32x2_t v1116 = vrev64_f32(v1110); - float32x2_t v1127 = vrev64_f32(v1121); - float32x2_t v1154 = vmul_f32(v1153, v1436); - float32x2_t v1170 = vrev64_f32(v1164); - float32x2_t v1172 = vadd_f32(v1155, v1163); - float32x2_t v1173 = vsub_f32(v1155, v1163); - float32x2_t v1185 = vrev64_f32(v1179); - float32x2_t v1196 = vrev64_f32(v1190); - float32x2_t v865 = vsub_f32(v853, v862); - float32x2_t v866 = 
vadd_f32(v853, v862); - float32x2_t v879 = vmul_f32(v878, v1436); - float32x2_t v894 = vmul_f32(v893, v1618); - float32x2_t v905 = vmul_f32(v904, v1436); - float32x2_t v934 = vsub_f32(v922, v931); - float32x2_t v935 = vadd_f32(v922, v931); - float32x2_t v949 = vsub_f32(v937, v946); - float32x2_t v950 = vadd_f32(v937, v946); - float32x2_t v958 = vrev64_f32(v952); - float32x2_t v960 = vadd_f32(v880, v951); - float32x2_t v961 = vsub_f32(v880, v951); - float32x2_t v1010 = vrev64_f32(v1004); - float32x2_t v1021 = vrev64_f32(v1015); - float32x2_t v1088 = vsub_f32(v1076, v1085); - float32x2_t v1089 = vadd_f32(v1076, v1085); - float32x2_t v1102 = vmul_f32(v1101, v1436); - float32x2_t v1117 = vmul_f32(v1116, v1618); - float32x2_t v1128 = vmul_f32(v1127, v1436); - float32x2_t v1157 = vsub_f32(v1145, v1154); - float32x2_t v1158 = vadd_f32(v1145, v1154); - float32x2_t v1171 = vmul_f32(v1170, v1436); - float32x2_t v1186 = vmul_f32(v1185, v1618); - float32x2_t v1197 = vmul_f32(v1196, v1436); - float32x2_t v1213 = vadd_f32(v1103, v1172); - float32x2_t v1214 = vsub_f32(v1103, v1172); - float32x2_t v1420 = vmul_f32(v1104, v1419); - float32x2_t v1431 = vmul_f32(v1173, v1430); - float32x2_t v882 = vsub_f32(v864, v879); - float32x2_t v883 = vadd_f32(v864, v879); - float32x2_t v906 = vadd_f32(v887, v894); - float32x2_t v907 = vadd_f32(v898, v905); - float32x2_t v959 = vmul_f32(v958, v1436); - float32x2_t v967 = vmul_f32(v934, v1305); - float32x2_t v973 = vrev64_f32(v934); - float32x2_t v978 = vmul_f32(v949, v1533); - float32x2_t v984 = vrev64_f32(v949); - float32x2_t v1011 = vmul_f32(v1010, v1618); - float32x2_t v1022 = vmul_f32(v1021, v1436); - float32x2_t v1041 = vmul_f32(v935, v1533); - float32x2_t v1047 = vrev64_f32(v935); - float32x2_t v1052 = vmul_f32(v950, v1544); - float32x2_t v1058 = vrev64_f32(v950); - float32x2_t v1105 = vsub_f32(v1087, v1102); - float32x2_t v1106 = vadd_f32(v1087, v1102); - float32x2_t v1129 = vadd_f32(v1110, v1117); - float32x2_t v1130 = vadd_f32(v1121, 
v1128); - float32x2_t v1174 = vsub_f32(v1156, v1171); - float32x2_t v1175 = vadd_f32(v1156, v1171); - float32x2_t v1198 = vadd_f32(v1179, v1186); - float32x2_t v1199 = vadd_f32(v1190, v1197); - float32x2_t v1220 = vrev64_f32(v1214); - float32x2_t v1222 = vadd_f32(v960, v1213); - float32x2_t v1223 = vsub_f32(v960, v1213); - float32x2_t v1426 = vrev64_f32(v1420); - float32x2_t v1437 = vrev64_f32(v1431); - float32x2_t v908 = vadd_f32(v906, v907); - float32x2_t v909 = vsub_f32(v907, v906); - float32x2_t v962 = vsub_f32(v881, v959); - float32x2_t v963 = vadd_f32(v881, v959); - float32x2_t v1023 = vadd_f32(v1004, v1011); - float32x2_t v1024 = vadd_f32(v1015, v1022); - float32x2_t v1131 = vadd_f32(v1129, v1130); - float32x2_t v1132 = vsub_f32(v1130, v1129); - float32x2_t v1200 = vadd_f32(v1198, v1199); - float32x2_t v1201 = vsub_f32(v1199, v1198); - float32x2_t v1221 = vmul_f32(v1220, v1436); - v6[0] = v1222; - v6[ostride * 16] = v1223; - float32x2_t v1306 = vmul_f32(v1105, v1305); - float32x2_t v1312 = vrev64_f32(v1105); - float32x2_t v1317 = vmul_f32(v1174, v1533); - float32x2_t v1323 = vrev64_f32(v1174); - float32x2_t v1427 = vmul_f32(v1426, v1618); - float32x2_t v1438 = vmul_f32(v1437, v1436); - float32x2_t v1534 = vmul_f32(v1106, v1533); - float32x2_t v1540 = vrev64_f32(v1106); - float32x2_t v1545 = vmul_f32(v1175, v1544); - float32x2_t v1551 = vrev64_f32(v1175); - float32x2_t v915 = vrev64_f32(v909); - float32x2_t v917 = vadd_f32(v865, v908); - float32x2_t v918 = vsub_f32(v865, v908); - float32x2_t v986 = vfma_f32(v967, v973, v1311); - float32x2_t v987 = vfma_f32(v978, v984, v1539); - float32x2_t v1025 = vadd_f32(v1023, v1024); - float32x2_t v1026 = vsub_f32(v1024, v1023); - float32x2_t v1060 = vfma_f32(v1041, v1047, v1539); - float32x2_t v1061 = vfma_f32(v1052, v1058, v1550); - float32x2_t v1138 = vrev64_f32(v1132); - float32x2_t v1140 = vadd_f32(v1088, v1131); - float32x2_t v1141 = vsub_f32(v1088, v1131); - float32x2_t v1207 = vrev64_f32(v1201); - float32x2_t 
v1209 = vadd_f32(v1157, v1200); - float32x2_t v1210 = vsub_f32(v1157, v1200); - float32x2_t v1224 = vsub_f32(v961, v1221); - float32x2_t v1225 = vadd_f32(v961, v1221); - float32x2_t v1439 = vadd_f32(v1420, v1427); - float32x2_t v1440 = vadd_f32(v1431, v1438); - float32x2_t v916 = vmul_f32(v915, v1618); - float32x2_t v988 = vadd_f32(v986, v987); - float32x2_t v989 = vsub_f32(v987, v986); - float32x2_t v1032 = vrev64_f32(v1026); - float32x2_t v1034 = vadd_f32(v882, v1025); - float32x2_t v1035 = vsub_f32(v882, v1025); - float32x2_t v1062 = vadd_f32(v1060, v1061); - float32x2_t v1063 = vsub_f32(v1061, v1060); - float32x2_t v1139 = vmul_f32(v1138, v1618); - float32x2_t v1208 = vmul_f32(v1207, v1618); - v6[ostride * 8] = v1224; - v6[ostride * 24] = v1225; - float32x2_t v1249 = vmul_f32(v1140, v1248); - float32x2_t v1255 = vrev64_f32(v1140); - float32x2_t v1260 = vmul_f32(v1209, v1362); - float32x2_t v1266 = vrev64_f32(v1209); - float32x2_t v1325 = vfma_f32(v1306, v1312, v1311); - float32x2_t v1326 = vfma_f32(v1317, v1323, v1539); - float32x2_t v1441 = vadd_f32(v1439, v1440); - float32x2_t v1442 = vsub_f32(v1440, v1439); - float32x2_t v1477 = vmul_f32(v1141, v1476); - float32x2_t v1483 = vrev64_f32(v1141); - float32x2_t v1488 = vmul_f32(v1210, v1487); - float32x2_t v1494 = vrev64_f32(v1210); - float32x2_t v1553 = vfma_f32(v1534, v1540, v1539); - float32x2_t v1554 = vfma_f32(v1545, v1551, v1550); - float32x2_t v919 = vsub_f32(v866, v916); - float32x2_t v920 = vadd_f32(v866, v916); - float32x2_t v995 = vrev64_f32(v989); - float32x2_t v997 = vadd_f32(v917, v988); - float32x2_t v998 = vsub_f32(v917, v988); - float32x2_t v1033 = vmul_f32(v1032, v1618); - float32x2_t v1069 = vrev64_f32(v1063); - float32x2_t v1142 = vsub_f32(v1089, v1139); - float32x2_t v1143 = vadd_f32(v1089, v1139); - float32x2_t v1211 = vsub_f32(v1158, v1208); - float32x2_t v1212 = vadd_f32(v1158, v1208); - float32x2_t v1327 = vadd_f32(v1325, v1326); - float32x2_t v1328 = vsub_f32(v1326, v1325); - float32x2_t 
v1448 = vrev64_f32(v1442); - float32x2_t v1450 = vadd_f32(v962, v1441); - float32x2_t v1451 = vsub_f32(v962, v1441); - float32x2_t v1555 = vadd_f32(v1553, v1554); - float32x2_t v1556 = vsub_f32(v1554, v1553); - float32x2_t v996 = vmul_f32(v995, v1618); - float32x2_t v1036 = vsub_f32(v883, v1033); - float32x2_t v1037 = vadd_f32(v883, v1033); - float32x2_t v1070 = vmul_f32(v1069, v1618); - float32x2_t v1071 = vadd_f32(v919, v1062); - float32x2_t v1072 = vsub_f32(v919, v1062); - float32x2_t v1268 = vfma_f32(v1249, v1255, v1493); - float32x2_t v1269 = vfma_f32(v1260, v1266, v1368); - float32x2_t v1334 = vrev64_f32(v1328); - float32x2_t v1336 = vadd_f32(v1034, v1327); - float32x2_t v1337 = vsub_f32(v1034, v1327); - float32x2_t v1363 = vmul_f32(v1142, v1362); - float32x2_t v1369 = vrev64_f32(v1142); - float32x2_t v1374 = vmul_f32(v1211, v1373); - float32x2_t v1380 = vrev64_f32(v1211); - float32x2_t v1449 = vmul_f32(v1448, v1618); - v6[ostride * 4] = v1450; - v6[ostride * 20] = v1451; - float32x2_t v1496 = vfma_f32(v1477, v1483, v1482); - float32x2_t v1497 = vfma_f32(v1488, v1494, v1493); - float32x2_t v1562 = vrev64_f32(v1556); - float32x2_t v1591 = vmul_f32(v1143, v1590); - float32x2_t v1597 = vrev64_f32(v1143); - float32x2_t v1602 = vmul_f32(v1212, v1601); - float32x2_t v1608 = vrev64_f32(v1212); - float32x2_t v999 = vsub_f32(v918, v996); - float32x2_t v1000 = vadd_f32(v918, v996); - float32x2_t v1073 = vsub_f32(v920, v1070); - float32x2_t v1074 = vadd_f32(v920, v1070); - float32x2_t v1270 = vadd_f32(v1268, v1269); - float32x2_t v1271 = vsub_f32(v1269, v1268); - float32x2_t v1335 = vmul_f32(v1334, v1618); - v6[ostride * 2] = v1336; - v6[ostride * 18] = v1337; - float32x2_t v1452 = vsub_f32(v963, v1449); - float32x2_t v1453 = vadd_f32(v963, v1449); - float32x2_t v1498 = vadd_f32(v1496, v1497); - float32x2_t v1499 = vsub_f32(v1497, v1496); - float32x2_t v1563 = vmul_f32(v1562, v1618); - float32x2_t v1564 = vadd_f32(v1036, v1555); - float32x2_t v1565 = vsub_f32(v1036, 
v1555); - float32x2_t v1277 = vrev64_f32(v1271); - float32x2_t v1279 = vadd_f32(v997, v1270); - float32x2_t v1280 = vsub_f32(v997, v1270); - float32x2_t v1338 = vsub_f32(v1035, v1335); - float32x2_t v1339 = vadd_f32(v1035, v1335); - float32x2_t v1382 = vfma_f32(v1363, v1369, v1368); - float32x2_t v1383 = vfma_f32(v1374, v1380, v1596); - v6[ostride * 12] = v1452; - v6[ostride * 28] = v1453; - float32x2_t v1505 = vrev64_f32(v1499); - float32x2_t v1507 = vadd_f32(v999, v1498); - float32x2_t v1508 = vsub_f32(v999, v1498); - float32x2_t v1566 = vsub_f32(v1037, v1563); - float32x2_t v1567 = vadd_f32(v1037, v1563); - v6[ostride * 6] = v1564; - v6[ostride * 22] = v1565; - float32x2_t v1610 = vfma_f32(v1591, v1597, v1596); - float32x2_t v1611 = vfma_f32(v1602, v1608, v1607); - float32x2_t v1278 = vmul_f32(v1277, v1618); - v6[ostride] = v1279; - v6[ostride * 17] = v1280; - v6[ostride * 10] = v1338; - v6[ostride * 26] = v1339; - float32x2_t v1384 = vadd_f32(v1382, v1383); - float32x2_t v1385 = vsub_f32(v1383, v1382); - float32x2_t v1506 = vmul_f32(v1505, v1618); - v6[ostride * 5] = v1507; - v6[ostride * 21] = v1508; - v6[ostride * 14] = v1566; - v6[ostride * 30] = v1567; - float32x2_t v1612 = vadd_f32(v1610, v1611); - float32x2_t v1613 = vsub_f32(v1611, v1610); - float32x2_t v1281 = vsub_f32(v998, v1278); - float32x2_t v1282 = vadd_f32(v998, v1278); - float32x2_t v1391 = vrev64_f32(v1385); - float32x2_t v1393 = vadd_f32(v1071, v1384); - float32x2_t v1394 = vsub_f32(v1071, v1384); - float32x2_t v1509 = vsub_f32(v1000, v1506); - float32x2_t v1510 = vadd_f32(v1000, v1506); - float32x2_t v1619 = vrev64_f32(v1613); - float32x2_t v1621 = vadd_f32(v1073, v1612); - float32x2_t v1622 = vsub_f32(v1073, v1612); - v6[ostride * 9] = v1281; - v6[ostride * 25] = v1282; - float32x2_t v1392 = vmul_f32(v1391, v1618); - v6[ostride * 3] = v1393; - v6[ostride * 19] = v1394; - v6[ostride * 13] = v1509; - v6[ostride * 29] = v1510; - float32x2_t v1620 = vmul_f32(v1619, v1618); - v6[ostride * 7] = 
v1621; - v6[ostride * 23] = v1622; - float32x2_t v1395 = vsub_f32(v1072, v1392); - float32x2_t v1396 = vadd_f32(v1072, v1392); - float32x2_t v1623 = vsub_f32(v1074, v1620); - float32x2_t v1624 = vadd_f32(v1074, v1620); - v6[ostride * 11] = v1395; - v6[ostride * 27] = v1396; - v6[ostride * 15] = v1623; - v6[ostride * 31] = v1624; - v5 += 1 * idist; - v6 += 1 * odist; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ab_t_gs32(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, int idist, int odist, - float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - int64_t v3 = odist; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * v3; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - int64_t v13 = j; - float v1167 = -1.9509032201612819e-01F; - float v1222 = 7.0710678118654757e-01F; - float v1234 = -7.0710678118654746e-01F; - float v1239 = -1.0000000000000000e+00F; - float v1289 = 5.5557023301960229e-01F; - float v1294 = 8.3146961230254524e-01F; - float v1301 = -9.8078528040323043e-01F; - float v1356 = 3.8268343236508984e-01F; - float v1361 = 9.2387953251128674e-01F; - float v1368 = -9.2387953251128685e-01F; - float v1373 = -3.8268343236508967e-01F; - float v1423 = 1.9509032201612833e-01F; - float v1428 = 9.8078528040323043e-01F; - float v1435 = -5.5557023301960218e-01F; - float v1440 = -8.3146961230254524e-01F; - float v1451 = 1.0000000000000000e+00F; - const float32x2_t *v1630 = &v5[v0]; - float32x2_t *v1868 = &v6[v2]; - int64_t v19 = v0 * 16; - int64_t v34 = v10 * 15; - int64_t v40 = v0 * 8; - int64_t v48 = v10 * 7; - int64_t v54 = v0 * 24; - int64_t v62 = v10 * 23; - 
int64_t v68 = v0 * 4; - int64_t v82 = v0 * 20; - int64_t v97 = v10 * 3; - int64_t v104 = v10 * 19; - int64_t v110 = v0 * 12; - int64_t v124 = v0 * 28; - int64_t v139 = v10 * 11; - int64_t v146 = v10 * 27; - int64_t v152 = v0 * 2; - int64_t v166 = v0 * 18; - int64_t v188 = v10 * 17; - int64_t v194 = v0 * 10; - int64_t v202 = v10 * 9; - int64_t v208 = v0 * 26; - int64_t v216 = v10 * 25; - int64_t v222 = v0 * 6; - int64_t v236 = v0 * 22; - int64_t v251 = v10 * 5; - int64_t v258 = v10 * 21; - int64_t v264 = v0 * 14; - int64_t v272 = v10 * 13; - int64_t v278 = v0 * 30; - int64_t v286 = v10 * 29; - int64_t v306 = v0 * 17; - int64_t v328 = v10 * 16; - int64_t v334 = v0 * 9; - int64_t v342 = v10 * 8; - int64_t v348 = v0 * 25; - int64_t v356 = v10 * 24; - int64_t v362 = v0 * 5; - int64_t v376 = v0 * 21; - int64_t v391 = v10 * 4; - int64_t v398 = v10 * 20; - int64_t v404 = v0 * 13; - int64_t v418 = v0 * 29; - int64_t v433 = v10 * 12; - int64_t v440 = v10 * 28; - int64_t v446 = v0 * 3; - int64_t v460 = v0 * 19; - int64_t v475 = v10 * 2; - int64_t v482 = v10 * 18; - int64_t v488 = v0 * 11; - int64_t v496 = v10 * 10; - int64_t v502 = v0 * 27; - int64_t v510 = v10 * 26; - int64_t v516 = v0 * 7; - int64_t v530 = v0 * 23; - int64_t v545 = v10 * 6; - int64_t v552 = v10 * 22; - int64_t v558 = v0 * 15; - int64_t v572 = v0 * 31; - int64_t v587 = v10 * 14; - int64_t v594 = v10 * 30; - int64_t v595 = v13 * 31; - int64_t v1000 = v2 * 8; - int64_t v1007 = v2 * 16; - int64_t v1014 = v2 * 24; - int64_t v1067 = v2 * 9; - int64_t v1074 = v2 * 17; - int64_t v1081 = v2 * 25; - float v1096 = v4 * v1356; - int64_t v1127 = v2 * 2; - int64_t v1134 = v2 * 10; - int64_t v1141 = v2 * 18; - int64_t v1148 = v2 * 26; - float v1163 = v4 * v1289; - int64_t v1194 = v2 * 3; - int64_t v1201 = v2 * 11; - int64_t v1208 = v2 * 19; - int64_t v1215 = v2 * 27; - float v1242 = v4 * v1239; - int64_t v1261 = v2 * 4; - int64_t v1268 = v2 * 12; - int64_t v1275 = v2 * 20; - int64_t v1282 = v2 * 28; - float v1297 = v4 * 
v1294; - float v1309 = v4 * v1423; - int64_t v1328 = v2 * 5; - int64_t v1335 = v2 * 13; - int64_t v1342 = v2 * 21; - int64_t v1349 = v2 * 29; - float v1364 = v4 * v1361; - float v1376 = v4 * v1373; - int64_t v1395 = v2 * 6; - int64_t v1402 = v2 * 14; - int64_t v1409 = v2 * 22; - int64_t v1416 = v2 * 30; - float v1431 = v4 * v1428; - float v1443 = v4 * v1440; - float v1454 = v4 * v1451; - int64_t v1462 = v2 * 7; - int64_t v1469 = v2 * 15; - int64_t v1476 = v2 * 23; - int64_t v1483 = v2 * 31; - const float32x2_t *v1777 = &v5[0]; - svint64_t v1778 = svindex_s64(0, v1); - float32x2_t *v1827 = &v6[0]; - svfloat32_t v1857 = svdup_n_f32(v1428); - svfloat32_t v1898 = svdup_n_f32(v1361); - svfloat32_t v1939 = svdup_n_f32(v1294); - svfloat32_t v1941 = svdup_n_f32(v1167); - svfloat32_t v1980 = svdup_n_f32(v1222); - svfloat32_t v1982 = svdup_n_f32(v1234); - svfloat32_t v2021 = svdup_n_f32(v1289); - svfloat32_t v2023 = svdup_n_f32(v1301); - svfloat32_t v2062 = svdup_n_f32(v1356); - svfloat32_t v2064 = svdup_n_f32(v1368); - svfloat32_t v2103 = svdup_n_f32(v1423); - svfloat32_t v2105 = svdup_n_f32(v1435); - svint64_t v2142 = svindex_s64(0, v3); - int64_t v36 = v34 + v595; - int64_t v50 = v48 + v595; - int64_t v64 = v62 + v595; - int64_t v99 = v97 + v595; - int64_t v106 = v104 + v595; - int64_t v141 = v139 + v595; - int64_t v148 = v146 + v595; - int64_t v183 = v10 + v595; - int64_t v190 = v188 + v595; - int64_t v204 = v202 + v595; - int64_t v218 = v216 + v595; - int64_t v253 = v251 + v595; - int64_t v260 = v258 + v595; - int64_t v274 = v272 + v595; - int64_t v288 = v286 + v595; - svfloat32_t v324 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v595])); - int64_t v330 = v328 + v595; - int64_t v344 = v342 + v595; - int64_t v358 = v356 + v595; - int64_t v393 = v391 + v595; - int64_t v400 = v398 + v595; - int64_t v435 = v433 + v595; - int64_t v442 = v440 + v595; - int64_t v477 = v475 + v595; - int64_t v484 = v482 + v595; - int64_t v498 = v496 + v595; - int64_t 
v512 = v510 + v595; - int64_t v547 = v545 + v595; - int64_t v554 = v552 + v595; - int64_t v589 = v587 + v595; - int64_t v596 = v594 + v595; - const float32x2_t *v1495 = &v5[v19]; - const float32x2_t *v1504 = &v5[v40]; - const float32x2_t *v1513 = &v5[v54]; - const float32x2_t *v1522 = &v5[v68]; - const float32x2_t *v1531 = &v5[v82]; - const float32x2_t *v1540 = &v5[v110]; - const float32x2_t *v1549 = &v5[v124]; - const float32x2_t *v1558 = &v5[v152]; - const float32x2_t *v1567 = &v5[v166]; - const float32x2_t *v1576 = &v5[v194]; - const float32x2_t *v1585 = &v5[v208]; - const float32x2_t *v1594 = &v5[v222]; - const float32x2_t *v1603 = &v5[v236]; - const float32x2_t *v1612 = &v5[v264]; - const float32x2_t *v1621 = &v5[v278]; - svfloat32_t v1632 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1630), v1778)); - const float32x2_t *v1640 = &v5[v306]; - const float32x2_t *v1650 = &v5[v334]; - const float32x2_t *v1659 = &v5[v348]; - const float32x2_t *v1668 = &v5[v362]; - const float32x2_t *v1677 = &v5[v376]; - const float32x2_t *v1686 = &v5[v404]; - const float32x2_t *v1695 = &v5[v418]; - const float32x2_t *v1704 = &v5[v446]; - const float32x2_t *v1713 = &v5[v460]; - const float32x2_t *v1722 = &v5[v488]; - const float32x2_t *v1731 = &v5[v502]; - const float32x2_t *v1740 = &v5[v516]; - const float32x2_t *v1749 = &v5[v530]; - const float32x2_t *v1758 = &v5[v558]; - const float32x2_t *v1767 = &v5[v572]; - svfloat32_t v1779 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1777), v1778)); - float32x2_t *v1836 = &v6[v1000]; - float32x2_t *v1845 = &v6[v1007]; - float32x2_t *v1854 = &v6[v1014]; - float32x2_t *v1877 = &v6[v1067]; - float32x2_t *v1886 = &v6[v1074]; - float32x2_t *v1895 = &v6[v1081]; - svfloat32_t v1899 = svdup_n_f32(v1096); - float32x2_t *v1909 = &v6[v1127]; - float32x2_t *v1918 = &v6[v1134]; - float32x2_t *v1927 = &v6[v1141]; - float32x2_t *v1936 = &v6[v1148]; - svfloat32_t v1940 = 
svdup_n_f32(v1163); - float32x2_t *v1950 = &v6[v1194]; - float32x2_t *v1959 = &v6[v1201]; - float32x2_t *v1968 = &v6[v1208]; - float32x2_t *v1977 = &v6[v1215]; - svfloat32_t v1983 = svdup_n_f32(v1242); - float32x2_t *v1991 = &v6[v1261]; - float32x2_t *v2000 = &v6[v1268]; - float32x2_t *v2009 = &v6[v1275]; - float32x2_t *v2018 = &v6[v1282]; - svfloat32_t v2022 = svdup_n_f32(v1297); - svfloat32_t v2024 = svdup_n_f32(v1309); - float32x2_t *v2032 = &v6[v1328]; - float32x2_t *v2041 = &v6[v1335]; - float32x2_t *v2050 = &v6[v1342]; - float32x2_t *v2059 = &v6[v1349]; - svfloat32_t v2063 = svdup_n_f32(v1364); - svfloat32_t v2065 = svdup_n_f32(v1376); - float32x2_t *v2073 = &v6[v1395]; - float32x2_t *v2082 = &v6[v1402]; - float32x2_t *v2091 = &v6[v1409]; - float32x2_t *v2100 = &v6[v1416]; - svfloat32_t v2104 = svdup_n_f32(v1431); - svfloat32_t v2106 = svdup_n_f32(v1443); - svfloat32_t v2107 = svdup_n_f32(v1454); - float32x2_t *v2114 = &v6[v1462]; - float32x2_t *v2123 = &v6[v1469]; - float32x2_t *v2132 = &v6[v1476]; - float32x2_t *v2141 = &v6[v1483]; - svfloat32_t v37 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); - svfloat32_t v51 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v50])); - svfloat32_t v65 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v64])); - svfloat32_t v100 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v99])); - svfloat32_t v107 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v106])); - svfloat32_t v142 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v141])); - svfloat32_t v149 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v148])); - svfloat32_t v184 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v183])); - svfloat32_t v191 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v190])); - svfloat32_t v205 = svreinterpret_f32_f64( - svld1_f64(pred_full, 
&((const double *)v7)[v204])); - svfloat32_t v219 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v218])); - svfloat32_t v254 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v253])); - svfloat32_t v261 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v260])); - svfloat32_t v275 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v274])); - svfloat32_t v289 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v288])); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); - svfloat32_t v325 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero325, v1632, v324, 0), v1632, - v324, 90); - svfloat32_t v331 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v330])); - svfloat32_t v345 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v344])); - svfloat32_t v359 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v358])); - svfloat32_t v394 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v393])); - svfloat32_t v401 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v400])); - svfloat32_t v436 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v435])); - svfloat32_t v443 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v442])); - svfloat32_t v478 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v477])); - svfloat32_t v485 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v484])); - svfloat32_t v499 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v498])); - svfloat32_t v513 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v512])); - svfloat32_t v548 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v547])); - svfloat32_t v555 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v554])); - svfloat32_t 
v590 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v589])); - svfloat32_t v597 = svreinterpret_f32_f64( - svld1_f64(pred_full, &((const double *)v7)[v596])); - svfloat32_t v1497 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1495), v1778)); - svfloat32_t v1506 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1504), v1778)); - svfloat32_t v1515 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1513), v1778)); - svfloat32_t v1524 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1522), v1778)); - svfloat32_t v1533 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1531), v1778)); - svfloat32_t v1542 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1540), v1778)); - svfloat32_t v1551 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1549), v1778)); - svfloat32_t v1560 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1558), v1778)); - svfloat32_t v1569 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1567), v1778)); - svfloat32_t v1578 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1576), v1778)); - svfloat32_t v1587 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1585), v1778)); - svfloat32_t v1596 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1594), v1778)); - svfloat32_t v1605 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1603), v1778)); - svfloat32_t v1614 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1612), v1778)); - svfloat32_t v1623 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1621), v1778)); - svfloat32_t v1642 = svreinterpret_f32_f64( - 
svld1_gather_s64index_f64(pred_full, (const double *)(v1640), v1778)); - svfloat32_t v1652 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1650), v1778)); - svfloat32_t v1661 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1659), v1778)); - svfloat32_t v1670 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1668), v1778)); - svfloat32_t v1679 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1677), v1778)); - svfloat32_t v1688 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1686), v1778)); - svfloat32_t v1697 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1695), v1778)); - svfloat32_t v1706 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1704), v1778)); - svfloat32_t v1715 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1713), v1778)); - svfloat32_t v1724 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1722), v1778)); - svfloat32_t v1733 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1731), v1778)); - svfloat32_t v1742 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1740), v1778)); - svfloat32_t v1751 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1749), v1778)); - svfloat32_t v1760 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1758), v1778)); - svfloat32_t v1769 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v1767), v1778)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); - svfloat32_t v38 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1497, v37, 0), - v1497, v37, 90); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); - svfloat32_t v52 = - 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v1506, v51, 0), - v1506, v51, 90); - svfloat32_t zero66; - asm volatile("mov %0.s, #0" : "=w"(zero66)); - svfloat32_t v66 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero66, v1515, v65, 0), - v1515, v65, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); - svfloat32_t v101 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero101, v1524, v100, 0), v1524, - v100, 90); - svfloat32_t zero108; - asm volatile("mov %0.s, #0" : "=w"(zero108)); - svfloat32_t v108 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero108, v1533, v107, 0), v1533, - v107, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); - svfloat32_t v143 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero143, v1542, v142, 0), v1542, - v142, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); - svfloat32_t v150 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero150, v1551, v149, 0), v1551, - v149, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); - svfloat32_t v185 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero185, v1560, v184, 0), v1560, - v184, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); - svfloat32_t v192 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero192, v1569, v191, 0), v1569, - v191, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); - svfloat32_t v206 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero206, v1578, v205, 0), v1578, - v205, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); - svfloat32_t v220 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero220, v1587, v219, 0), v1587, - v219, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); - svfloat32_t v255 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero255, v1596, v254, 0), v1596, - v254, 90); - svfloat32_t zero262; - 
asm volatile("mov %0.s, #0" : "=w"(zero262)); - svfloat32_t v262 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero262, v1605, v261, 0), v1605, - v261, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); - svfloat32_t v276 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero276, v1614, v275, 0), v1614, - v275, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); - svfloat32_t v290 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero290, v1623, v289, 0), v1623, - v289, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); - svfloat32_t v332 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero332, v1642, v331, 0), v1642, - v331, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); - svfloat32_t v346 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero346, v1652, v345, 0), v1652, - v345, 90); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); - svfloat32_t v360 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero360, v1661, v359, 0), v1661, - v359, 90); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); - svfloat32_t v395 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero395, v1670, v394, 0), v1670, - v394, 90); - svfloat32_t zero402; - asm volatile("mov %0.s, #0" : "=w"(zero402)); - svfloat32_t v402 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero402, v1679, v401, 0), v1679, - v401, 90); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); - svfloat32_t v437 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero437, v1688, v436, 0), v1688, - v436, 90); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); - svfloat32_t v444 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero444, v1697, v443, 0), v1697, - v443, 90); - svfloat32_t zero479; - asm volatile("mov %0.s, #0" : "=w"(zero479)); - svfloat32_t v479 = svcmla_f32_x( - pred_full, 
svcmla_f32_x(pred_full, zero479, v1706, v478, 0), v1706, - v478, 90); - svfloat32_t zero486; - asm volatile("mov %0.s, #0" : "=w"(zero486)); - svfloat32_t v486 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero486, v1715, v485, 0), v1715, - v485, 90); - svfloat32_t zero500; - asm volatile("mov %0.s, #0" : "=w"(zero500)); - svfloat32_t v500 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero500, v1724, v499, 0), v1724, - v499, 90); - svfloat32_t zero514; - asm volatile("mov %0.s, #0" : "=w"(zero514)); - svfloat32_t v514 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero514, v1733, v513, 0), v1733, - v513, 90); - svfloat32_t zero549; - asm volatile("mov %0.s, #0" : "=w"(zero549)); - svfloat32_t v549 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero549, v1742, v548, 0), v1742, - v548, 90); - svfloat32_t zero556; - asm volatile("mov %0.s, #0" : "=w"(zero556)); - svfloat32_t v556 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero556, v1751, v555, 0), v1751, - v555, 90); - svfloat32_t zero591; - asm volatile("mov %0.s, #0" : "=w"(zero591)); - svfloat32_t v591 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero591, v1760, v590, 0), v1760, - v590, 90); - svfloat32_t zero598; - asm volatile("mov %0.s, #0" : "=w"(zero598)); - svfloat32_t v598 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero598, v1769, v597, 0), v1769, - v597, 90); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v1779), "w"(v38)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v1779), "w"(v38)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v52), "w"(v66)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v52), "w"(v66)); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v101), "w"(v108)); - svfloat32_t v622; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v101), "w"(v108)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v143), 
"w"(v150)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v143), "w"(v150)); - svfloat32_t v677; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v185), "w"(v192)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v185), "w"(v192)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v206), "w"(v220)); - svfloat32_t v680; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v206), "w"(v220)); - svfloat32_t v692; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v255), "w"(v262)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v255), "w"(v262)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v276), "w"(v290)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v276), "w"(v290)); - svfloat32_t v837; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v837) : "w"(v325), "w"(v332)); - svfloat32_t v838; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v325), "w"(v332)); - svfloat32_t v839; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v346), "w"(v360)); - svfloat32_t v840; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v346), "w"(v360)); - svfloat32_t v852; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v852) : "w"(v395), "w"(v402)); - svfloat32_t v853; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v853) : "w"(v395), "w"(v402)); - svfloat32_t v854; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v854) : "w"(v437), "w"(v444)); - svfloat32_t v855; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v855) : "w"(v437), "w"(v444)); - svfloat32_t v908; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v908) : "w"(v479), "w"(v486)); - svfloat32_t v909; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v909) : "w"(v479), "w"(v486)); - svfloat32_t v910; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v910) : "w"(v500), "w"(v514)); - svfloat32_t v911; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v911) : "w"(v500), "w"(v514)); - svfloat32_t v923; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v923) : "w"(v549), "w"(v556)); - svfloat32_t v924; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v924) : "w"(v549), "w"(v556)); - svfloat32_t v925; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v925) : "w"(v591), "w"(v598)); - svfloat32_t v926; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v926) : "w"(v591), "w"(v598)); - svfloat32_t zero616; - asm volatile("mov %0.s, #0" : "=w"(zero616)); - svfloat32_t v616 = svcmla_f32_x(pred_full, zero616, v1983, v609, 90); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v606), "w"(v608)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v606), "w"(v608)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v621), "w"(v623)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v621), "w"(v623)); - svfloat32_t v642; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v622), "w"(v1980)); - svfloat32_t v654; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v654) : "w"(v624), "w"(v1982)); - svfloat32_t zero687; - asm volatile("mov %0.s, #0" : "=w"(zero687)); - svfloat32_t v687 = svcmla_f32_x(pred_full, zero687, v1983, v680, 90); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v677), "w"(v679)); - svfloat32_t v689; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v677), "w"(v679)); - svfloat32_t zero702; - asm volatile("mov %0.s, #0" : "=w"(zero702)); - svfloat32_t v702 = svcmla_f32_x(pred_full, zero702, v1983, v695, 90); - svfloat32_t v703; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v692), "w"(v694)); - svfloat32_t v704; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v692), "w"(v694)); - svfloat32_t zero847; - asm volatile("mov %0.s, #0" : "=w"(zero847)); - svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v1983, v840, 90); - svfloat32_t v848; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v837), "w"(v839)); - svfloat32_t v849; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v849) : "w"(v837), "w"(v839)); - svfloat32_t v856; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v852), "w"(v854)); - svfloat32_t v857; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v857) : 
"w"(v852), "w"(v854)); - svfloat32_t v873; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v853), "w"(v1980)); - svfloat32_t v885; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v885) : "w"(v855), "w"(v1982)); - svfloat32_t zero918; - asm volatile("mov %0.s, #0" : "=w"(zero918)); - svfloat32_t v918 = svcmla_f32_x(pred_full, zero918, v1983, v911, 90); - svfloat32_t v919; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v919) : "w"(v908), "w"(v910)); - svfloat32_t v920; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v920) : "w"(v908), "w"(v910)); - svfloat32_t v927; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v927) : "w"(v923), "w"(v925)); - svfloat32_t v928; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v928) : "w"(v923), "w"(v925)); - svfloat32_t v944; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v944) : "w"(v924), "w"(v1980)); - svfloat32_t v956; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v956) : "w"(v926), "w"(v1982)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v607), "w"(v616)); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v607), "w"(v616)); - svfloat32_t zero633; - asm volatile("mov %0.s, #0" : "=w"(zero633)); - svfloat32_t v633 = svcmla_f32_x(pred_full, zero633, v1983, v626, 90); - svfloat32_t v634; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v617), "w"(v625)); - svfloat32_t v635; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v617), "w"(v625)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v687)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v678), "w"(v687)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v693), "w"(v702)); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v693), "w"(v702)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v688), "w"(v703)); - svfloat32_t v708; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v688), "w"(v703)); - svfloat32_t v763; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v689), "w"(v1980)); - svfloat32_t 
v775; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v704), "w"(v1982)); - svfloat32_t v850; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v850) : "w"(v838), "w"(v847)); - svfloat32_t v851; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v851) : "w"(v838), "w"(v847)); - svfloat32_t zero864; - asm volatile("mov %0.s, #0" : "=w"(zero864)); - svfloat32_t v864 = svcmla_f32_x(pred_full, zero864, v1983, v857, 90); - svfloat32_t v865; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v865) : "w"(v848), "w"(v856)); - svfloat32_t v866; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v866) : "w"(v848), "w"(v856)); - svfloat32_t v921; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v921) : "w"(v909), "w"(v918)); - svfloat32_t v922; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v922) : "w"(v909), "w"(v918)); - svfloat32_t zero935; - asm volatile("mov %0.s, #0" : "=w"(zero935)); - svfloat32_t v935 = svcmla_f32_x(pred_full, zero935, v1983, v928, 90); - svfloat32_t v936; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v936) : "w"(v919), "w"(v927)); - svfloat32_t v937; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v937) : "w"(v919), "w"(v927)); - svfloat32_t v636; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v618), "w"(v633)); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v618), "w"(v633)); - svfloat32_t v662 = svcmla_f32_x(pred_full, v642, v2107, v642, 90); - svfloat32_t v663 = svcmla_f32_x(pred_full, v654, v1983, v654, 90); - svfloat32_t zero715; - asm volatile("mov %0.s, #0" : "=w"(zero715)); - svfloat32_t v715 = svcmla_f32_x(pred_full, zero715, v1983, v708, 90); - svfloat32_t v716; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v634), "w"(v707)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v634), "w"(v707)); - svfloat32_t v724; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v724) : "w"(v690), "w"(v1898)); - svfloat32_t v736; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v736) : "w"(v705), "w"(v2062)); - svfloat32_t v802; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v691), "w"(v2062)); - svfloat32_t v814; - asm("fmul 
%0.s, %1.s, %2.s" : "=w"(v814) : "w"(v706), "w"(v2064)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v849), "w"(v864)); - svfloat32_t v868; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v868) : "w"(v849), "w"(v864)); - svfloat32_t v893 = svcmla_f32_x(pred_full, v873, v2107, v873, 90); - svfloat32_t v894 = svcmla_f32_x(pred_full, v885, v1983, v885, 90); - svfloat32_t v938; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v938) : "w"(v920), "w"(v935)); - svfloat32_t v939; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v939) : "w"(v920), "w"(v935)); - svfloat32_t v964 = svcmla_f32_x(pred_full, v944, v2107, v944, 90); - svfloat32_t v965 = svcmla_f32_x(pred_full, v956, v1983, v956, 90); - svfloat32_t v979; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v979) : "w"(v865), "w"(v936)); - svfloat32_t v980; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v980) : "w"(v865), "w"(v936)); - svfloat32_t v1225; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1225) : "w"(v866), "w"(v1980)); - svfloat32_t v1237; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1237) : "w"(v937), "w"(v1982)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v662), "w"(v663)); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v663), "w"(v662)); - svfloat32_t v718; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v635), "w"(v715)); - svfloat32_t v719; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v635), "w"(v715)); - svfloat32_t v744 = svcmla_f32_x(pred_full, v724, v1899, v690, 90); - svfloat32_t v745 = svcmla_f32_x(pred_full, v736, v2063, v705, 90); - svfloat32_t v783 = svcmla_f32_x(pred_full, v763, v2107, v763, 90); - svfloat32_t v784 = svcmla_f32_x(pred_full, v775, v1983, v775, 90); - svfloat32_t v822 = svcmla_f32_x(pred_full, v802, v2063, v691, 90); - svfloat32_t v823 = svcmla_f32_x(pred_full, v814, v2065, v706, 90); - svfloat32_t v895; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v893), "w"(v894)); - svfloat32_t v896; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v896) : "w"(v894), "w"(v893)); - svfloat32_t 
v966; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v966) : "w"(v964), "w"(v965)); - svfloat32_t v967; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v965), "w"(v964)); - svfloat32_t zero987; - asm volatile("mov %0.s, #0" : "=w"(zero987)); - svfloat32_t v987 = svcmla_f32_x(pred_full, zero987, v1983, v980, 90); - svfloat32_t v988; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v988) : "w"(v716), "w"(v979)); - svfloat32_t v989; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v989) : "w"(v716), "w"(v979)); - svfloat32_t v1091; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1091) : "w"(v867), "w"(v1898)); - svfloat32_t v1103; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1103) : "w"(v938), "w"(v2062)); - svfloat32_t v1359; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1359) : "w"(v868), "w"(v2062)); - svfloat32_t v1371; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1371) : "w"(v939), "w"(v2064)); - svfloat32_t zero672; - asm volatile("mov %0.s, #0" : "=w"(zero672)); - svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2107, v665, 90); - svfloat32_t v673; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v619), "w"(v664)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v619), "w"(v664)); - svfloat32_t v746; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v746) : "w"(v744), "w"(v745)); - svfloat32_t v747; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v745), "w"(v744)); - svfloat32_t v785; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v783), "w"(v784)); - svfloat32_t v786; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v784), "w"(v783)); - svfloat32_t v824; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v822), "w"(v823)); - svfloat32_t v825; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v825) : "w"(v823), "w"(v822)); - svfloat32_t zero903; - asm volatile("mov %0.s, #0" : "=w"(zero903)); - svfloat32_t v903 = svcmla_f32_x(pred_full, zero903, v2107, v896, 90); - svfloat32_t v904; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v904) : "w"(v850), "w"(v895)); - svfloat32_t v905; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v905) : "w"(v850), 
"w"(v895)); - svfloat32_t zero974; - asm volatile("mov %0.s, #0" : "=w"(zero974)); - svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2107, v967, 90); - svfloat32_t v975; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v975) : "w"(v921), "w"(v966)); - svfloat32_t v976; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v976) : "w"(v921), "w"(v966)); - svfloat32_t v990; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v990) : "w"(v717), "w"(v987)); - svfloat32_t v991; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v991) : "w"(v717), "w"(v987)); - svfloat32_t v1111 = svcmla_f32_x(pred_full, v1091, v1899, v867, 90); - svfloat32_t v1112 = svcmla_f32_x(pred_full, v1103, v2063, v938, 90); - svfloat32_t v1245 = svcmla_f32_x(pred_full, v1225, v2107, v1225, 90); - svfloat32_t v1246 = svcmla_f32_x(pred_full, v1237, v1983, v1237, 90); - svfloat32_t v1379 = svcmla_f32_x(pred_full, v1359, v2063, v868, 90); - svfloat32_t v1380 = svcmla_f32_x(pred_full, v1371, v2065, v939, 90); - svst1_scatter_s64index_f64(pred_full, (double *)(v1827), v2142, - svreinterpret_f64_f32(v988)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1845), v2142, - svreinterpret_f64_f32(v989)); - svfloat32_t v675; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v620), "w"(v672)); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v620), "w"(v672)); - svfloat32_t zero754; - asm volatile("mov %0.s, #0" : "=w"(zero754)); - svfloat32_t v754 = svcmla_f32_x(pred_full, zero754, v2107, v747, 90); - svfloat32_t v755; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v673), "w"(v746)); - svfloat32_t v756; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v756) : "w"(v673), "w"(v746)); - svfloat32_t zero793; - asm volatile("mov %0.s, #0" : "=w"(zero793)); - svfloat32_t v793 = svcmla_f32_x(pred_full, zero793, v2107, v786, 90); - svfloat32_t v794; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v636), "w"(v785)); - svfloat32_t v795; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v636), "w"(v785)); - svfloat32_t zero832; - asm volatile("mov %0.s, #0" : 
"=w"(zero832)); - svfloat32_t v832 = svcmla_f32_x(pred_full, zero832, v2107, v825, 90); - svfloat32_t v906; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v906) : "w"(v851), "w"(v903)); - svfloat32_t v907; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v907) : "w"(v851), "w"(v903)); - svfloat32_t v977; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v977) : "w"(v922), "w"(v974)); - svfloat32_t v978; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v978) : "w"(v922), "w"(v974)); - svfloat32_t v1024; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1024) : "w"(v904), "w"(v1857)); - svfloat32_t v1036; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1036) : "w"(v975), "w"(v1939)); - svfloat32_t v1113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1113) : "w"(v1111), "w"(v1112)); - svfloat32_t v1114; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1114) : "w"(v1112), "w"(v1111)); - svfloat32_t v1247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1247) : "w"(v1245), "w"(v1246)); - svfloat32_t v1248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1248) : "w"(v1246), "w"(v1245)); - svfloat32_t v1292; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1292) : "w"(v905), "w"(v2021)); - svfloat32_t v1304; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1304) : "w"(v976), "w"(v2023)); - svfloat32_t v1381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1381) : "w"(v1379), "w"(v1380)); - svfloat32_t v1382; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1382) : "w"(v1380), "w"(v1379)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1836), v2142, - svreinterpret_f64_f32(v990)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1854), v2142, - svreinterpret_f64_f32(v991)); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v674), "w"(v754)); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v674), "w"(v754)); - svfloat32_t v796; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v796) : "w"(v637), "w"(v793)); - svfloat32_t v797; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v797) : "w"(v637), "w"(v793)); - svfloat32_t v833; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v833) : "w"(v675), "w"(v824)); - 
svfloat32_t v834; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v834) : "w"(v675), "w"(v824)); - svfloat32_t v835; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v676), "w"(v832)); - svfloat32_t v836; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v836) : "w"(v676), "w"(v832)); - svfloat32_t v1044 = svcmla_f32_x(pred_full, v1024, v2024, v904, 90); - svfloat32_t v1045 = svcmla_f32_x(pred_full, v1036, v1940, v975, 90); - svfloat32_t zero1121; - asm volatile("mov %0.s, #0" : "=w"(zero1121)); - svfloat32_t v1121 = svcmla_f32_x(pred_full, zero1121, v2107, v1114, 90); - svfloat32_t v1122; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1122) : "w"(v794), "w"(v1113)); - svfloat32_t v1123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1123) : "w"(v794), "w"(v1113)); - svfloat32_t v1158; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1158) : "w"(v906), "w"(v1939)); - svfloat32_t v1170; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1170) : "w"(v977), "w"(v1941)); - svfloat32_t zero1255; - asm volatile("mov %0.s, #0" : "=w"(zero1255)); - svfloat32_t v1255 = svcmla_f32_x(pred_full, zero1255, v2107, v1248, 90); - svfloat32_t v1256; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1256) : "w"(v718), "w"(v1247)); - svfloat32_t v1257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1257) : "w"(v718), "w"(v1247)); - svfloat32_t v1312 = svcmla_f32_x(pred_full, v1292, v2022, v905, 90); - svfloat32_t v1313 = svcmla_f32_x(pred_full, v1304, v2024, v976, 90); - svfloat32_t zero1389; - asm volatile("mov %0.s, #0" : "=w"(zero1389)); - svfloat32_t v1389 = svcmla_f32_x(pred_full, zero1389, v2107, v1382, 90); - svfloat32_t v1426; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1426) : "w"(v907), "w"(v2103)); - svfloat32_t v1438; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1438) : "w"(v978), "w"(v2105)); - svfloat32_t v1046; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1046) : "w"(v1044), "w"(v1045)); - svfloat32_t v1047; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1047) : "w"(v1045), "w"(v1044)); - svfloat32_t v1124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1124) : "w"(v795), "w"(v1121)); - 
svfloat32_t v1125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1125) : "w"(v795), "w"(v1121)); - svfloat32_t v1178 = svcmla_f32_x(pred_full, v1158, v1940, v906, 90); - svfloat32_t v1179 = svcmla_f32_x(pred_full, v1170, v2104, v977, 90); - svfloat32_t v1258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1258) : "w"(v719), "w"(v1255)); - svfloat32_t v1259; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1259) : "w"(v719), "w"(v1255)); - svfloat32_t v1314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1314) : "w"(v1312), "w"(v1313)); - svfloat32_t v1315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1315) : "w"(v1313), "w"(v1312)); - svfloat32_t v1390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1390) : "w"(v796), "w"(v1381)); - svfloat32_t v1391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1391) : "w"(v796), "w"(v1381)); - svfloat32_t v1392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1392) : "w"(v797), "w"(v1389)); - svfloat32_t v1393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1393) : "w"(v797), "w"(v1389)); - svfloat32_t v1446 = svcmla_f32_x(pred_full, v1426, v2104, v907, 90); - svfloat32_t v1447 = svcmla_f32_x(pred_full, v1438, v2106, v978, 90); - svst1_scatter_s64index_f64(pred_full, (double *)(v1909), v2142, - svreinterpret_f64_f32(v1122)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1927), v2142, - svreinterpret_f64_f32(v1123)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1991), v2142, - svreinterpret_f64_f32(v1256)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2009), v2142, - svreinterpret_f64_f32(v1257)); - svfloat32_t zero1054; - asm volatile("mov %0.s, #0" : "=w"(zero1054)); - svfloat32_t v1054 = svcmla_f32_x(pred_full, zero1054, v2107, v1047, 90); - svfloat32_t v1055; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1055) : "w"(v755), "w"(v1046)); - svfloat32_t v1056; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1056) : "w"(v755), "w"(v1046)); - svfloat32_t v1180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1180) : "w"(v1178), "w"(v1179)); - svfloat32_t v1181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1181) : "w"(v1179), 
"w"(v1178)); - svfloat32_t zero1322; - asm volatile("mov %0.s, #0" : "=w"(zero1322)); - svfloat32_t v1322 = svcmla_f32_x(pred_full, zero1322, v2107, v1315, 90); - svfloat32_t v1323; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1323) : "w"(v757), "w"(v1314)); - svfloat32_t v1324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1324) : "w"(v757), "w"(v1314)); - svfloat32_t v1448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1448) : "w"(v1446), "w"(v1447)); - svfloat32_t v1449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1449) : "w"(v1447), "w"(v1446)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1918), v2142, - svreinterpret_f64_f32(v1124)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1936), v2142, - svreinterpret_f64_f32(v1125)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2000), v2142, - svreinterpret_f64_f32(v1258)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2018), v2142, - svreinterpret_f64_f32(v1259)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2073), v2142, - svreinterpret_f64_f32(v1390)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2082), v2142, - svreinterpret_f64_f32(v1392)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2091), v2142, - svreinterpret_f64_f32(v1391)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2100), v2142, - svreinterpret_f64_f32(v1393)); - svfloat32_t v1057; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1057) : "w"(v756), "w"(v1054)); - svfloat32_t v1058; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1058) : "w"(v756), "w"(v1054)); - svfloat32_t zero1188; - asm volatile("mov %0.s, #0" : "=w"(zero1188)); - svfloat32_t v1188 = svcmla_f32_x(pred_full, zero1188, v2107, v1181, 90); - svfloat32_t v1189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1189) : "w"(v833), "w"(v1180)); - svfloat32_t v1190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1190) : "w"(v833), "w"(v1180)); - svfloat32_t v1325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1325) : "w"(v758), "w"(v1322)); - svfloat32_t v1326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1326) : "w"(v758), 
"w"(v1322)); - svfloat32_t zero1456; - asm volatile("mov %0.s, #0" : "=w"(zero1456)); - svfloat32_t v1456 = svcmla_f32_x(pred_full, zero1456, v2107, v1449, 90); - svfloat32_t v1457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1457) : "w"(v835), "w"(v1448)); - svfloat32_t v1458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1458) : "w"(v835), "w"(v1448)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1868), v2142, - svreinterpret_f64_f32(v1055)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1886), v2142, - svreinterpret_f64_f32(v1056)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2032), v2142, - svreinterpret_f64_f32(v1323)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2050), v2142, - svreinterpret_f64_f32(v1324)); - svfloat32_t v1191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1191) : "w"(v834), "w"(v1188)); - svfloat32_t v1192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1192) : "w"(v834), "w"(v1188)); - svfloat32_t v1459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1459) : "w"(v836), "w"(v1456)); - svfloat32_t v1460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1460) : "w"(v836), "w"(v1456)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1877), v2142, - svreinterpret_f64_f32(v1057)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1895), v2142, - svreinterpret_f64_f32(v1058)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1950), v2142, - svreinterpret_f64_f32(v1189)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1968), v2142, - svreinterpret_f64_f32(v1190)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2041), v2142, - svreinterpret_f64_f32(v1325)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2059), v2142, - svreinterpret_f64_f32(v1326)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2114), v2142, - svreinterpret_f64_f32(v1457)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2132), v2142, - svreinterpret_f64_f32(v1458)); - svst1_scatter_s64index_f64(pred_full, (double *)(v1959), v2142, - svreinterpret_f64_f32(v1191)); - 
svst1_scatter_s64index_f64(pred_full, (double *)(v1977), v2142, - svreinterpret_f64_f32(v1192)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2123), v2142, - svreinterpret_f64_f32(v1459)); - svst1_scatter_s64index_f64(pred_full, (double *)(v2141), v2142, - svreinterpret_f64_f32(v1460)); - v5 += v11; - v6 += v12; - } -} -#endif diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h index ba99f76072c9030e9b323d03c15456ca6537dd63..b66da6b54073abdfe2c5d6ad044c99534a44988c 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -41,7 +43,6 @@ cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs21; cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs22; cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs24; cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs25; -cf32_cf32_cf32_ab_t_gs_fft_t armral_fft_cf32_cf32_cf32_ab_t_gs32; #ifdef __cplusplus } // extern "C" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c index c70d093a5112e1cb65c78c438f961904584fbc56..bd75b8dd60dcbf5e608f11ff830e508c68b26efd 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_cf32_cf32_ab_t_gu.h" @@ -68,14 +70,11 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu2(const armral_cmplx_f32_t *restrict x, 
svld1_gather_s64index_f64(pred_full, (const double *)(v78), v91)); svfloat32_t v92 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v90), v91)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero38, v80, v37, 0), v80, v37, 90); - svfloat32_t v46; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v46) : "w"(v92), "w"(v38)); - svfloat32_t v47; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v92), "w"(v38)); + svfloat32_t v46 = svadd_f32_x(svptrue_b32(), v92, v38); + svfloat32_t v47 = svsub_f32_x(svptrue_b32(), v92, v38); svst1_f64(pred_full, (double *)(v102), svreinterpret_f64_f32(v46)); svst1_f64(pred_full, (double *)(v111), svreinterpret_f64_f32(v47)); v5 += v11; @@ -180,8 +179,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu3(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v138), v139)); svfloat32_t v143 = svdup_n_f32(v84); float32x2_t *v169 = &v6[v105]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v119, v51, 0), v119, v51, 90); @@ -189,26 +187,19 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu3(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); svfloat32_t v129 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v127), v139)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v129, v58, 0), v129, v58, 90); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v52), "w"(v59)); - svfloat32_t v61; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v59)); - svfloat32_t v69; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v69) : 
"w"(v60), "w"(v140)); - svfloat32_t zero86; - asm volatile("mov %0.s, #0" : "=w"(zero86)); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v69 = svadd_f32_x(svptrue_b32(), v60, v140); + svfloat32_t zero86 = svdup_n_f32(0); svfloat32_t v86 = svcmla_f32_x(pred_full, zero86, v143, v61, 90); svfloat32_t v87 = svmla_f32_x(pred_full, v69, v60, v142); svst1_f64(pred_full, (double *)(v151), svreinterpret_f64_f32(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v87), "w"(v86)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v87), "w"(v86)); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v87, v86); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v87, v86); svst1_f64(pred_full, (double *)(v160), svreinterpret_f64_f32(v89)); svst1_f64(pred_full, (double *)(v169), svreinterpret_f64_f32(v88)); v5 += v11; @@ -328,8 +319,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu4(const armral_cmplx_f32_t *restrict x, float32x2_t *v223 = &v6[v140]; svfloat32_t v37 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v163, v72, 0), v163, v72, 90); @@ -339,35 +329,24 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu4(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v152), v183)); svfloat32_t v173 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v171), v183)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v154, v37, 0), v154, v37, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v173, v79, 0), v173, v79, 90); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v184), "w"(v38)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v184), "w"(v38)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v73), "w"(v80)); - svfloat32_t v91; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v73), "w"(v80)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v88), "w"(v90)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v88), "w"(v90)); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v184, v38); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v184, v38); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v88, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v88, v90); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v188, v91, 90); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v89), "w"(v115)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v89), "w"(v115)); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v89, v115); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v89, v115); svst1_f64(pred_full, (double *)(v196), svreinterpret_f64_f32(v92)); svst1_f64(pred_full, (double *)(v214), svreinterpret_f64_f32(v93)); svst1_f64(pred_full, (double *)(v205), svreinterpret_f64_f32(v117)); @@ -540,8 +519,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu5(const armral_cmplx_f32_t *restrict x, float32x2_t *v276 = &v6[v177]; float32x2_t *v285 = &v6[v184]; float32x2_t *v294 = &v6[v191]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v205, 
v51, 0), v205, v51, 90); @@ -557,58 +535,40 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu5(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v223), v243)); svfloat32_t v234 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v232), v243)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v215, v58, 0), v215, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v225, v93, 0), v225, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v234, v100, 0), v234, v100, 90); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v52), "w"(v59)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v52), "w"(v59)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v94), "w"(v101)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v94), "w"(v101)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v102), "w"(v104)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v102), "w"(v104)); - svfloat32_t v108; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v103), "w"(v105)); - svfloat32_t zero138; - asm volatile("mov %0.s, #0" : "=w"(zero138)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v102, v104); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v102, v104); + 
svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v103, v105); + svfloat32_t zero138 = svdup_n_f32(0); svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v248, v103, 90); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v244)); - svfloat32_t zero145; - asm volatile("mov %0.s, #0" : "=w"(zero145)); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v106, v244); + svfloat32_t zero145 = svdup_n_f32(0); svfloat32_t v145 = svcmla_f32_x(pred_full, zero145, v249, v108, 90); svfloat32_t v153 = svmla_f32_x(pred_full, v116, v106, v246); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v138), "w"(v145)); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v138, v145); svfloat32_t v157 = svcmla_f32_x(pred_full, v145, v250, v105, 90); svst1_f64(pred_full, (double *)(v258), svreinterpret_f64_f32(v116)); svfloat32_t v154 = svmla_f32_x(pred_full, v153, v107, v247); svfloat32_t v155 = svmls_f32_x(pred_full, v153, v107, v247); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v154), "w"(v156)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v155), "w"(v157)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v155), "w"(v157)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v155, v157); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v155, v157); svst1_f64(pred_full, (double *)(v267), svreinterpret_f64_f32(v159)); svst1_f64(pred_full, (double *)(v276), svreinterpret_f64_f32(v161)); svst1_f64(pred_full, (double *)(v285), svreinterpret_f64_f32(v160)); @@ -786,8 +746,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu6(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); svfloat32_t v114 = svreinterpret_f32_f64( 
svld1_f64(pred_full, &((const double *)v7)[v113])); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v268, v121, 0), v268, v121, 90); @@ -799,68 +758,46 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu6(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v248), v279)); svfloat32_t v259 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v257), v279)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v232, v37, 0), v232, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v241, v72, 0), v241, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v250, v79, 0), v250, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v259, v114, 0), v259, v114, 90); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v280), "w"(v38)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v280), "w"(v38)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v73), "w"(v80)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v73), "w"(v80)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v115), "w"(v122)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v115), "w"(v122)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : 
"w"(v132), "w"(v134)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v132), "w"(v134)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v133), "w"(v135)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v133), "w"(v135)); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v136), "w"(v130)); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v280, v38); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v280, v38); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v136, v130); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = svcmla_f32_x(pred_full, zero155, v286, v137, 90); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v159), "w"(v131)); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v159, v131); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v286, v160, 90); svfloat32_t v156 = svmla_f32_x(pred_full, v138, v136, v285); svfloat32_t v179 = svmla_f32_x(pred_full, v161, v159, v285); svst1_f64(pred_full, (double *)(v294), svreinterpret_f64_f32(v138)); svst1_f64(pred_full, (double *)(v303), svreinterpret_f64_f32(v161)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v156), "w"(v155)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v156), "w"(v155)); - svfloat32_t 
v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v179), "w"(v178)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v179), "w"(v178)); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v179, v178); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v179, v178); svst1_f64(pred_full, (double *)(v312), svreinterpret_f64_f32(v158)); svst1_f64(pred_full, (double *)(v321), svreinterpret_f64_f32(v181)); svst1_f64(pred_full, (double *)(v330), svreinterpret_f64_f32(v157)); @@ -1103,8 +1040,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu7(const armral_cmplx_f32_t *restrict x, float32x2_t *v407 = &v6[v269]; float32x2_t *v416 = &v6[v276]; float32x2_t *v425 = &v6[v283]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v297, v51, 0), v297, v51, 90); @@ -1128,109 +1064,71 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu7(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v333), v353)); svfloat32_t v344 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v342), v353)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v307, v58, 0), v307, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v317, v93, 0), v317, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v326, v100, 0), v326, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, 
#0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v335, v135, 0), v335, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v344, v142, 0), v344, v142, 90); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v52), "w"(v59)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v52), "w"(v59)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v94), "w"(v101)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v94), "w"(v101)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v136), "w"(v143)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v136), "w"(v143)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v144), "w"(v146)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v146), "w"(v148)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v148), "w"(v144)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v145), "w"(v147)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v145), "w"(v147)); - svfloat32_t v166; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v147), "w"(v149)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v149), "w"(v145)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v150), "w"(v148)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v163), "w"(v149)); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t 
v146 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v148, v144); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v149, v145); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v150, v148); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v163, v149); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v361, v165, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v362, v166, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v363, v167, 90); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v151), "w"(v354)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v151, v354); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v360, v164, 90); svfloat32_t v221 = svmla_f32_x(pred_full, v159, v151, v356); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v199), "w"(v206)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v199), "w"(v206)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v199), "w"(v213)); + svfloat32_t v228 = 
svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v199, v213); svst1_f64(pred_full, (double *)(v371), svreinterpret_f64_f32(v159)); svfloat32_t v222 = svmla_f32_x(pred_full, v221, v160, v357); svfloat32_t v224 = svmls_f32_x(pred_full, v221, v160, v357); svfloat32_t v226 = svmls_f32_x(pred_full, v221, v161, v358); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v228), "w"(v213)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v230), "w"(v220)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v232), "w"(v220)); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v228, v213); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v230, v220); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v232, v220); svfloat32_t v223 = svmla_f32_x(pred_full, v222, v161, v358); svfloat32_t v225 = svmls_f32_x(pred_full, v224, v162, v359); svfloat32_t v227 = svmla_f32_x(pred_full, v226, v162, v359); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v223), "w"(v229)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v223), "w"(v229)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v225), "w"(v231)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v225), "w"(v231)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v227), "w"(v233)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v227), "w"(v233)); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v227, v233); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v227, v233); svst1_f64(pred_full, (double *)(v380), 
svreinterpret_f64_f32(v235)); svst1_f64(pred_full, (double *)(v389), svreinterpret_f64_f32(v237)); svst1_f64(pred_full, (double *)(v398), svreinterpret_f64_f32(v238)); @@ -1455,8 +1353,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu8(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); svfloat32_t v79 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v335, v114, 0), v335, v114, 90); @@ -1478,97 +1375,64 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu8(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v353), v373)); svfloat32_t v364 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v362), v373)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v308, v37, 0), v308, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v317, v72, 0), v317, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v326, v79, 0), v326, v79, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v345, v121, 0), v345, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v355, v156, 0), 
v355, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v364, v163, 0), v364, v163, 90); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v374), "w"(v38)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v374), "w"(v38)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v73), "w"(v80)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v73), "w"(v80)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v115), "w"(v122)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v115), "w"(v122)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v157), "w"(v164)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v157), "w"(v164)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v172), "w"(v174)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v172), "w"(v174)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v176), "w"(v178)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v176), "w"(v178)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v177), "w"(v179)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v177), "w"(v179)); - svfloat32_t zero221; - asm volatile("mov %0.s, #0" : "=w"(zero221)); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v374, v38); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v374, v38); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v179 = 
svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v179); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v179); + svfloat32_t zero221 = svdup_n_f32(0); svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v380, v175, 90); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v180), "w"(v182)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v180), "w"(v182)); - svfloat32_t zero209; - asm volatile("mov %0.s, #0" : "=w"(zero209)); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t zero209 = svdup_n_f32(0); svfloat32_t v209 = svcmla_f32_x(pred_full, zero209, v380, v183, 90); - svfloat32_t zero228; - asm volatile("mov %0.s, #0" : "=w"(zero228)); + svfloat32_t zero228 = svdup_n_f32(0); svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v381, v186, 90); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v181), "w"(v209)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v181), "w"(v209)); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v181, v209); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v181, v209); svfloat32_t v236 = svmla_f32_x(pred_full, v173, v187, v382); svfloat32_t v237 = svmls_f32_x(pred_full, v173, v187, v382); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v221), "w"(v228)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v221), "w"(v228)); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v221, v228); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v221, v228); svst1_f64(pred_full, (double *)(v390), svreinterpret_f64_f32(v184)); svst1_f64(pred_full, (double *)(v426), 
svreinterpret_f64_f32(v185)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v236), "w"(v238)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v236), "w"(v238)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v237), "w"(v239)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v237), "w"(v239)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v237, v239); svst1_f64(pred_full, (double *)(v408), svreinterpret_f64_f32(v235)); svst1_f64(pred_full, (double *)(v444), svreinterpret_f64_f32(v234)); svst1_f64(pred_full, (double *)(v399), svreinterpret_f64_f32(v241)); @@ -1861,8 +1725,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu9(const armral_cmplx_f32_t *restrict x, float32x2_t *v522 = &v6[v346]; float32x2_t *v531 = &v6[v353]; float32x2_t *v540 = &v6[v360]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v374, v51, 0), v374, v51, 90); @@ -1894,123 +1757,79 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu9(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v428), v448)); svfloat32_t v439 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v437), v448)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v384, v58, 0), v384, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v394, v93, 0), v394, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, 
#0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v403, v100, 0), v403, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v412, v135, 0), v412, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v421, v142, 0), v421, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v430, v177, 0), v430, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v439, v184, 0), v439, v184, 90); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v52), "w"(v59)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v52), "w"(v59)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v94), "w"(v101)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v94), "w"(v101)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v136), "w"(v143)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v136), "w"(v143)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v178), "w"(v185)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v178), "w"(v185)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v186), "w"(v188)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v187), "w"(v189)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : 
"w"(v186), "w"(v188)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v188), "w"(v192)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v192), "w"(v186)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v187), "w"(v189)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v189), "w"(v193)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v193), "w"(v187)); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v188, v192); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v192, v186); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v189, v193); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v193, v187); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v454, v191, 90); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v192)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v205), "w"(v193)); - svfloat32_t zero263; - asm volatile("mov %0.s, #0" : "=w"(zero263)); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v192); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v205, v193); + svfloat32_t 
zero263 = svdup_n_f32(0); svfloat32_t v263 = svcmla_f32_x(pred_full, zero263, v458, v210, 90); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v459, v211, 90); - svfloat32_t zero277; - asm volatile("mov %0.s, #0" : "=w"(zero277)); + svfloat32_t zero277 = svdup_n_f32(0); svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v460, v212, 90); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v195), "w"(v190)); - svfloat32_t v222; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v195), "w"(v451)); - svfloat32_t zero229; - asm volatile("mov %0.s, #0" : "=w"(zero229)); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v195, v190); + svfloat32_t v222 = svmul_f32_x(svptrue_b32(), v195, v451); + svfloat32_t zero229 = svdup_n_f32(0); svfloat32_t v229 = svcmla_f32_x(pred_full, zero229, v454, v206, 90); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v241), "w"(v263)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v241), "w"(v270)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v241), "w"(v263)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v196), "w"(v449)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v222), "w"(v222)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v270)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v293), "w"(v277)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v295), "w"(v277)); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v241, v270); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v196, v449); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v222, v222); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, 
v270); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v277); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v277); svfloat32_t v279 = svmla_f32_x(pred_full, v278, v195, v451); svfloat32_t v283 = svmla_f32_x(pred_full, v204, v190, v453); svst1_f64(pred_full, (double *)(v468), svreinterpret_f64_f32(v204)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v204), "w"(v279)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v283), "w"(v278)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v229)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v280), "w"(v229)); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v204, v279); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v278); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v229); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v280, v229); svfloat32_t v285 = svmla_f32_x(pred_full, v284, v207, v455); svfloat32_t v287 = svmls_f32_x(pred_full, v284, v208, v456); svfloat32_t v289 = svmls_f32_x(pred_full, v284, v207, v455); @@ -2019,18 +1838,12 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu9(const armral_cmplx_f32_t *restrict x, svfloat32_t v290 = svmls_f32_x(pred_full, v289, v209, v457); svst1_f64(pred_full, (double *)(v495), svreinterpret_f64_f32(v282)); svst1_f64(pred_full, (double *)(v522), svreinterpret_f64_f32(v281)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v286), "w"(v292)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v286), "w"(v292)); - svfloat32_t v299; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v288), "w"(v294)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v288), "w"(v294)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v290), "w"(v296)); - svfloat32_t v302; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v290), "w"(v296)); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v286, v292); + 
svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v290, v296); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v290, v296); svst1_f64(pred_full, (double *)(v477), svreinterpret_f64_f32(v298)); svst1_f64(pred_full, (double *)(v486), svreinterpret_f64_f32(v299)); svst1_f64(pred_full, (double *)(v504), svreinterpret_f64_f32(v302)); @@ -2332,8 +2145,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu10(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v120])); svfloat32_t v156 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v155])); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v462, v163, 0), v462, v163, 90); @@ -2357,117 +2169,77 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu10(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v471), v491)); svfloat32_t v482 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v480), v491)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v408, v37, 0), v408, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v417, v72, 0), v417, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v426, v79, 0), v426, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = 
svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v435, v114, 0), v435, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v444, v121, 0), v444, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v453, v156, 0), v453, v156, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v473, v198, 0), v473, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v482, v205, 0), v482, v205, 90); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v492), "w"(v38)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v492), "w"(v38)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v73), "w"(v80)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v73), "w"(v80)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v115), "w"(v122)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v115), "w"(v122)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v157), "w"(v164)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v157), "w"(v164)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v199), "w"(v206)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v199), "w"(v206)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v216), "w"(v222)); - svfloat32_t v225; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v225) : "w"(v216), "w"(v222)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v220), "w"(v218)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v220), "w"(v218)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v217), "w"(v223)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v217), "w"(v223)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v221), "w"(v219)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v221), "w"(v219)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v224), "w"(v226)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v224), "w"(v226)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v225), "w"(v227)); - svfloat32_t zero253; - asm volatile("mov %0.s, #0" : "=w"(zero253)); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v492, v38); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v492, v38); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), 
v221, v219); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v225, v227); + svfloat32_t zero253 = svdup_n_f32(0); svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v502, v225, 90); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v277), "w"(v279)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v277), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v278), "w"(v280)); - svfloat32_t zero306; - asm volatile("mov %0.s, #0" : "=w"(zero306)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v278, v280); + svfloat32_t zero306 = svdup_n_f32(0); svfloat32_t v306 = svcmla_f32_x(pred_full, zero306, v502, v278, 90); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v228), "w"(v214)); - svfloat32_t zero260; - asm volatile("mov %0.s, #0" : "=w"(zero260)); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v228, v214); + svfloat32_t zero260 = svdup_n_f32(0); svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v503, v230, 90); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v281), "w"(v215)); - svfloat32_t zero313; - asm volatile("mov %0.s, #0" : "=w"(zero313)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v281, v215); + svfloat32_t zero313 = svdup_n_f32(0); svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v503, v283, 90); svfloat32_t v268 = svmla_f32_x(pred_full, v231, v228, v500); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v253), "w"(v260)); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v253, v260); svfloat32_t v272 = svcmla_f32_x(pred_full, v260, v504, v227, 90); svfloat32_t v321 = svmla_f32_x(pred_full, v284, v281, v500); - svfloat32_t v324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v324) : 
"w"(v306), "w"(v313)); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v306, v313); svfloat32_t v325 = svcmla_f32_x(pred_full, v313, v504, v280, 90); svst1_f64(pred_full, (double *)(v512), svreinterpret_f64_f32(v231)); svst1_f64(pred_full, (double *)(v521), svreinterpret_f64_f32(v284)); @@ -2475,22 +2247,14 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu10(const armral_cmplx_f32_t *restrict x, svfloat32_t v270 = svmls_f32_x(pred_full, v268, v229, v501); svfloat32_t v322 = svmla_f32_x(pred_full, v321, v282, v501); svfloat32_t v323 = svmls_f32_x(pred_full, v321, v282, v501); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v269), "w"(v271)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v269), "w"(v271)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v270), "w"(v272)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v270), "w"(v272)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v322), "w"(v324)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v322), "w"(v324)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v323), "w"(v325)); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v323), "w"(v325)); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v323, v325); svst1_f64(pred_full, (double *)(v530), svreinterpret_f64_f32(v274)); svst1_f64(pred_full, (double *)(v539), svreinterpret_f64_f32(v327)); svst1_f64(pred_full, (double *)(v548), svreinterpret_f64_f32(v276)); @@ -2933,8 +2697,7 @@ void 
armral_fft_cf32_cf32_cf32_ab_t_gu11(const armral_cmplx_f32_t *restrict x, float32x2_t *v714 = &v6[v494]; float32x2_t *v723 = &v6[v501]; float32x2_t *v732 = &v6[v508]; - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v529, v163, 0), v529, v163, 90); @@ -2974,150 +2737,94 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu11(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v600), v621)); svfloat32_t v611 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v609), v621)); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero171, v539, v170, 0), v539, v170, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v548, v177, 0), v548, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v557, v184, 0), v557, v184, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v566, v191, 0), v566, v191, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v575, v198, 0), v575, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v584, v205, 0), v584, v205, 90); - 
svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v593, v212, 0), v593, v212, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v602, v219, 0), v602, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v611, v226, 0), v611, v226, 90); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v164), "w"(v171)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v178), "w"(v185)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v192), "w"(v199)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v206), "w"(v213)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v220), "w"(v227)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v164), "w"(v171)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v178), "w"(v185)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v192), "w"(v199)); - svfloat32_t v236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v206), "w"(v213)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v220), "w"(v227)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v228), "w"(v229)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v230), "w"(v232)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v234), "w"(v235)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v233), "w"(v237)); - svfloat32_t v254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v229), "w"(v231)); - svfloat32_t v255; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v228), "w"(v231)); - svfloat32_t v256; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v229), "w"(v228)); - svfloat32_t v257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v232), "w"(v231)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v230), "w"(v231)); - svfloat32_t v259; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v232), "w"(v230)); - svfloat32_t v260; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v229), "w"(v232)); - svfloat32_t v261; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v228), "w"(v230)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v234), "w"(v236)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v233), "w"(v236)); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v233), "w"(v234)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v236), "w"(v237)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v235), "w"(v236)); - svfloat32_t v268; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v235), "w"(v237)); - svfloat32_t v269; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v234), "w"(v237)); - svfloat32_t v270; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v233), "w"(v235)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v231), "w"(v238)); - svfloat32_t v252; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v241), "w"(v242)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v239), "w"(v238)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v241), "w"(v242)); - svfloat32_t v298; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v255), "w"(v627)); - svfloat32_t v303; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v256), "w"(v628)); - svfloat32_t v313; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v258), "w"(v630)); - svfloat32_t v318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v259), 
"w"(v631)); - svfloat32_t zero340; - asm volatile("mov %0.s, #0" : "=w"(zero340)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v228, v229); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v234, v235); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v233, v237); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v228, v231); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v229, v228); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v232, v231); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v230, v231); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v232, v230); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v229, v232); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v228, v230); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v233, v236); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v233, v234); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v236, v237); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v235, v236); + svfloat32_t v268 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v234, v237); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v252 = 
svsub_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v239, v238); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v298 = svmul_f32_x(svptrue_b32(), v255, v627); + svfloat32_t v303 = svmul_f32_x(svptrue_b32(), v256, v628); + svfloat32_t v313 = svmul_f32_x(svptrue_b32(), v258, v630); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v259, v631); + svfloat32_t zero340 = svdup_n_f32(0); svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v635, v263, 90); - svfloat32_t zero354; - asm volatile("mov %0.s, #0" : "=w"(zero354)); + svfloat32_t zero354 = svdup_n_f32(0); svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v637, v265, 90); - svfloat32_t zero361; - asm volatile("mov %0.s, #0" : "=w"(zero361)); + svfloat32_t zero361 = svdup_n_f32(0); svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v638, v266, 90); - svfloat32_t zero375; - asm volatile("mov %0.s, #0" : "=w"(zero375)); + svfloat32_t zero375 = svdup_n_f32(0); svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v640, v268, 90); - svfloat32_t zero382; - asm volatile("mov %0.s, #0" : "=w"(zero382)); + svfloat32_t zero382 = svdup_n_f32(0); svfloat32_t v382 = svcmla_f32_x(pred_full, zero382, v641, v269, 90); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v240), "w"(v239)); - svfloat32_t v253; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v252), "w"(v236)); - svfloat32_t v333; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v262), "w"(v634)); - svfloat32_t zero396; - asm volatile("mov %0.s, #0" : "=w"(zero396)); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v240, v239); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v252, v236); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v262, v634); + svfloat32_t zero396 = svdup_n_f32(0); svfloat32_t v396 = svcmla_f32_x(pred_full, zero396, v643, v271, 90); svfloat32_t v398 = svmla_f32_x(pred_full, v298, v254, v626); svfloat32_t v399 = svmla_f32_x(pred_full, v303, v255, v627); @@ -3126,87 
+2833,50 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu11(const armral_cmplx_f32_t *restrict x, svfloat32_t v402 = svmla_f32_x(pred_full, v318, v258, v630); svfloat32_t v403 = svnmls_f32_x(pred_full, v318, v257, v629); svfloat32_t v406 = svcmla_f32_x(pred_full, v354, v636, v264, 90); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v340), "w"(v354)); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v340, v354); svfloat32_t v408 = svcmla_f32_x(pred_full, v375, v639, v267, 90); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v361), "w"(v375)); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v622), "w"(v243)); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v361, v375); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v622, v243); + svfloat32_t zero288 = svdup_n_f32(0); svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v625, v253, 90); svfloat32_t v404 = svmla_f32_x(pred_full, v333, v261, v633); svfloat32_t v405 = svmla_f32_x(pred_full, v333, v260, v632); svfloat32_t v410 = svcmla_f32_x(pred_full, v396, v642, v270, 90); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v382), "w"(v396)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v406), "w"(v407)); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v382, v396); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v406, v407); svfloat32_t v397 = svmls_f32_x(pred_full, v251, v243, v624); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v402), "w"(v404)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v288), "w"(v408)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v410), "w"(v406)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v288), "w"(v411)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v411), "w"(v407)); - svfloat32_t v431; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v408)); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v288, v408); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v410, v406); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v288, v411); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v411, v407); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v408); svst1_f64(pred_full, (double *)(v651), svreinterpret_f64_f32(v251)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v397)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v397), "w"(v399)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v397), "w"(v403)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v397), "w"(v400)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v397), "w"(v398)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v422), "w"(v410)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v424), "w"(v288)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v426), "w"(v409)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v428), "w"(v288)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v431), "w"(v409)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v414), "w"(v404)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v416), "w"(v405)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v418), "w"(v405)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v401)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v432), "w"(v288)); - svfloat32_t v435; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v413), "w"(v423)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v413), "w"(v423)); - svfloat32_t 
v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v421), "w"(v433)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v415), "w"(v425)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v417), "w"(v427)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v419), "w"(v429)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v419), "w"(v429)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v417), "w"(v427)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v415), "w"(v425)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v421), "w"(v433)); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v397); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v397, v399); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v397, v400); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v397, v398); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v424, v288); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v426, v409); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v428, v288); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v409); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v414, v404); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v405); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v418, v405); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v420, v401); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v432, v288); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v421, v433); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v439 = 
svsub_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v421, v433); svst1_f64(pred_full, (double *)(v669), svreinterpret_f64_f32(v435)); svst1_f64(pred_full, (double *)(v732), svreinterpret_f64_f32(v442)); svst1_f64(pred_full, (double *)(v660), svreinterpret_f64_f32(v434)); @@ -3547,8 +3217,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu12(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v155])); svfloat32_t v170 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v169])); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v517, v205, 0), v517, v205, 90); @@ -3576,59 +3245,43 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu12(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v525), v546)); svfloat32_t v537 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v535), v546)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v445, v51, 0), v445, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v454, v58, 0), v454, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v463, v93, 0), v463, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, 
zero101, v472, v100, 0), v472, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v490, v149, 0), v490, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v499, v156, 0), v499, v156, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v527, v212, 0), v527, v212, 90); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v52), "w"(v59)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v52), "w"(v59)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v94), "w"(v101)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v94), "w"(v101)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v150), "w"(v157)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v150), "w"(v157)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v206), "w"(v213)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v206), "w"(v213)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v228), "w"(v547)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = 
svadd_f32_x(svptrue_b32(), v228, v547); svfloat32_t v240 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v238, v481, v114, 0), v481, v114, 90); @@ -3638,91 +3291,56 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu12(const armral_cmplx_f32_t *restrict x, svfloat32_t v246 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v244, v537, v226, 0), v537, v226, 90); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v228), "w"(v241)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v228), "w"(v241)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v238), "w"(v244)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v238), "w"(v244)); - svfloat32_t v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v229), "w"(v242)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v229), "w"(v242)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v239), "w"(v245)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v239), "w"(v245)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v237), "w"(v243)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v237), "w"(v243)); - svfloat32_t v249; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v240), "w"(v246)); - svfloat32_t v250; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v240), "w"(v246)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v277), "w"(v279)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v277), "w"(v279)); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v308 = 
svsub_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x(pred_full, zero304, v555, v280, 90); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v307), "w"(v309)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t zero333; - asm volatile("mov %0.s, #0" : "=w"(zero333)); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero333 = svdup_n_f32(0); svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v558, v308, 90); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v247), "w"(v249)); - svfloat32_t v252; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v247), "w"(v249)); - svfloat32_t zero274; - asm volatile("mov %0.s, #0" : "=w"(zero274)); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v247, v249); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v247, v249); + svfloat32_t zero274 = svdup_n_f32(0); svfloat32_t v274 = svcmla_f32_x(pred_full, zero274, v551, v250, 90); svfloat32_t v305 = svmla_f32_x(pred_full, v304, v278, v554); svfloat32_t v306 = svnmls_f32_x(pred_full, v304, v278, v554); - svfloat32_t zero319; - asm volatile("mov %0.s, #0" : "=w"(zero319)); + svfloat32_t zero319 = svdup_n_f32(0); svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v558, v311, 90); - svfloat32_t zero326; - asm volatile("mov %0.s, #0" : "=w"(zero326)); + svfloat32_t zero326 = 
svdup_n_f32(0); svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v558, v312, 90); svfloat32_t v339 = svmla_f32_x(pred_full, v333, v310, v559); svfloat32_t v340 = svmls_f32_x(pred_full, v333, v310, v559); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v248), "w"(v274)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v248), "w"(v274)); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v248, v274); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v248, v274); svfloat32_t v341 = svmla_f32_x(pred_full, v251, v281, v554); svfloat32_t v389 = svmla_f32_x(pred_full, v252, v282, v554); svst1_f64(pred_full, (double *)(v567), svreinterpret_f64_f32(v251)); svst1_f64(pred_full, (double *)(v621), svreinterpret_f64_f32(v252)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v341), "w"(v319)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v341), "w"(v319)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v276), "w"(v306)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v326)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v389), "w"(v326)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v275), "w"(v305)); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v276, v306); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v326); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v389, v326); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v275, v305); svst1_f64(pred_full, (double *)(v594), svreinterpret_f64_f32(v276)); svst1_f64(pred_full, (double *)(v648), svreinterpret_f64_f32(v275)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v365), "w"(v340)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v365), "w"(v340)); - svfloat32_t v414; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v339)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v413), "w"(v339)); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v365, v340); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v365, v340); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v413, v339); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v413, v339); svst1_f64(pred_full, (double *)(v576), svreinterpret_f64_f32(v343)); svst1_f64(pred_full, (double *)(v585), svreinterpret_f64_f32(v342)); svst1_f64(pred_full, (double *)(v630), svreinterpret_f64_f32(v391)); @@ -4215,8 +3833,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu13(const armral_cmplx_f32_t *restrict x, float32x2_t *v820 = &v6[v564]; float32x2_t *v829 = &v6[v571]; float32x2_t *v838 = &v6[v578]; - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v599, v191, 0), v599, v191, 90); @@ -4264,273 +3881,169 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu13(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v688), v709)); svfloat32_t v699 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v697), v709)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v609, v198, 0), v609, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v618, v205, 0), v618, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v627, v212, 0), v627, v212, 90); - svfloat32_t zero220; - asm 
volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v636, v219, 0), v636, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v645, v226, 0), v645, v226, 90); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero234, v654, v233, 0), v654, v233, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v663, v240, 0), v663, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v672, v247, 0), v672, v247, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero255, v681, v254, 0), v681, v254, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v690, v261, 0), v690, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v699, v268, 0), v699, v268, 90); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v192), "w"(v199)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v206), "w"(v213)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v220), "w"(v227)); - 
svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v234), "w"(v241)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v248), "w"(v255)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v262), "w"(v269)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v192), "w"(v199)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v206), "w"(v213)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v220), "w"(v227)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v234), "w"(v241)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v248), "w"(v255)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v262), "w"(v269)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v271), "w"(v274)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v270), "w"(v272)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v277), "w"(v280)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v276), "w"(v278)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v271), "w"(v275)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v272), "w"(v273)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v270), "w"(v273)); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v274), "w"(v275)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v277), "w"(v281)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v276), "w"(v278)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v277), "w"(v280)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v276), "w"(v279)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v280), "w"(v281)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : 
"w"(v278), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v275)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v273)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v281)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v289), "w"(v279)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v291), "w"(v292)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v293), "w"(v294)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v291), "w"(v292)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v293), "w"(v294)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v299), "w"(v300)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v301), "w"(v302)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v304)); - svfloat32_t zero392; - asm volatile("mov %0.s, #0" : "=w"(zero392)); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v271, v274); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v289 = 
svadd_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v271, v275); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v272, v273); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v270, v273); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v274, v275); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v277, v281); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v276, v279); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v275); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v273); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v281); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v279); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v299, v300); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v301, v302); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v304); + svfloat32_t zero392 = svdup_n_f32(0); svfloat32_t v392 = svcmla_f32_x(pred_full, zero392, v723, v299, 90); - svfloat32_t zero399; - asm volatile("mov %0.s, #0" : "=w"(zero399)); + svfloat32_t zero399 = svdup_n_f32(0); svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v724, v300, 90); - svfloat32_t zero413; - asm volatile("mov %0.s, #0" : "=w"(zero413)); + svfloat32_t zero413 = svdup_n_f32(0); svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v726, v301, 90); - svfloat32_t zero420; - asm volatile("mov %0.s, #0" : "=w"(zero420)); + svfloat32_t zero420 = svdup_n_f32(0); svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v727, v302, 90); - svfloat32_t zero434; - asm 
volatile("mov %0.s, #0" : "=w"(zero434)); + svfloat32_t zero434 = svdup_n_f32(0); svfloat32_t v434 = svcmla_f32_x(pred_full, zero434, v729, v303, 90); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v283), "w"(v285)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v285), "w"(v283)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v288), "w"(v290)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v295), "w"(v296)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v297), "w"(v298)); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v285, v283); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v288, v290); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v295, v296); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v297, v298); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v714, v288, 90); - svfloat32_t zero348; - asm volatile("mov %0.s, #0" : "=w"(zero348)); + svfloat32_t zero348 = svdup_n_f32(0); svfloat32_t v348 = svcmla_f32_x(pred_full, zero348, v715, v290, 90); - svfloat32_t v360; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v295), "w"(v717)); - svfloat32_t zero406; - asm volatile("mov %0.s, #0" : "=w"(zero406)); + svfloat32_t v360 = svmul_f32_x(svptrue_b32(), v295, v717); + svfloat32_t zero406 = svdup_n_f32(0); svfloat32_t v406 = svcmla_f32_x(pred_full, zero406, v725, v317, 90); - svfloat32_t zero427; - asm volatile("mov %0.s, #0" : "=w"(zero427)); + svfloat32_t zero427 = svdup_n_f32(0); svfloat32_t v427 = svcmla_f32_x(pred_full, zero427, v728, v318, 90); - svfloat32_t zero448; - asm volatile("mov %0.s, #0" : "=w"(zero448)); + svfloat32_t zero448 = svdup_n_f32(0); svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v731, v319, 90); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v312) : "w"(v710), "w"(v286)); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v710, v286); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v716, v314, 90); - svfloat32_t v370; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v315), "w"(v719)); + svfloat32_t v370 = svmul_f32_x(svptrue_b32(), v315, v719); svfloat32_t v450 = svmla_f32_x(pred_full, v360, v296, v718); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v392), "w"(v406)); - svfloat32_t v463; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v399), "w"(v406)); - svfloat32_t v464; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v413), "w"(v427)); - svfloat32_t v465; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v420), "w"(v427)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v434), "w"(v448)); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v392, v406); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v399, v406); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v413, v427); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v420, v427); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v434, v448); svfloat32_t v467 = svcmla_f32_x(pred_full, v448, v730, v304, 90); svfloat32_t v449 = svmls_f32_x(pred_full, v312, v286, v712); svfloat32_t v451 = svmls_f32_x(pred_full, v450, v313, v713); svfloat32_t v452 = svmla_f32_x(pred_full, v370, v296, v718); svfloat32_t v454 = svnmls_f32_x(pred_full, v360, v315, v719); - svfloat32_t v468; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v341), "w"(v355)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v348), "w"(v355)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v462), "w"(v466)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v464), "w"(v466)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v463), "w"(v467)); + svfloat32_t 
v468 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v348, v355); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v462, v466); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v464, v466); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v467); svst1_f64(pred_full, (double *)(v739), svreinterpret_f64_f32(v312)); svfloat32_t v453 = svmla_f32_x(pred_full, v452, v313, v713); svfloat32_t v455 = svmls_f32_x(pred_full, v454, v313, v713); svfloat32_t v456 = svmla_f32_x(pred_full, v449, v297, v720); svfloat32_t v458 = svmls_f32_x(pred_full, v449, v298, v721); svfloat32_t v460 = svmls_f32_x(pred_full, v449, v297, v720); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v469), "w"(v462)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v467), "w"(v468)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v469)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v482), "w"(v469)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v484), "w"(v468)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v468), "w"(v463)); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v469, v462); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v467, v468); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v480, v469); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v482, v469); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v484, v468); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v468, v463); svfloat32_t v457 = svmla_f32_x(pred_full, v456, v298, v721); svfloat32_t v459 = svmls_f32_x(pred_full, v458, v316, v722); svfloat32_t v461 = svmla_f32_x(pred_full, v460, v316, v722); - svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v476), "w"(v464)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v465)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v486), 
"w"(v465)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v451), "w"(v457)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v453), "w"(v459)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v459), "w"(v453)); - svfloat32_t v473; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v455), "w"(v461)); - svfloat32_t v474; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v457), "w"(v451)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v461), "w"(v455)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v470), "w"(v477)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v471), "w"(v479)); - svfloat32_t v490; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v472), "w"(v481)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v473), "w"(v483)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v474), "w"(v485)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v475), "w"(v487)); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v475), "w"(v487)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v474), "w"(v485)); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v473), "w"(v483)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v472), "w"(v481)); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v471), "w"(v479)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v470), "w"(v477)); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v476, v464); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v478, v465); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v486, v465); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v451, v457); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v453, v459); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v459, v453); + svfloat32_t v473 = 
svadd_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v457, v451); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v461, v455); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v470, v477); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v470, v477); svst1_f64(pred_full, (double *)(v748), svreinterpret_f64_f32(v488)); svst1_f64(pred_full, (double *)(v757), svreinterpret_f64_f32(v489)); svst1_f64(pred_full, (double *)(v766), svreinterpret_f64_f32(v490)); @@ -4971,8 +4484,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu14(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v162])); svfloat32_t v198 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v197])); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v668, v205, 0), v668, v205, 90); @@ -5008,250 +4520,158 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu14(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v695), v715)); svfloat32_t v706 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v704), v715)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v596, v37, 0), v596, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v605, v72, 0), v605, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v614, v79, 0), v614, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v623, v114, 0), v623, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v632, v121, 0), v632, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v641, v156, 0), v641, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v650, v163, 0), v650, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v659, v198, 0), v659, v198, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v679, v240, 0), v679, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero248, v688, v247, 0), v688, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v697, v282, 0), v697, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v706, v289, 0), v706, v289, 90); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v716), "w"(v38)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v716), "w"(v38)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v73), "w"(v80)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v73), "w"(v80)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v115), "w"(v122)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v115), "w"(v122)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v157), "w"(v164)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v157), "w"(v164)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v199), "w"(v206)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v199), "w"(v206)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v241), "w"(v248)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v241), "w"(v248)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v283), "w"(v290)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v283), "w"(v290)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v300), "w"(v310)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v300), "w"(v310)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v306), "w"(v304)); - 
svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v306), "w"(v304)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v302), "w"(v308)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v302), "w"(v308)); - svfloat32_t v401; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v301), "w"(v311)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v301), "w"(v311)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v307), "w"(v305)); - svfloat32_t v404; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v307), "w"(v305)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v303), "w"(v309)); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v303), "w"(v309)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v312), "w"(v314)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v312), "w"(v314)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v314), "w"(v316)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v316), "w"(v312)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v313), "w"(v315)); - svfloat32_t v326; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v313), "w"(v315)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v315), "w"(v317)); - svfloat32_t v328; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v317), "w"(v313)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v401), "w"(v403)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v401), "w"(v403)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v403), "w"(v405)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v405), "w"(v401)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v402), "w"(v404)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : 
"w"(v402), "w"(v404)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v404), "w"(v406)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v406), "w"(v402)); - svfloat32_t v319; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v318), "w"(v316)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v324), "w"(v317)); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v716, v38); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v716, v38); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v406 = 
svsub_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v312, v314); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v312, v314); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v316, v312); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v315, v317); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v317, v313); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v403, v405); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v401); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v406, v402); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v318, v316); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v324, v317); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v732, v326, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v733, v327, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v734, v328, 90); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v407), "w"(v405)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v406)); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v405); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v413, v406); + svfloat32_t zero456 = 
svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v732, v415, 90); - svfloat32_t zero463; - asm volatile("mov %0.s, #0" : "=w"(zero463)); + svfloat32_t zero463 = svdup_n_f32(0); svfloat32_t v463 = svcmla_f32_x(pred_full, zero463, v733, v416, 90); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v734, v417, 90); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v319), "w"(v298)); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v319, v298); + svfloat32_t zero360 = svdup_n_f32(0); svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v731, v325, 90); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v299)); - svfloat32_t zero449; - asm volatile("mov %0.s, #0" : "=w"(zero449)); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v299); + svfloat32_t zero449 = svdup_n_f32(0); svfloat32_t v449 = svcmla_f32_x(pred_full, zero449, v731, v414, 90); svfloat32_t v382 = svmla_f32_x(pred_full, v320, v319, v727); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v360), "w"(v367)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v360), "w"(v367)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v360), "w"(v374)); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v360, v374); svfloat32_t v471 = svmla_f32_x(pred_full, v409, v408, v727); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v449), "w"(v456)); - svfloat32_t v480; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v449), "w"(v456)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v449), "w"(v463)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v449, v456); + 
svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v449, v456); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v449, v463); svst1_f64(pred_full, (double *)(v742), svreinterpret_f64_f32(v320)); svst1_f64(pred_full, (double *)(v751), svreinterpret_f64_f32(v409)); svfloat32_t v383 = svmla_f32_x(pred_full, v382, v321, v728); svfloat32_t v385 = svmls_f32_x(pred_full, v382, v321, v728); svfloat32_t v387 = svmls_f32_x(pred_full, v382, v322, v729); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v374)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v381)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v381)); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v374); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, v381); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v381); svfloat32_t v472 = svmla_f32_x(pred_full, v471, v410, v728); svfloat32_t v474 = svmls_f32_x(pred_full, v471, v410, v728); svfloat32_t v476 = svmls_f32_x(pred_full, v471, v411, v729); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v463)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v470)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v482), "w"(v470)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v463); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v482, v470); svfloat32_t v384 = svmla_f32_x(pred_full, v383, v322, v729); svfloat32_t v386 = svmls_f32_x(pred_full, v385, v323, v730); svfloat32_t v388 = svmla_f32_x(pred_full, v387, v323, v730); svfloat32_t v473 = svmla_f32_x(pred_full, v472, v411, v729); svfloat32_t v475 = svmls_f32_x(pred_full, v474, v412, v730); svfloat32_t v477 = svmla_f32_x(pred_full, v476, v412, v730); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v384), "w"(v390)); - svfloat32_t v396; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v384), "w"(v390)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v386), "w"(v392)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v386), "w"(v392)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v388), "w"(v394)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v388), "w"(v394)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v473), "w"(v479)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v473), "w"(v479)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v475), "w"(v481)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v475), "w"(v481)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v477), "w"(v483)); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v477), "w"(v483)); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v477, v483); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v477, v483); svst1_f64(pred_full, (double *)(v760), svreinterpret_f64_f32(v396)); svst1_f64(pred_full, (double *)(v769), svreinterpret_f64_f32(v485)); svst1_f64(pred_full, (double *)(v778), svreinterpret_f64_f32(v398)); @@ -5752,8 +5172,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu15(const armral_cmplx_f32_t *restrict x, 
svld1_f64(pred_full, &((const double *)v7)[v113])); svfloat32_t v149 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v148])); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v647, v156, 0), v647, v156, 90); @@ -5797,73 +5216,53 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v701), v721)); svfloat32_t v712 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v710), v721)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v593, v51, 0), v593, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v602, v58, 0), v602, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v611, v93, 0), v611, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v620, v100, 0), v620, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v638, v149, 0), v638, v149, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v667, v205, 0), v667, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, 
#0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v676, v212, 0), v676, v212, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v694, v261, 0), v694, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v703, v268, 0), v703, v268, 90); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v52), "w"(v59)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v52), "w"(v59)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v94), "w"(v101)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v94), "w"(v101)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v150), "w"(v157)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v150), "w"(v157)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v206), "w"(v213)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v206), "w"(v213)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v262), "w"(v269)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v262), "w"(v269)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v284), "w"(v722)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v206, v213); + 
svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v284, v722); svfloat32_t v296 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v294, v629, v114, 0), v629, v114, 90); @@ -5876,86 +5275,51 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v305 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v303, v712, v282, 0), v712, v282, 90); - svfloat32_t v359; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v294), "w"(v303)); - svfloat32_t v360; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v294), "w"(v303)); - svfloat32_t v361; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v300), "w"(v297)); - svfloat32_t v362; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v300), "w"(v297)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v295), "w"(v304)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v295), "w"(v304)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v301), "w"(v298)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v301), "w"(v298)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v296), "w"(v305)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v296), "w"(v305)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v302), "w"(v299)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v302), "w"(v299)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v359), "w"(v361)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v359), "w"(v361)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v360), "w"(v362)); - svfloat32_t zero388; - asm volatile("mov %0.s, #0" : "=w"(zero388)); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), 
v294, v303); + svfloat32_t v360 = svsub_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v300, v297); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), v300, v297); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t zero388 = svdup_n_f32(0); svfloat32_t v388 = svcmla_f32_x(pred_full, zero388, v732, v360, 90); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v412), "w"(v414)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v412), "w"(v414)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v413), "w"(v415)); - svfloat32_t v455; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v415), "w"(v740)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v306), "w"(v308)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v306), "w"(v308)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t zero335; - asm volatile("mov %0.s, #0" : "=w"(zero335)); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v455 = svmul_f32_x(svptrue_b32(), v415, v740); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v311 = 
svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero335 = svdup_n_f32(0); svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, v726, v307, 90); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v363), "w"(v284)); - svfloat32_t v376; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v363), "w"(v730)); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v363, v284); + svfloat32_t v376 = svmul_f32_x(svptrue_b32(), v363, v730); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v733, v365, 90); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v416), "w"(v285)); - svfloat32_t zero440; - asm volatile("mov %0.s, #0" : "=w"(zero440)); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v416, v285); + svfloat32_t zero440 = svdup_n_f32(0); svfloat32_t v440 = svcmla_f32_x(pred_full, zero440, v737, v417, 90); - svfloat32_t v450; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v418), "w"(v739)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v310), "w"(v293)); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); + svfloat32_t v450 = svmul_f32_x(svptrue_b32(), v418, v739); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v310, v293); + svfloat32_t zero342 = svdup_n_f32(0); svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v727, v312, 90); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v388), "w"(v395)); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v388, v395); svfloat32_t v407 = svcmla_f32_x(pred_full, v395, v734, v362, 90); - svfloat32_t zero426; - asm volatile("mov %0.s, #0" : "=w"(zero426)); + svfloat32_t zero426 = svdup_n_f32(0); svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v735, v419, 90); svfloat32_t v459 = svnmls_f32_x(pred_full, v450, v413, v738); svfloat32_t v460 = svmla_f32_x(pred_full, v455, 
v418, v739); svfloat32_t v350 = svmla_f32_x(pred_full, v313, v310, v724); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v335), "w"(v342)); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v335, v342); svfloat32_t v354 = svcmla_f32_x(pred_full, v342, v728, v309, 90); svfloat32_t v403 = svmla_f32_x(pred_full, v376, v366, v729); svfloat32_t v456 = svcmla_f32_x(pred_full, v426, v736, v416, 90); @@ -5965,68 +5329,40 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v352 = svmls_f32_x(pred_full, v350, v311, v725); svfloat32_t v404 = svmla_f32_x(pred_full, v403, v364, v731); svfloat32_t v405 = svmls_f32_x(pred_full, v403, v364, v731); - svfloat32_t v457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v456), "w"(v440)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v456), "w"(v440)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v465), "w"(v426)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v465), "w"(v426)); - svfloat32_t v355; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v351), "w"(v353)); - svfloat32_t v356; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v351), "w"(v353)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v352), "w"(v354)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v352), "w"(v354)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v404), "w"(v406)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v404), "w"(v406)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v405), "w"(v407)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v405), "w"(v407)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v457), "w"(v459)); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v457), "w"(v459)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : 
"w"(v458), "w"(v460)); - svfloat32_t v464; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v458), "w"(v460)); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v465, v426); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v465, v426); + svfloat32_t v355 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v458, v460); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v458, v460); svst1_f64(pred_full, (double *)(v757), svreinterpret_f64_f32(v467)); svst1_f64(pred_full, (double *)(v766), svreinterpret_f64_f32(v466)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v356), "w"(v409)); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v358), "w"(v411)); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v357), "w"(v410)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v355), "w"(v408)); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v356, v409); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v358, v411); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v357, v410); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v355, v408); svst1_f64(pred_full, (double *)(v775), svreinterpret_f64_f32(v356)); svst1_f64(pred_full, (double *)(v802), svreinterpret_f64_f32(v358)); svst1_f64(pred_full, (double *)(v829), 
svreinterpret_f64_f32(v357)); svst1_f64(pred_full, (double *)(v856), svreinterpret_f64_f32(v355)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v462)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v489), "w"(v462)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v513), "w"(v464)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v513), "w"(v464)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v537), "w"(v463)); - svfloat32_t v539; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v537), "w"(v463)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v561), "w"(v461)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v561), "w"(v461)); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v462); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v489, v462); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v513, v464); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v513, v464); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v537, v463); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v537, v463); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v561, v461); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v561, v461); svst1_f64(pred_full, (double *)(v784), svreinterpret_f64_f32(v491)); svst1_f64(pred_full, (double *)(v793), svreinterpret_f64_f32(v490)); svst1_f64(pred_full, (double *)(v811), svreinterpret_f64_f32(v515)); @@ -6490,8 +5826,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu16(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v155])); svfloat32_t v163 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v162])); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v713, v198, 0), v713, v198, 90); @@ -6537,262 +5872,165 @@ void 
armral_fft_cf32_cf32_cf32_ab_t_gu16(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v767), v787)); svfloat32_t v778 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v776), v787)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v650, v37, 0), v650, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v659, v72, 0), v659, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v668, v79, 0), v668, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v677, v114, 0), v677, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v686, v121, 0), v686, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v695, v156, 0), v695, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v704, v163, 0), v704, v163, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v723, v205, 0), v723, v205, 90); - 
svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v733, v240, 0), v733, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v742, v247, 0), v742, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v751, v282, 0), v751, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v760, v289, 0), v760, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v769, v324, 0), v769, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v778, v331, 0), v778, v331, 90); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v788), "w"(v38)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v788), "w"(v38)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v73), "w"(v80)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v73), "w"(v80)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v115), "w"(v122)); - svfloat32_t v345; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v115), "w"(v122)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v157), "w"(v164)); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : 
"w"(v157), "w"(v164)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v199), "w"(v206)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v199), "w"(v206)); - svfloat32_t v350; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v241), "w"(v248)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v241), "w"(v248)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v283), "w"(v290)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v283), "w"(v290)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v325), "w"(v332)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v325), "w"(v332)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v340), "w"(v342)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v340), "w"(v342)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v344), "w"(v346)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v344), "w"(v346)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v348), "w"(v350)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v348), "w"(v350)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v352), "w"(v354)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v352), "w"(v354)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v345), "w"(v347)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v345), "w"(v347)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v349), "w"(v355)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v349), "w"(v355)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v351), "w"(v353)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v351), "w"(v353)); - svfloat32_t zero437; - asm 
volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v788, v38); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v788, v38); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x(pred_full, zero437, v798, v343, 90); - svfloat32_t v364; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v356), "w"(v358)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v356), "w"(v358)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v360), "w"(v362)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v360), "w"(v362)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v361), "w"(v363)); - svfloat32_t v371; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v361), "w"(v363)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v374), "w"(v376)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v375), "w"(v377)); - svfloat32_t zero413; - asm volatile("mov %0.s, #0" : "=w"(zero413)); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v375, v377); + svfloat32_t zero413 = svdup_n_f32(0); svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v798, v359, 90); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v799, v372, 90); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v803, v376, 90); - svfloat32_t v480; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v375), "w"(v805)); - svfloat32_t v485; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v377), "w"(v806)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v364), "w"(v366)); - svfloat32_t v369; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v364), "w"(v366)); - svfloat32_t zero401; - asm volatile("mov %0.s, #0" : "=w"(zero401)); + svfloat32_t v480 = svmul_f32_x(svptrue_b32(), v375, v805); + svfloat32_t v485 = svmul_f32_x(svptrue_b32(), v377, v806); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v364, v366); + svfloat32_t zero401 = svdup_n_f32(0); svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v798, v367, 90); - svfloat32_t zero420; - asm volatile("mov %0.s, #0" : "=w"(zero420)); + svfloat32_t zero420 = svdup_n_f32(0); svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v799, v370, 90); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v801, v378, 90); - svfloat32_t v475; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v379), "w"(v804)); + svfloat32_t v475 = svmul_f32_x(svptrue_b32(), v379, v804); svfloat32_t v496 = svmla_f32_x(pred_full, v341, v373, v800); svfloat32_t v497 = svmls_f32_x(pred_full, v341, v373, v800); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v437), "w"(v444)); - svfloat32_t v499; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v437), "w"(v444)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v365), "w"(v401)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v365), "w"(v401)); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v365, v401); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v365, v401); svfloat32_t v488 = svmla_f32_x(pred_full, v357, v371, v800); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v413), "w"(v420)); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v413, v420); svfloat32_t v490 = svmls_f32_x(pred_full, v357, v371, v800); - 
svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v420), "w"(v413)); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v420, v413); svfloat32_t v500 = svcmla_f32_x(pred_full, v456, v802, v374, 90); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v456), "w"(v470)); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v456, v470); svfloat32_t v502 = svnmls_f32_x(pred_full, v475, v375, v805); svfloat32_t v503 = svnmls_f32_x(pred_full, v475, v377, v806); svfloat32_t v504 = svnmls_f32_x(pred_full, v480, v379, v804); svfloat32_t v505 = svnmls_f32_x(pred_full, v485, v379, v804); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v497), "w"(v499)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v497), "w"(v499)); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v497, v499); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v497, v499); svst1_f64(pred_full, (double *)(v814), svreinterpret_f64_f32(v368)); svst1_f64(pred_full, (double *)(v886), svreinterpret_f64_f32(v369)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v488), "w"(v489)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v490), "w"(v491)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v490), "w"(v491)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v488), "w"(v489)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v496), "w"(v502)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v496), "w"(v502)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v496), "w"(v504)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v496), "w"(v504)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v497), "w"(v505)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v497), "w"(v505)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : 
"w"(v500), "w"(v498)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v500), "w"(v498)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v501), "w"(v503)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v501), "w"(v503)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v501), "w"(v499)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v501), "w"(v499)); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v501, v499); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v501, v499); svst1_f64(pred_full, (double *)(v850), svreinterpret_f64_f32(v487)); svst1_f64(pred_full, (double *)(v922), svreinterpret_f64_f32(v486)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v506), "w"(v516)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v507), "w"(v517)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v508), "w"(v517)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v509), "w"(v516)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v526) : 
"w"(v510), "w"(v518)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v511), "w"(v519)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v512), "w"(v521)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v513), "w"(v520)); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v506, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v507, v517); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v508, v517); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v509, v516); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v510, v518); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v511, v519); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v512, v521); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v513, v520); svst1_f64(pred_full, (double *)(v832), svreinterpret_f64_f32(v495)); svst1_f64(pred_full, (double *)(v868), svreinterpret_f64_f32(v494)); svst1_f64(pred_full, (double *)(v904), svreinterpret_f64_f32(v493)); @@ -7527,8 +6765,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu17(const armral_cmplx_f32_t *restrict x, float32x2_t *v1184 = &v6[v837]; float32x2_t *v1193 = &v6[v845]; float32x2_t *v1202 = &v6[v853]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v867, v51, 0), v867, v51, 90); @@ -7592,242 +6829,151 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu17(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v993), v1013)); svfloat32_t v1004 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1002), v1013)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v877, v58, 0), v877, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = 
svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v887, v93, 0), v887, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v896, v100, 0), v896, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v905, v135, 0), v905, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v914, v142, 0), v914, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v923, v177, 0), v923, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v932, v184, 0), v932, v184, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v941, v219, 0), v941, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v950, v226, 0), v950, v226, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v959, v261, 0), v959, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); 
svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v968, v268, 0), v968, v268, 90); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero304, v977, v303, 0), v977, v303, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero311, v986, v310, 0), v986, v310, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero346, v995, v345, 0), v995, v345, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v1004, v352, 0), v1004, v352, 90); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v52), "w"(v59)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v52), "w"(v59)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v94), "w"(v101)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v94), "w"(v101)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v136), "w"(v143)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v136), "w"(v143)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v178), "w"(v185)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v178), "w"(v185)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v220), "w"(v227)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v220), "w"(v227)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v262), "w"(v269)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v365) : "w"(v262), "w"(v269)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v304), "w"(v311)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v304), "w"(v311)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v346), "w"(v353)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v346), "w"(v353)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v354), "w"(v362)); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v356), "w"(v364)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v358), "w"(v366)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v360), "w"(v368)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v354), "w"(v362)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v356), "w"(v364)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v358), "w"(v366)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v360), "w"(v368)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v355), "w"(v359)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v357), "w"(v361)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v355), "w"(v359)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v369), "w"(v365)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v363), "w"(v367)); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v365), "w"(v369)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v363), "w"(v367)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v357), "w"(v361)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v355), "w"(v363)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v361), "w"(v369)); - svfloat32_t v374; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v370), "w"(v372)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v371), "w"(v373)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v370), "w"(v372)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v371), "w"(v373)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v377), "w"(v379)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v376), "w"(v378)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v378), "w"(v379)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v376), "w"(v377)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v390), "w"(v391)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v394), "w"(v395)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v390), "w"(v391)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v394), "w"(v395)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v392), "w"(v393)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v396), "w"(v397)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v392), "w"(v393)); - svfloat32_t v408; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v396), "w"(v397)); - svfloat32_t v447; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v378), "w"(v1018)); - svfloat32_t zero614; - asm volatile("mov %0.s, #0" : "=w"(zero614)); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v361 = 
svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v365, v369); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v355, v363); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v361, v369); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v377, v379); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v376, v378); + 
svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v378, v379); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v376, v377); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v447 = svmul_f32_x(svptrue_b32(), v378, v1018); + svfloat32_t zero614 = svdup_n_f32(0); svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1045, v411, 90); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v374), "w"(v375)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v374), "w"(v375)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v385), "w"(v384)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v380), "w"(v381)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v398), "w"(v399)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v401), "w"(v402)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v404), "w"(v405)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v407), "w"(v408)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v405), "w"(v399)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v398), "w"(v404)); - svfloat32_t v457; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v380), "w"(v1020)); - svfloat32_t v462; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v381), "w"(v1021)); - svfloat32_t v492; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v387), "w"(v1027)); - svfloat32_t v497; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v497) : 
"w"(v388), "w"(v1028)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v355)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v415), "w"(v361)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v1014), "w"(v382)); - svfloat32_t v487; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v386), "w"(v1026)); - svfloat32_t zero523; - asm volatile("mov %0.s, #0" : "=w"(zero523)); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v385, v384); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v380, v381); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v398, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v401, v402); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v404, v405); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v407, v408); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v399); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v457 = svmul_f32_x(svptrue_b32(), v380, v1020); + svfloat32_t v462 = svmul_f32_x(svptrue_b32(), v381, v1021); + svfloat32_t v492 = svmul_f32_x(svptrue_b32(), v387, v1027); + svfloat32_t v497 = svmul_f32_x(svptrue_b32(), v388, v1028); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v355); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v415, v361); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v1014, v382); + svfloat32_t v487 = svmul_f32_x(svptrue_b32(), v386, v1026); + svfloat32_t zero523 = svdup_n_f32(0); svfloat32_t v523 = svcmla_f32_x(pred_full, zero523, v1032, v400, 90); - svfloat32_t zero544; - asm volatile("mov %0.s, #0" : "=w"(zero544)); + svfloat32_t zero544 = svdup_n_f32(0); svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1035, v403, 90); - svfloat32_t zero565; - asm volatile("mov %0.s, #0" : "=w"(zero565)); + svfloat32_t zero565 = svdup_n_f32(0); svfloat32_t v565 = svcmla_f32_x(pred_full, 
zero565, v1038, v406, 90); - svfloat32_t zero586; - asm volatile("mov %0.s, #0" : "=w"(zero586)); + svfloat32_t zero586 = svdup_n_f32(0); svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1041, v409, 90); svfloat32_t v652 = svmla_f32_x(pred_full, v492, v379, v1019); svfloat32_t v653 = svnmls_f32_x(pred_full, v447, v387, v1027); svfloat32_t v654 = svmla_f32_x(pred_full, v497, v377, v1017); svfloat32_t v655 = svnmls_f32_x(pred_full, v497, v376, v1016); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v411)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v416), "w"(v363)); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v413, v411); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v363); svfloat32_t v650 = svmla_f32_x(pred_full, v487, v384, v1024); svfloat32_t v651 = svnmls_f32_x(pred_full, v487, v385, v1025); svfloat32_t v656 = svnmls_f32_x(pred_full, v462, v389, v1029); @@ -7842,160 +6988,91 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu17(const armral_cmplx_f32_t *restrict x, svfloat32_t v683 = svcmla_f32_x(pred_full, v586, v1039, v407, 90); svfloat32_t v684 = svcmla_f32_x(pred_full, v586, v1040, v408, 90); svst1_f64(pred_full, (double *)(v1058), svreinterpret_f64_f32(v427)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v417), "w"(v369)); - svfloat32_t zero635; - asm volatile("mov %0.s, #0" : "=w"(zero635)); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v417, v369); + svfloat32_t zero635 = svdup_n_f32(0); svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1048, v414, 90); svfloat32_t v659 = svmla_f32_x(pred_full, v658, v383, v1023); svfloat32_t v660 = svmls_f32_x(pred_full, v658, v383, v1023); - svfloat32_t v661; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v661) : "w"(v650), "w"(v652)); - svfloat32_t v663; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v651), "w"(v653)); - svfloat32_t v665; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v650), "w"(v654)); - svfloat32_t v667; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v651), "w"(v655)); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v677), "w"(v679)); - svfloat32_t v689; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v677), "w"(v679)); - svfloat32_t v690; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v680)); - svfloat32_t v691; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v678), "w"(v680)); - svfloat32_t v692; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v681), "w"(v683)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v683), "w"(v681)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v682), "w"(v684)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v684), "w"(v682)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v414), "w"(v418)); - svfloat32_t zero642; - asm volatile("mov %0.s, #0" : "=w"(zero642)); + svfloat32_t v661 = svsub_f32_x(svptrue_b32(), v650, v652); + svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v651, v653); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v650, v654); + svfloat32_t v667 = svadd_f32_x(svptrue_b32(), v651, v655); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v690 = svadd_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v683, v681); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v682, v684); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v684, v682); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v414, v418); + svfloat32_t zero642 = svdup_n_f32(0); svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1049, v418, 90); - svfloat32_t v662; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v656), "w"(v659)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v657), 
"w"(v660)); - svfloat32_t v666; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v659), "w"(v656)); - svfloat32_t v668; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v660), "w"(v657)); - svfloat32_t v705; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v690), "w"(v694)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v689), "w"(v695)); - svfloat32_t v709; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v688), "w"(v692)); - svfloat32_t v711; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v695), "w"(v689)); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v688), "w"(v692)); - svfloat32_t v716; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v693), "w"(v691)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v694), "w"(v690)); - svfloat32_t v722; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v691), "w"(v693)); - svfloat32_t v669; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v661), "w"(v662)); - svfloat32_t v670; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v670) : "w"(v663), "w"(v664)); - svfloat32_t v671; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v665), "w"(v666)); - svfloat32_t v672; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v667), "w"(v668)); - svfloat32_t v673; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v662), "w"(v661)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v664), "w"(v663)); - svfloat32_t v675; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v666), "w"(v665)); - svfloat32_t v676; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v668), "w"(v667)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v635), "w"(v642)); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v656, v659); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v657, v660); + svfloat32_t v666 = svsub_f32_x(svptrue_b32(), v659, v656); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v660, v657); + svfloat32_t v705 = svadd_f32_x(svptrue_b32(), v690, v694); + 
svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v689, v695); + svfloat32_t v709 = svsub_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v711 = svsub_f32_x(svptrue_b32(), v695, v689); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v693, v691); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v694, v690); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v661, v662); + svfloat32_t v670 = svadd_f32_x(svptrue_b32(), v663, v664); + svfloat32_t v671 = svadd_f32_x(svptrue_b32(), v665, v666); + svfloat32_t v672 = svadd_f32_x(svptrue_b32(), v667, v668); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v662, v661); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v664, v663); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v666, v665); + svfloat32_t v676 = svsub_f32_x(svptrue_b32(), v668, v667); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v635, v642); svfloat32_t v685 = svcmla_f32_x(pred_full, v642, v1050, v419, 90); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v696), "w"(v696)); - svfloat32_t v723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v722), "w"(v696)); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v696, v696); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v722, v696); svfloat32_t v686 = svcmla_f32_x(pred_full, v685, v1042, v410, 90); - svfloat32_t v699; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v614), "w"(v698)); - svfloat32_t v702; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v685), "w"(v685)); - svfloat32_t v720; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v720) : "w"(v719), "w"(v698)); - svfloat32_t v763; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v676), "w"(v723)); - svfloat32_t v771; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v676), "w"(v723)); + svfloat32_t v699 = svsub_f32_x(svptrue_b32(), v614, v698); + svfloat32_t v702 = svadd_f32_x(svptrue_b32(), v685, v685); + svfloat32_t v720 = 
svadd_f32_x(svptrue_b32(), v719, v698); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v676, v723); + svfloat32_t v771 = svsub_f32_x(svptrue_b32(), v676, v723); svfloat32_t v687 = svcmla_f32_x(pred_full, v686, v1043, v355, 90); svfloat32_t v697 = svcmla_f32_x(pred_full, v686, v1044, v363, 90); svfloat32_t v700 = svcmla_f32_x(pred_full, v699, v1046, v361, 90); svfloat32_t v701 = svcmla_f32_x(pred_full, v699, v1047, v369, 90); - svfloat32_t v703; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v702), "w"(v702)); - svfloat32_t v704; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v696), "w"(v702)); - svfloat32_t v710; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v710) : "w"(v709), "w"(v702)); - svfloat32_t v721; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v720), "w"(v702)); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v702, v702); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v696, v702); + svfloat32_t v710 = svadd_f32_x(svptrue_b32(), v709, v702); + svfloat32_t v721 = svadd_f32_x(svptrue_b32(), v720, v702); svst1_f64(pred_full, (double *)(v1103), svreinterpret_f64_f32(v763)); svst1_f64(pred_full, (double *)(v1112), svreinterpret_f64_f32(v771)); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v705), "w"(v697)); - svfloat32_t v708; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v707), "w"(v700)); - svfloat32_t v712; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v712) : "w"(v711), "w"(v704)); - svfloat32_t v714; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v713), "w"(v687)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v716), "w"(v701)); - svfloat32_t v747; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v671), "w"(v710)); - svfloat32_t v755; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v671), "w"(v710)); - svfloat32_t v843; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v843) : "w"(v675), "w"(v721)); - svfloat32_t v851; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v851) : "w"(v675), "w"(v721)); - svfloat32_t v715; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v715) : "w"(v714), "w"(v696)); - svfloat32_t v718; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v717), "w"(v703)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v669), "w"(v706)); - svfloat32_t v739; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v669), "w"(v706)); - svfloat32_t v795; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v672), "w"(v712)); - svfloat32_t v803; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v803) : "w"(v672), "w"(v712)); - svfloat32_t v811; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v811) : "w"(v670), "w"(v708)); - svfloat32_t v819; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v819) : "w"(v670), "w"(v708)); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v705, v697); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v707, v700); + svfloat32_t v712 = svsub_f32_x(svptrue_b32(), v711, v704); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v713, v687); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v716, v701); + svfloat32_t v747 = svadd_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v755 = svsub_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v843 = svadd_f32_x(svptrue_b32(), v675, v721); + svfloat32_t v851 = svsub_f32_x(svptrue_b32(), v675, v721); + svfloat32_t v715 = svadd_f32_x(svptrue_b32(), v714, v696); + svfloat32_t v718 = svadd_f32_x(svptrue_b32(), v717, v703); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v669, v706); + svfloat32_t v739 = svsub_f32_x(svptrue_b32(), v669, v706); + svfloat32_t v795 = svadd_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v811 = svadd_f32_x(svptrue_b32(), v670, v708); + svfloat32_t v819 = svsub_f32_x(svptrue_b32(), v670, v708); svst1_f64(pred_full, (double *)(v1085), svreinterpret_f64_f32(v747)); svst1_f64(pred_full, (double *)(v1094), svreinterpret_f64_f32(v755)); svst1_f64(pred_full, (double *)(v1193), svreinterpret_f64_f32(v843)); svst1_f64(pred_full, (double *)(v1202), svreinterpret_f64_f32(v851)); - svfloat32_t 
v779; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v779) : "w"(v673), "w"(v715)); - svfloat32_t v787; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v787) : "w"(v673), "w"(v715)); - svfloat32_t v827; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v827) : "w"(v674), "w"(v718)); - svfloat32_t v835; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v674), "w"(v718)); + svfloat32_t v779 = svadd_f32_x(svptrue_b32(), v673, v715); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v673, v715); + svfloat32_t v827 = svadd_f32_x(svptrue_b32(), v674, v718); + svfloat32_t v835 = svsub_f32_x(svptrue_b32(), v674, v718); svst1_f64(pred_full, (double *)(v1067), svreinterpret_f64_f32(v731)); svst1_f64(pred_full, (double *)(v1076), svreinterpret_f64_f32(v739)); svst1_f64(pred_full, (double *)(v1139), svreinterpret_f64_f32(v795)); @@ -8534,8 +7611,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu18(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v204])); svfloat32_t v240 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v239])); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v844, v247, 0), v844, v247, 90); @@ -8583,289 +7659,181 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu18(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v889), v909)); svfloat32_t v900 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v898), v909)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v754, v37, 0), v754, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v763, v72, 0), v763, v72, 90); - svfloat32_t zero80; - asm 
volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v772, v79, 0), v772, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v781, v114, 0), v781, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v790, v121, 0), v790, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v799, v156, 0), v799, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v808, v163, 0), v808, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v817, v198, 0), v817, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v826, v205, 0), v826, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v835, v240, 0), v835, v240, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v855, v282, 0), v855, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : 
"=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v864, v289, 0), v864, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v873, v324, 0), v873, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v882, v331, 0), v882, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v891, v366, 0), v891, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero374, v900, v373, 0), v900, v373, 90); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v910), "w"(v38)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v910), "w"(v38)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v73), "w"(v80)); - svfloat32_t v385; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v73), "w"(v80)); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v115), "w"(v122)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v115), "w"(v122)); - svfloat32_t v388; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v157), "w"(v164)); - svfloat32_t v389; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v157), "w"(v164)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v199), "w"(v206)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v199), "w"(v206)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v241), 
"w"(v248)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v241), "w"(v248)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v283), "w"(v290)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v283), "w"(v290)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v325), "w"(v332)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v325), "w"(v332)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v367), "w"(v374)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v367), "w"(v374)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v384), "w"(v398)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v384), "w"(v398)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v396), "w"(v386)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v396), "w"(v386)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v388), "w"(v394)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v388), "w"(v394)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v390), "w"(v392)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v390), "w"(v392)); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v385), "w"(v399)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v385), "w"(v399)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v397), "w"(v387)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v397), "w"(v387)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v389), "w"(v395)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v389), "w"(v395)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v391), "w"(v393)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v517) : "w"(v391), "w"(v393)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v400), "w"(v402)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v401), "w"(v403)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v400), "w"(v402)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v402), "w"(v406)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v406), "w"(v400)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v401), "w"(v403)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v403), "w"(v407)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v407), "w"(v401)); - svfloat32_t zero448; - asm volatile("mov %0.s, #0" : "=w"(zero448)); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v910, v38); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v910, v38); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v401 = 
svsub_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v406); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v406, v400); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v403, v407); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v407, v401); + svfloat32_t zero448 = svdup_n_f32(0); svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v926, v405, 90); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v510), "w"(v512)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v511), "w"(v513)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v510), "w"(v512)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v512), "w"(v516)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v516), "w"(v510)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : 
"w"(v511), "w"(v513)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v513), "w"(v517)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v517), "w"(v511)); - svfloat32_t zero558; - asm volatile("mov %0.s, #0" : "=w"(zero558)); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v512, v516); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v516, v510); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v513, v517); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v517, v511); + svfloat32_t zero558 = svdup_n_f32(0); svfloat32_t v558 = svcmla_f32_x(pred_full, zero558, v926, v515, 90); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v406)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v407)); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v406); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v407); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v930, v417, 90); - svfloat32_t zero477; - asm volatile("mov %0.s, #0" : "=w"(zero477)); + svfloat32_t zero477 = svdup_n_f32(0); svfloat32_t v477 = svcmla_f32_x(pred_full, zero477, v931, v418, 90); - svfloat32_t zero484; - asm volatile("mov %0.s, #0" : "=w"(zero484)); + svfloat32_t zero484 = svdup_n_f32(0); svfloat32_t v484 = svcmla_f32_x(pred_full, zero484, v932, v419, 90); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v518), "w"(v516)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v522), "w"(v517)); - svfloat32_t zero580; - asm volatile("mov %0.s, #0" : "=w"(zero580)); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), 
v518, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v522, v517); + svfloat32_t zero580 = svdup_n_f32(0); svfloat32_t v580 = svcmla_f32_x(pred_full, zero580, v930, v527, 90); - svfloat32_t zero587; - asm volatile("mov %0.s, #0" : "=w"(zero587)); + svfloat32_t zero587 = svdup_n_f32(0); svfloat32_t v587 = svcmla_f32_x(pred_full, zero587, v931, v528, 90); - svfloat32_t zero594; - asm volatile("mov %0.s, #0" : "=w"(zero594)); + svfloat32_t zero594 = svdup_n_f32(0); svfloat32_t v594 = svcmla_f32_x(pred_full, zero594, v932, v529, 90); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v409), "w"(v404)); - svfloat32_t v429; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v409), "w"(v923)); - svfloat32_t zero436; - asm volatile("mov %0.s, #0" : "=w"(zero436)); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v409, v404); + svfloat32_t v429 = svmul_f32_x(svptrue_b32(), v409, v923); + svfloat32_t zero436 = svdup_n_f32(0); svfloat32_t v436 = svcmla_f32_x(pred_full, zero436, v926, v413, 90); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v448), "w"(v470)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v448), "w"(v477)); - svfloat32_t v502; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v448), "w"(v470)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v519), "w"(v514)); - svfloat32_t v539; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v519), "w"(v923)); - svfloat32_t zero546; - asm volatile("mov %0.s, #0" : "=w"(zero546)); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v448, v477); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v514); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v519, v923); + svfloat32_t zero546 = svdup_n_f32(0); svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v926, v523, 90); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v608) : "w"(v558), "w"(v580)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v558), "w"(v587)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v558), "w"(v580)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v410), "w"(v382)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v429), "w"(v429)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v477)); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v484)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v484)); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v383)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v539), "w"(v539)); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v587)); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v610), "w"(v594)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v612), "w"(v594)); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v558, v587); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v410, v382); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v429, v429); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v477); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v484); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v502, v484); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v383); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v539, v539); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v587); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v610, v594); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v612, v594); svfloat32_t v486 = svmla_f32_x(pred_full, v485, v409, v923); svfloat32_t v490 = svmla_f32_x(pred_full, v411, v404, 
v925); svfloat32_t v596 = svmla_f32_x(pred_full, v595, v519, v923); svfloat32_t v600 = svmla_f32_x(pred_full, v521, v514, v925); svst1_f64(pred_full, (double *)(v940), svreinterpret_f64_f32(v411)); svst1_f64(pred_full, (double *)(v949), svreinterpret_f64_f32(v521)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v411), "w"(v486)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v490), "w"(v485)); - svfloat32_t v597; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v521), "w"(v596)); - svfloat32_t v601; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v600), "w"(v595)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v436)); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v487), "w"(v436)); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v411, v486); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v490, v485); + svfloat32_t v597 = svadd_f32_x(svptrue_b32(), v521, v596); + svfloat32_t v601 = svadd_f32_x(svptrue_b32(), v600, v595); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v487, v436); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v487, v436); svfloat32_t v492 = svmla_f32_x(pred_full, v491, v414, v927); svfloat32_t v494 = svmls_f32_x(pred_full, v491, v415, v928); svfloat32_t v496 = svmls_f32_x(pred_full, v491, v414, v927); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v597), "w"(v546)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v597), "w"(v546)); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v597, v546); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v597, v546); svfloat32_t v602 = svmla_f32_x(pred_full, v601, v524, v927); svfloat32_t v604 = svmls_f32_x(pred_full, v601, v525, v928); svfloat32_t v606 = svmls_f32_x(pred_full, v601, v524, v927); @@ -8879,30 +7847,18 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu18(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v1003), svreinterpret_f64_f32(v599)); 
svst1_f64(pred_full, (double *)(v1048), svreinterpret_f64_f32(v488)); svst1_f64(pred_full, (double *)(v1057), svreinterpret_f64_f32(v598)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v493), "w"(v499)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v493), "w"(v499)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v495), "w"(v501)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v495), "w"(v501)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v497), "w"(v503)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v497), "w"(v503)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v603), "w"(v609)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v603), "w"(v609)); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v605), "w"(v611)); - svfloat32_t v617; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v605), "w"(v611)); - svfloat32_t v618; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v607), "w"(v613)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v607), "w"(v613)); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v605, v611); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v605, v611); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v607, v613); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v613); svst1_f64(pred_full, (double *)(v958), svreinterpret_f64_f32(v505)); 
svst1_f64(pred_full, (double *)(v967), svreinterpret_f64_f32(v615)); svst1_f64(pred_full, (double *)(v976), svreinterpret_f64_f32(v506)); @@ -9703,8 +8659,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu19(const armral_cmplx_f32_t *restrict x, float32x2_t *v1319 = &v6[v933]; float32x2_t *v1328 = &v6[v941]; float32x2_t *v1337 = &v6[v949]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v963, v51, 0), v963, v51, 90); @@ -9776,488 +8731,295 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu19(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1107), v1127)); svfloat32_t v1118 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1116), v1127)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v973, v58, 0), v973, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v992, v93, 0), v992, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v983, v100, 0), v983, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero136, v1001, v135, 0), v1001, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1010, v142, 0), v1010, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : 
"=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero178, v1028, v177, 0), v1028, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1019, v184, 0), v1019, v184, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero220, v1037, v219, 0), v1037, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1046, v226, 0), v1046, v226, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, v1064, v261, 0), v1064, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1055, v268, 0), v1055, v268, 90); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero304, v1073, v303, 0), v1073, v303, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero311, v1082, v310, 0), v1082, v310, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero346, v1100, v345, 0), v1100, v345, 90); - svfloat32_t zero353; - asm volatile("mov 
%0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v1091, v352, 0), v1091, v352, 90); - svfloat32_t zero388; - asm volatile("mov %0.s, #0" : "=w"(zero388)); + svfloat32_t zero388 = svdup_n_f32(0); svfloat32_t v388 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero388, v1109, v387, 0), v1109, v387, 90); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero395, v1118, v394, 0), v1118, v394, 90); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v52), "w"(v59)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v52), "w"(v59)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v101), "w"(v94)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v94), "w"(v101)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v136), "w"(v143)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v136), "w"(v143)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v185), "w"(v178)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v178), "w"(v185)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v220), "w"(v227)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v220), "w"(v227)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v269), "w"(v262)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v262), "w"(v269)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v304), "w"(v311)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v304), "w"(v311)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v353), "w"(v346)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v411) : "w"(v346), "w"(v353)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v388), "w"(v395)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v388), "w"(v395)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v396), "w"(v408)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v398), "w"(v410)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v400), "w"(v412)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v402), "w"(v408)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v404), "w"(v410)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v406), "w"(v412)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v396), "w"(v402)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v398), "w"(v404)); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v400), "w"(v406)); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v397), "w"(v409)); - svfloat32_t v455; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v399), "w"(v411)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v401), "w"(v413)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v403), "w"(v409)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v405), "w"(v411)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v407), "w"(v413)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v397), "w"(v403)); - svfloat32_t v462; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v399), "w"(v405)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v401), "w"(v407)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v408)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v422), "w"(v410)); - svfloat32_t v425; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v424), "w"(v412)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v414), "w"(v416)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v417), "w"(v419)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v414), "w"(v417)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v416), "w"(v419)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v460), "w"(v409)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v462), "w"(v411)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v464), "w"(v413)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v454), "w"(v456)); - svfloat32_t v467; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v457), "w"(v459)); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v454), "w"(v457)); - svfloat32_t v477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v456), "w"(v459)); - svfloat32_t zero641; - asm volatile("mov %0.s, #0" : "=w"(zero641)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v101, v94); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v185, v178); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v269, v262); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v410 = 
svadd_f32_x(svptrue_b32(), v353, v346); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v396, v408); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v410); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v400, v412); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v402, v408); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v404, v410); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v406, v412); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v396, v402); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v400, v406); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v397, v409); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v399, v411); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v401, v413); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v403, v409); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v405, v411); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v407, v413); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v399, v405); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v401, v407); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v420, v408); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v424, v412); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v414, v416); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v417, v419); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v414, v417); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v416, v419); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v460, v409); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v411); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v464, v413); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + 
svfloat32_t v467 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v454, v457); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v456, v459); + svfloat32_t zero641 = svdup_n_f32(0); svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1156, v457, 90); - svfloat32_t zero662; - asm volatile("mov %0.s, #0" : "=w"(zero662)); + svfloat32_t zero662 = svdup_n_f32(0); svfloat32_t v662 = svcmla_f32_x(pred_full, zero662, v1159, v459, 90); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v421), "w"(v423)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v427), "w"(v418)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v426), "w"(v415)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v427), "w"(v418)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v426), "w"(v415)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v414), "w"(v445)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v444), "w"(v419)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v421), "w"(v425)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v423), "w"(v425)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v461), "w"(v463)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v467), "w"(v458)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v466), "w"(v455)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v467), "w"(v458)); - svfloat32_t v474; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v466), "w"(v455)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v454), "w"(v477)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v476), "w"(v459)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v461), "w"(v465)); - svfloat32_t v484; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v484) : "w"(v463), "w"(v465)); - svfloat32_t v429; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v428), "w"(v425)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v439), "w"(v438)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v442), "w"(v441)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v446), "w"(v418)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v448), "w"(v415)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v451), "w"(v452)); - svfloat32_t v469; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v468), "w"(v465)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v471), "w"(v470)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v474), "w"(v473)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v458)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v455)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v483), "w"(v484)); - svfloat32_t v505; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v439), "w"(v1132)); - svfloat32_t v520; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v442), "w"(v1135)); - svfloat32_t zero599; - asm volatile("mov %0.s, #0" : "=w"(zero599)); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v421, v423); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v426, v415); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v426, v415); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v414, v445); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v444, v419); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v421, v425); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v461, v463); + svfloat32_t v470 = 
svadd_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v454, v477); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v476, v459); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v461, v465); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v465); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v428, v425); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v439, v438); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v442, v441); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v446, v418); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v448, v415); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v451, v452); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v468, v465); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v471, v470); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v474, v473); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v478, v458); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v455); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v483, v484); + svfloat32_t v505 = svmul_f32_x(svptrue_b32(), v439, v1132); + svfloat32_t v520 = svmul_f32_x(svptrue_b32(), v442, v1135); + svfloat32_t zero599 = svdup_n_f32(0); svfloat32_t v599 = svcmla_f32_x(pred_full, zero599, v1150, v470, 90); - svfloat32_t zero620; - asm volatile("mov %0.s, #0" : "=w"(zero620)); + svfloat32_t zero620 = svdup_n_f32(0); svfloat32_t v620 = svcmla_f32_x(pred_full, zero620, v1153, v473, 90); - svfloat32_t zero704; - asm volatile("mov %0.s, #0" : "=w"(zero704)); + svfloat32_t zero704 = svdup_n_f32(0); svfloat32_t v704 = svcmla_f32_x(pred_full, zero704, v1165, v483, 90); - svfloat32_t zero711; - asm volatile("mov %0.s, #0" : "=w"(zero711)); + svfloat32_t zero711 = svdup_n_f32(0); svfloat32_t v711 = svcmla_f32_x(pred_full, zero711, v1166, v484, 90); - svfloat32_t v437; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v1128), "w"(v429)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v447), "w"(v449)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v479), "w"(v481)); - svfloat32_t v510; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v440), "w"(v1133)); - svfloat32_t v525; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v443), "w"(v1136)); - svfloat32_t v585; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v453), "w"(v1148)); - svfloat32_t zero592; - asm volatile("mov %0.s, #0" : "=w"(zero592)); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v1128, v429); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v479, v481); + svfloat32_t v510 = svmul_f32_x(svptrue_b32(), v440, v1133); + svfloat32_t v525 = svmul_f32_x(svptrue_b32(), v443, v1136); + svfloat32_t v585 = svmul_f32_x(svptrue_b32(), v453, v1148); + svfloat32_t zero592 = svdup_n_f32(0); svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1149, v469, 90); - svfloat32_t zero718; - asm volatile("mov %0.s, #0" : "=w"(zero718)); + svfloat32_t zero718 = svdup_n_f32(0); svfloat32_t v718 = svcmla_f32_x(pred_full, zero718, v1167, v485, 90); svfloat32_t v719 = svmla_f32_x(pred_full, v505, v438, v1131); svfloat32_t v720 = svmla_f32_x(pred_full, v520, v441, v1134); svfloat32_t v750 = svcmla_f32_x(pred_full, v599, v1151, v471, 90); svfloat32_t v751 = svcmla_f32_x(pred_full, v620, v1154, v474, 90); - svfloat32_t v570; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v450), "w"(v1145)); - svfloat32_t zero697; - asm volatile("mov %0.s, #0" : "=w"(zero697)); + svfloat32_t v570 = svmul_f32_x(svptrue_b32(), v450, v1145); + svfloat32_t zero697 = svdup_n_f32(0); svfloat32_t v697 = svcmla_f32_x(pred_full, zero697, v1164, v482, 90); - svfloat32_t v722; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v719), "w"(v720)); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v719, v720); svfloat32_t v723 = 
svmla_f32_x(pred_full, v510, v438, v1131); svfloat32_t v724 = svmla_f32_x(pred_full, v525, v441, v1134); - svfloat32_t v741; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v741) : "w"(v719), "w"(v720)); + svfloat32_t v741 = svsub_f32_x(svptrue_b32(), v719, v720); svfloat32_t v743 = svnmls_f32_x(pred_full, v585, v451, v1146); svfloat32_t v744 = svnmls_f32_x(pred_full, v585, v452, v1147); svfloat32_t v745 = svmla_f32_x(pred_full, v437, v429, v1130); - svfloat32_t v753; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v753) : "w"(v750), "w"(v751)); + svfloat32_t v753 = svadd_f32_x(svptrue_b32(), v750, v751); svfloat32_t v754 = svcmla_f32_x(pred_full, v599, v1152, v472, 90); svfloat32_t v755 = svcmla_f32_x(pred_full, v620, v1155, v475, 90); - svfloat32_t v772; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v772) : "w"(v750), "w"(v751)); - svfloat32_t v774; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v774) : "w"(v704), "w"(v718)); - svfloat32_t v775; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v711), "w"(v718)); + svfloat32_t v772 = svsub_f32_x(svptrue_b32(), v750, v751); + svfloat32_t v774 = svsub_f32_x(svptrue_b32(), v704, v718); + svfloat32_t v775 = svsub_f32_x(svptrue_b32(), v711, v718); svst1_f64(pred_full, (double *)(v1175), svreinterpret_f64_f32(v437)); svfloat32_t v721 = svmla_f32_x(pred_full, v570, v449, v1144); svfloat32_t v725 = svmla_f32_x(pred_full, v570, v447, v1143); svfloat32_t v726 = svnmls_f32_x(pred_full, v722, v417, v1137); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v723), "w"(v724)); - svfloat32_t v733; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v733) : "w"(v723), "w"(v724)); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v723, v724); + svfloat32_t v733 = svsub_f32_x(svptrue_b32(), v723, v724); svfloat32_t v738 = svmla_f32_x(pred_full, v722, v416, v1142); - svfloat32_t v746; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v746) : "w"(v745), "w"(v743)); - svfloat32_t v747; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v745), "w"(v743)); - svfloat32_t v749; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v749) : "w"(v745), "w"(v744)); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v745, v744); svfloat32_t v752 = svcmla_f32_x(pred_full, v697, v1163, v481, 90); svfloat32_t v756 = svcmla_f32_x(pred_full, v697, v1162, v479, 90); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v641), "w"(v753)); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v754), "w"(v755)); - svfloat32_t v764; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v764) : "w"(v754), "w"(v755)); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v641, v753); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v754, v755); + svfloat32_t v764 = svsub_f32_x(svptrue_b32(), v754, v755); svfloat32_t v769 = svcmla_f32_x(pred_full, v753, v1161, v456, 90); - svfloat32_t v776; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v592), "w"(v774)); - svfloat32_t v777; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v777) : "w"(v592), "w"(v774)); - svfloat32_t v779; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v779) : "w"(v592), "w"(v775)); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v779 = svadd_f32_x(svptrue_b32(), v592, v775); svfloat32_t v728 = svnmls_f32_x(pred_full, v725, v419, v1140); svfloat32_t v729 = svmla_f32_x(pred_full, v721, v444, v1138); svfloat32_t v731 = svmla_f32_x(pred_full, v727, v445, v1141); - svfloat32_t v734; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v734) : "w"(v733), "w"(v721)); - svfloat32_t v735; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v735) : "w"(v726), "w"(v727)); - svfloat32_t v742; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v742) : "w"(v741), "w"(v725)); - svfloat32_t v748; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v748) : "w"(v747), "w"(v744)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v662), "w"(v756)); + svfloat32_t v734 = 
svadd_f32_x(svptrue_b32(), v733, v721); + svfloat32_t v735 = svadd_f32_x(svptrue_b32(), v726, v727); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v741, v725); + svfloat32_t v748 = svsub_f32_x(svptrue_b32(), v747, v744); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v662, v756); svfloat32_t v760 = svcmla_f32_x(pred_full, v752, v1157, v476, 90); svfloat32_t v762 = svcmla_f32_x(pred_full, v758, v1160, v477, 90); - svfloat32_t v765; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v764), "w"(v752)); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v757), "w"(v758)); - svfloat32_t v773; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v773) : "w"(v772), "w"(v756)); - svfloat32_t v778; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v778) : "w"(v777), "w"(v775)); - svfloat32_t v730; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v729), "w"(v726)); - svfloat32_t v732; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v731), "w"(v728)); + svfloat32_t v765 = svadd_f32_x(svptrue_b32(), v764, v752); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v757, v758); + svfloat32_t v773 = svadd_f32_x(svptrue_b32(), v772, v756); + svfloat32_t v778 = svsub_f32_x(svptrue_b32(), v777, v775); + svfloat32_t v730 = svadd_f32_x(svptrue_b32(), v729, v726); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v731, v728); svfloat32_t v736 = svmla_f32_x(pred_full, v735, v414, v1139); - svfloat32_t v739; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v738), "w"(v728)); - svfloat32_t v761; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v761) : "w"(v760), "w"(v757)); - svfloat32_t v763; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v762), "w"(v759)); + svfloat32_t v739 = svadd_f32_x(svptrue_b32(), v738, v728); + svfloat32_t v761 = svadd_f32_x(svptrue_b32(), v760, v757); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v762, v759); svfloat32_t v767 = svcmla_f32_x(pred_full, v766, v1158, v454, 90); - svfloat32_t v770; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v770) : "w"(v769), "w"(v759)); - svfloat32_t v784; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v784) : "w"(v742), "w"(v734)); - svfloat32_t v788; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v788) : "w"(v749), "w"(v742)); - svfloat32_t v791; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v791) : "w"(v734), "w"(v749)); - svfloat32_t v796; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v796) : "w"(v773), "w"(v765)); - svfloat32_t v800; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v800) : "w"(v773), "w"(v779)); - svfloat32_t v803; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v803) : "w"(v765), "w"(v779)); - svfloat32_t v737; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v737) : "w"(v736), "w"(v725)); - svfloat32_t v740; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v740) : "w"(v739), "w"(v721)); - svfloat32_t v768; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v767), "w"(v756)); - svfloat32_t v771; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v770), "w"(v752)); - svfloat32_t v785; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v784), "w"(v749)); - svfloat32_t v789; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v789) : "w"(v730), "w"(v746)); - svfloat32_t v790; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v790) : "w"(v732), "w"(v748)); - svfloat32_t v797; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v797) : "w"(v796), "w"(v779)); - svfloat32_t v801; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v761), "w"(v776)); - svfloat32_t v802; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v763), "w"(v778)); - svfloat32_t v827; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v827) : "w"(v791), "w"(v803)); - svfloat32_t v835; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v791), "w"(v803)); - svfloat32_t v843; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v843) : "w"(v788), "w"(v800)); - svfloat32_t v851; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v851) : "w"(v788), "w"(v800)); - svfloat32_t v780; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v780) : "w"(v737), "w"(v730)); - svfloat32_t v782; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v782) : "w"(v740), "w"(v732)); - svfloat32_t v786; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v746), 
"w"(v737)); - svfloat32_t v787; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v787) : "w"(v748), "w"(v740)); - svfloat32_t v792; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v792) : "w"(v768), "w"(v761)); - svfloat32_t v794; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v771), "w"(v763)); - svfloat32_t v798; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v798) : "w"(v776), "w"(v768)); - svfloat32_t v799; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v778), "w"(v771)); - svfloat32_t v859; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v859) : "w"(v790), "w"(v802)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v790), "w"(v802)); - svfloat32_t v875; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v875) : "w"(v785), "w"(v797)); - svfloat32_t v883; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v883) : "w"(v785), "w"(v797)); - svfloat32_t v923; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v923) : "w"(v789), "w"(v801)); - svfloat32_t v931; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v931) : "w"(v789), "w"(v801)); + svfloat32_t v770 = svadd_f32_x(svptrue_b32(), v769, v759); + svfloat32_t v784 = svsub_f32_x(svptrue_b32(), v742, v734); + svfloat32_t v788 = svsub_f32_x(svptrue_b32(), v749, v742); + svfloat32_t v791 = svadd_f32_x(svptrue_b32(), v734, v749); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v773, v765); + svfloat32_t v800 = svsub_f32_x(svptrue_b32(), v773, v779); + svfloat32_t v803 = svadd_f32_x(svptrue_b32(), v765, v779); + svfloat32_t v737 = svadd_f32_x(svptrue_b32(), v736, v725); + svfloat32_t v740 = svadd_f32_x(svptrue_b32(), v739, v721); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v767, v756); + svfloat32_t v771 = svadd_f32_x(svptrue_b32(), v770, v752); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v784, v749); + svfloat32_t v789 = svadd_f32_x(svptrue_b32(), v730, v746); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v732, v748); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v796, v779); + svfloat32_t v801 = svadd_f32_x(svptrue_b32(), v761, v776); + svfloat32_t v802 = 
svadd_f32_x(svptrue_b32(), v763, v778); + svfloat32_t v827 = svsub_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v835 = svadd_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v843 = svadd_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v851 = svsub_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v780 = svsub_f32_x(svptrue_b32(), v737, v730); + svfloat32_t v782 = svsub_f32_x(svptrue_b32(), v740, v732); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v746, v737); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v748, v740); + svfloat32_t v792 = svsub_f32_x(svptrue_b32(), v768, v761); + svfloat32_t v794 = svsub_f32_x(svptrue_b32(), v771, v763); + svfloat32_t v798 = svsub_f32_x(svptrue_b32(), v776, v768); + svfloat32_t v799 = svsub_f32_x(svptrue_b32(), v778, v771); + svfloat32_t v859 = svadd_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v875 = svadd_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v883 = svsub_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v923 = svsub_f32_x(svptrue_b32(), v789, v801); + svfloat32_t v931 = svadd_f32_x(svptrue_b32(), v789, v801); svst1_f64(pred_full, (double *)(v1202), svreinterpret_f64_f32(v827)); svst1_f64(pred_full, (double *)(v1211), svreinterpret_f64_f32(v835)); svst1_f64(pred_full, (double *)(v1220), svreinterpret_f64_f32(v843)); svst1_f64(pred_full, (double *)(v1229), svreinterpret_f64_f32(v851)); - svfloat32_t v781; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v780), "w"(v746)); - svfloat32_t v783; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v783) : "w"(v782), "w"(v748)); - svfloat32_t v793; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v793) : "w"(v792), "w"(v776)); - svfloat32_t v795; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v794), "w"(v778)); - svfloat32_t v891; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v891) : "w"(v787), "w"(v799)); - svfloat32_t v899; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v899) : "w"(v787), "w"(v799)); - svfloat32_t v907; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v907) : "w"(v786), "w"(v798)); - svfloat32_t v915; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v915) : "w"(v786), "w"(v798)); + svfloat32_t v781 = svadd_f32_x(svptrue_b32(), v780, v746); + svfloat32_t v783 = svadd_f32_x(svptrue_b32(), v782, v748); + svfloat32_t v793 = svadd_f32_x(svptrue_b32(), v792, v776); + svfloat32_t v795 = svadd_f32_x(svptrue_b32(), v794, v778); + svfloat32_t v891 = svadd_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v899 = svsub_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v907 = svadd_f32_x(svptrue_b32(), v786, v798); + svfloat32_t v915 = svsub_f32_x(svptrue_b32(), v786, v798); svst1_f64(pred_full, (double *)(v1238), svreinterpret_f64_f32(v859)); svst1_f64(pred_full, (double *)(v1247), svreinterpret_f64_f32(v867)); svst1_f64(pred_full, (double *)(v1256), svreinterpret_f64_f32(v875)); svst1_f64(pred_full, (double *)(v1265), svreinterpret_f64_f32(v883)); svst1_f64(pred_full, (double *)(v1310), svreinterpret_f64_f32(v923)); svst1_f64(pred_full, (double *)(v1319), svreinterpret_f64_f32(v931)); - svfloat32_t v811; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v811) : "w"(v781), "w"(v793)); - svfloat32_t v819; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v819) : "w"(v781), "w"(v793)); - svfloat32_t v939; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v939) : "w"(v783), "w"(v795)); - svfloat32_t v947; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v947) : "w"(v783), "w"(v795)); + svfloat32_t v811 = svadd_f32_x(svptrue_b32(), v781, v793); + svfloat32_t v819 = svsub_f32_x(svptrue_b32(), v781, v793); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v783, v795); + svfloat32_t v947 = svsub_f32_x(svptrue_b32(), v783, v795); svst1_f64(pred_full, (double *)(v1274), svreinterpret_f64_f32(v891)); svst1_f64(pred_full, (double *)(v1283), svreinterpret_f64_f32(v899)); svst1_f64(pred_full, (double *)(v1292), svreinterpret_f64_f32(v907)); @@ -10849,8 +9611,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu20(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v365])); 
svfloat32_t v373 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v372])); - svfloat32_t zero409; - asm volatile("mov %0.s, #0" : "=w"(zero409)); + svfloat32_t zero409 = svdup_n_f32(0); svfloat32_t v409 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero409, v977, v408, 0), v977, v408, 90); @@ -10892,329 +9653,208 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu20(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v966), v997)); svfloat32_t v987 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v985), v997)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v824, v37, 0), v824, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v833, v72, 0), v833, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v842, v79, 0), v842, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v851, v114, 0), v851, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v860, v121, 0), v860, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v869, v156, 0), v869, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = 
svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v878, v163, 0), v878, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v887, v198, 0), v887, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v896, v205, 0), v896, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v905, v240, 0), v905, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v914, v247, 0), v914, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v923, v282, 0), v923, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v932, v289, 0), v932, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v941, v324, 0), v941, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v950, v331, 0), v950, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); 
svfloat32_t v367 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v959, v366, 0), v959, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero374, v968, v373, 0), v968, v373, 90); - svfloat32_t zero416; - asm volatile("mov %0.s, #0" : "=w"(zero416)); + svfloat32_t zero416 = svdup_n_f32(0); svfloat32_t v416 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero416, v987, v415, 0), v987, v415, 90); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v998), "w"(v38)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v998), "w"(v38)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v73), "w"(v80)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v73), "w"(v80)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v115), "w"(v122)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v115), "w"(v122)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v157), "w"(v164)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v157), "w"(v164)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v199), "w"(v206)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v199), "w"(v206)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v241), "w"(v248)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v241), "w"(v248)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v283), "w"(v290)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v283), "w"(v290)); - svfloat32_t v444; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v325), "w"(v332)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v325), "w"(v332)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v448) : "w"(v367), "w"(v374)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v367), "w"(v374)); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v409), "w"(v416)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v409), "w"(v416)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v424), "w"(v426)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v424), "w"(v426)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v430), "w"(v432)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v430), "w"(v432)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v436), "w"(v438)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v436), "w"(v438)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v442), "w"(v444)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v442), "w"(v444)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v448), "w"(v450)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v448), "w"(v450)); - svfloat32_t v560; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v431), "w"(v449)); - svfloat32_t v561; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v431), "w"(v449)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v443), "w"(v437)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v443), "w"(v437)); - svfloat32_t v613; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v433), "w"(v451)); - svfloat32_t v614; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v433), "w"(v451)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v445), "w"(v439)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v445), "w"(v439)); - svfloat32_t v454; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v434), "w"(v452)); - svfloat32_t v455; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v434), "w"(v452)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v446), "w"(v440)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v446), "w"(v440)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v435), "w"(v453)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v435), "w"(v453)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v447), "w"(v441)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v447), "w"(v441)); - svfloat32_t v564; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v560), "w"(v562)); - svfloat32_t v565; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v560), "w"(v562)); - svfloat32_t v566; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v561), "w"(v563)); - svfloat32_t zero589; - asm volatile("mov %0.s, #0" : "=w"(zero589)); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v998, v38); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v998, v38); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v449 = 
svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v561, v563); + svfloat32_t zero589 = svdup_n_f32(0); svfloat32_t v589 = 
svcmla_f32_x(pred_full, zero589, v1014, v561, 90); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v613), "w"(v615)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v613), "w"(v615)); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v614), "w"(v616)); - svfloat32_t v656; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v656) : "w"(v616), "w"(v1022)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v454), "w"(v456)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v454), "w"(v456)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v455), "w"(v457)); - svfloat32_t zero483; - asm volatile("mov %0.s, #0" : "=w"(zero483)); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v614, v616); + svfloat32_t v656 = svmul_f32_x(svptrue_b32(), v616, v1022); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v455, v457); + svfloat32_t zero483 = svdup_n_f32(0); svfloat32_t v483 = svcmla_f32_x(pred_full, zero483, v1014, v455, 90); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v507), "w"(v509)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v507), "w"(v509)); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v508), "w"(v510)); - svfloat32_t zero536; - asm volatile("mov %0.s, #0" : "=w"(zero536)); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v508, v510); + svfloat32_t zero536 = svdup_n_f32(0); svfloat32_t v536 = svcmla_f32_x(pred_full, zero536, v1014, v508, 90); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v564), "w"(v425)); - 
svfloat32_t zero596; - asm volatile("mov %0.s, #0" : "=w"(zero596)); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v564, v425); + svfloat32_t zero596 = svdup_n_f32(0); svfloat32_t v596 = svcmla_f32_x(pred_full, zero596, v1015, v566, 90); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v617), "w"(v427)); - svfloat32_t zero641; - asm volatile("mov %0.s, #0" : "=w"(zero641)); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v617, v427); + svfloat32_t zero641 = svdup_n_f32(0); svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1019, v618, 90); - svfloat32_t v651; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v651) : "w"(v619), "w"(v1021)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v458), "w"(v428)); - svfloat32_t zero490; - asm volatile("mov %0.s, #0" : "=w"(zero490)); + svfloat32_t v651 = svmul_f32_x(svptrue_b32(), v619, v1021); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v458, v428); + svfloat32_t zero490 = svdup_n_f32(0); svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v1015, v460, 90); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v511), "w"(v429)); - svfloat32_t zero543; - asm volatile("mov %0.s, #0" : "=w"(zero543)); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v511, v429); + svfloat32_t zero543 = svdup_n_f32(0); svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1015, v513, 90); svfloat32_t v604 = svmla_f32_x(pred_full, v567, v564, v1012); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v589), "w"(v596)); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v589, v596); svfloat32_t v608 = svcmla_f32_x(pred_full, v596, v1016, v563, 90); - svfloat32_t zero627; - asm volatile("mov %0.s, #0" : "=w"(zero627)); + svfloat32_t zero627 = svdup_n_f32(0); svfloat32_t v627 = svcmla_f32_x(pred_full, zero627, v1017, v620, 90); svfloat32_t v660 = svnmls_f32_x(pred_full, v651, v614, v1020); svfloat32_t v661 = svmla_f32_x(pred_full, v656, v619, v1021); svfloat32_t v498 = 
svmla_f32_x(pred_full, v461, v458, v1012); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v483), "w"(v490)); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v483, v490); svfloat32_t v502 = svcmla_f32_x(pred_full, v490, v1016, v457, 90); svfloat32_t v551 = svmla_f32_x(pred_full, v514, v511, v1012); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v536), "w"(v543)); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v536, v543); svfloat32_t v555 = svcmla_f32_x(pred_full, v543, v1016, v510, 90); svfloat32_t v605 = svmla_f32_x(pred_full, v604, v565, v1013); svfloat32_t v606 = svmls_f32_x(pred_full, v604, v565, v1013); svfloat32_t v657 = svcmla_f32_x(pred_full, v627, v1018, v617, 90); - svfloat32_t v666; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v567), "w"(v627)); - svfloat32_t v667; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v567), "w"(v627)); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v567, v627); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v567, v627); svst1_f64(pred_full, (double *)(v1030), svreinterpret_f64_f32(v461)); svst1_f64(pred_full, (double *)(v1048), svreinterpret_f64_f32(v514)); svfloat32_t v499 = svmla_f32_x(pred_full, v498, v459, v1013); svfloat32_t v500 = svmls_f32_x(pred_full, v498, v459, v1013); svfloat32_t v552 = svmla_f32_x(pred_full, v551, v512, v1013); svfloat32_t v553 = svmls_f32_x(pred_full, v551, v512, v1013); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v605), "w"(v607)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v605), "w"(v607)); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v606), "w"(v608)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v606), "w"(v608)); - svfloat32_t v658; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v657), "w"(v641)); - svfloat32_t v659; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v657), "w"(v641)); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v605, 
v607); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v657, v641); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v657, v641); svst1_f64(pred_full, (double *)(v1039), svreinterpret_f64_f32(v667)); svst1_f64(pred_full, (double *)(v1057), svreinterpret_f64_f32(v666)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v499), "w"(v501)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v499), "w"(v501)); - svfloat32_t v505; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v500), "w"(v502)); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v500), "w"(v502)); - svfloat32_t v556; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v552), "w"(v554)); - svfloat32_t v557; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v552), "w"(v554)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v553), "w"(v555)); - svfloat32_t v559; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v553), "w"(v555)); - svfloat32_t v662; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v658), "w"(v660)); - svfloat32_t v663; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v658), "w"(v660)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v659), "w"(v661)); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v659), "w"(v661)); - svfloat32_t v696; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v610), "w"(v663)); - svfloat32_t v697; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v697) : "w"(v610), "w"(v663)); - svfloat32_t v726; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v726) : "w"(v612), "w"(v665)); - svfloat32_t v727; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v612), "w"(v665)); - svfloat32_t v756; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v756) : "w"(v611), "w"(v664)); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v757) : "w"(v611), "w"(v664)); - svfloat32_t v786; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v609), "w"(v662)); - svfloat32_t v787; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v787) : "w"(v609), "w"(v662)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v663 = svsub_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v659, v661); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v659, v661); + svfloat32_t v696 = svadd_f32_x(svptrue_b32(), v610, v663); + svfloat32_t v697 = svsub_f32_x(svptrue_b32(), v610, v663); + svfloat32_t v726 = svadd_f32_x(svptrue_b32(), v612, v665); + svfloat32_t v727 = svsub_f32_x(svptrue_b32(), v612, v665); + svfloat32_t v756 = svadd_f32_x(svptrue_b32(), v611, v664); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v611, v664); + svfloat32_t v786 = svadd_f32_x(svptrue_b32(), v609, v662); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v609, v662); svst1_f64(pred_full, (double *)(v1066), svreinterpret_f64_f32(v504)); svst1_f64(pred_full, (double *)(v1084), svreinterpret_f64_f32(v557)); svst1_f64(pred_full, (double *)(v1102), svreinterpret_f64_f32(v506)); @@ -11961,8 +10601,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v267])); svfloat32_t v282 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v281])); - svfloat32_t zero318; - asm volatile("mov %0.s, #0" : "=w"(zero318)); + svfloat32_t zero318 = 
svdup_n_f32(0); svfloat32_t v318 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero318, v995, v317, 0), v995, v317, 90); @@ -12014,101 +10653,73 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1031), v1051)); svfloat32_t v1042 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1040), v1051)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v869, v51, 0), v869, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v878, v58, 0), v878, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v887, v93, 0), v887, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v896, v100, 0), v896, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v914, v149, 0), v914, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v923, v156, 0), v923, v156, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v941, v205, 0), v941, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : 
"=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v950, v212, 0), v950, v212, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v968, v261, 0), v968, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v977, v268, 0), v977, v268, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1005, v324, 0), v1005, v324, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1024, v373, 0), v1024, v373, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero381, v1033, v380, 0), v1033, v380, 90); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v52), "w"(v59)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v52), "w"(v59)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v94), "w"(v101)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v94), "w"(v101)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v150), "w"(v157)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v150), "w"(v157)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v206), "w"(v213)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v206), "w"(v213)); - svfloat32_t v415; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v262), "w"(v269)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v262), "w"(v269)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v318), "w"(v325)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v318), "w"(v325)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v374), "w"(v381)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v374), "w"(v381)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v396), "w"(v1052)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v1052); svfloat32_t v408 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v406, v905, v114, 0), v905, v114, 90); @@ -12127,173 +10738,98 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v423 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v421, v1042, v394, 0), v1042, v394, 90); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v406), "w"(v421)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v406), 
"w"(v421)); - svfloat32_t v515; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v415), "w"(v412)); - svfloat32_t v516; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v415), "w"(v412)); - svfloat32_t v517; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v409), "w"(v418)); - svfloat32_t v518; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v409), "w"(v418)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v407), "w"(v422)); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v407), "w"(v422)); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v416), "w"(v413)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v416), "w"(v413)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v410), "w"(v419)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v410), "w"(v419)); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v408), "w"(v423)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v408), "w"(v423)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v417), "w"(v414)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v417), "w"(v414)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v411), "w"(v420)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v411), "w"(v420)); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v513), "w"(v515)); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v513), "w"(v515)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v515), "w"(v517)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v517), "w"(v513)); - svfloat32_t v525; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v514), "w"(v516)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v514), "w"(v516)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v528) : "w"(v516), "w"(v518)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v518), "w"(v514)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v602), "w"(v604)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v602), "w"(v604)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v604), "w"(v606)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v606), "w"(v602)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v603), "w"(v605)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v603), "w"(v605)); - svfloat32_t v617; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v605), "w"(v607)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v607), "w"(v603)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v424), "w"(v426)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v424), "w"(v426)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v426), "w"(v428)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v428), "w"(v424)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v425), "w"(v427)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v425), "w"(v427)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v427), "w"(v429)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v429), "w"(v425)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v519), "w"(v517)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v525), "w"(v518)); - svfloat32_t zero568; - asm volatile("mov %0.s, #0" : "=w"(zero568)); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v516 = 
svsub_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v515, v517); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v517, v513); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v516, v518); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v518, v514); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v604, v606); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v606, v602); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v607, v603); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v424, v426); + 
svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v428, v424); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v429, v425); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v517); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v525, v518); + svfloat32_t zero568 = svdup_n_f32(0); svfloat32_t v568 = svcmla_f32_x(pred_full, zero568, v1068, v527, 90); - svfloat32_t zero575; - asm volatile("mov %0.s, #0" : "=w"(zero575)); + svfloat32_t zero575 = svdup_n_f32(0); svfloat32_t v575 = svcmla_f32_x(pred_full, zero575, v1069, v528, 90); - svfloat32_t zero582; - asm volatile("mov %0.s, #0" : "=w"(zero582)); + svfloat32_t zero582 = svdup_n_f32(0); svfloat32_t v582 = svcmla_f32_x(pred_full, zero582, v1070, v529, 90); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v606)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v614), "w"(v607)); - svfloat32_t zero639; - asm volatile("mov %0.s, #0" : "=w"(zero639)); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v606); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v614, v607); + svfloat32_t zero639 = svdup_n_f32(0); svfloat32_t v639 = svcmla_f32_x(pred_full, zero639, v1073, v611, 90); - svfloat32_t zero646; - asm volatile("mov %0.s, #0" : "=w"(zero646)); + svfloat32_t zero646 = svdup_n_f32(0); svfloat32_t v646 = svcmla_f32_x(pred_full, zero646, v1074, v612, 90); - svfloat32_t zero653; - asm volatile("mov %0.s, #0" : "=w"(zero653)); + svfloat32_t zero653 = svdup_n_f32(0); svfloat32_t v653 = svcmla_f32_x(pred_full, zero653, v1075, v613, 90); - svfloat32_t v663; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v616), "w"(v1077)); - svfloat32_t v668; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v617), "w"(v1078)); - svfloat32_t v431; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v428)); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v436), "w"(v429)); - svfloat32_t zero479; - asm volatile("mov %0.s, #0" : "=w"(zero479)); + svfloat32_t v663 = svmul_f32_x(svptrue_b32(), v616, v1077); + svfloat32_t v668 = svmul_f32_x(svptrue_b32(), v617, v1078); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v428); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v436, v429); + svfloat32_t zero479 = svdup_n_f32(0); svfloat32_t v479 = svcmla_f32_x(pred_full, zero479, v1059, v438, 90); - svfloat32_t zero486; - asm volatile("mov %0.s, #0" : "=w"(zero486)); + svfloat32_t zero486 = svdup_n_f32(0); svfloat32_t v486 = svcmla_f32_x(pred_full, zero486, v1060, v439, 90); - svfloat32_t zero493; - asm volatile("mov %0.s, #0" : "=w"(zero493)); + svfloat32_t zero493 = svdup_n_f32(0); svfloat32_t v493 = svcmla_f32_x(pred_full, zero493, v1061, v440, 90); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v396)); - svfloat32_t v539; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v520), "w"(v1063)); - svfloat32_t zero561; - asm volatile("mov %0.s, #0" : "=w"(zero561)); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v396); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v520, v1063); + svfloat32_t zero561 = svdup_n_f32(0); svfloat32_t v561 = svcmla_f32_x(pred_full, zero561, v1067, v526, 90); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v609), "w"(v397)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v431), "w"(v405)); - svfloat32_t zero472; - asm volatile("mov %0.s, #0" : "=w"(zero472)); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v609, v397); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v405); + svfloat32_t zero472 = svdup_n_f32(0); svfloat32_t v472 = svcmla_f32_x(pred_full, zero472, v1058, v437, 90); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v561), "w"(v568)); - 
svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v561), "w"(v568)); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v561), "w"(v575)); - svfloat32_t zero625; - asm volatile("mov %0.s, #0" : "=w"(zero625)); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v561, v575); + svfloat32_t zero625 = svdup_n_f32(0); svfloat32_t v625 = svcmla_f32_x(pred_full, zero625, v1071, v610, 90); svfloat32_t v681 = svmla_f32_x(pred_full, v663, v615, v1076); svfloat32_t v683 = svnmls_f32_x(pred_full, v663, v615, v1076); svfloat32_t v685 = svnmls_f32_x(pred_full, v668, v615, v1076); svfloat32_t v494 = svmla_f32_x(pred_full, v432, v431, v1054); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v472), "w"(v479)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v472), "w"(v479)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v472), "w"(v486)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v472, v486); svfloat32_t v583 = svmla_f32_x(pred_full, v539, v521, v1062); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v590), "w"(v575)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v592), "w"(v582)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v594), "w"(v582)); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v575); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v582); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v582); svfloat32_t v674 = svcmla_f32_x(pred_full, v625, v1072, v609, 90); svfloat32_t v682 = svmla_f32_x(pred_full, v681, v617, v1078); svfloat32_t v684 = svmls_f32_x(pred_full, v683, v618, v1079); @@ -12303,117 +10839,70 @@ void 
armral_fft_cf32_cf32_cf32_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v495 = svmla_f32_x(pred_full, v494, v433, v1055); svfloat32_t v497 = svmls_f32_x(pred_full, v494, v433, v1055); svfloat32_t v499 = svmls_f32_x(pred_full, v494, v434, v1056); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v501), "w"(v486)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v493)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v493)); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v501, v486); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v503, v493); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v505, v493); svfloat32_t v584 = svmla_f32_x(pred_full, v583, v522, v1064); svfloat32_t v586 = svmls_f32_x(pred_full, v583, v522, v1064); svfloat32_t v588 = svmls_f32_x(pred_full, v583, v523, v1065); - svfloat32_t v675; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v674), "w"(v639)); - svfloat32_t v677; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v674), "w"(v639)); - svfloat32_t v679; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v674), "w"(v646)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v693), "w"(v625)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v693), "w"(v625)); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v679 = svsub_f32_x(svptrue_b32(), v674, v646); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v693, v625); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v693, v625); svfloat32_t v496 = svmla_f32_x(pred_full, v495, v434, v1056); svfloat32_t v498 = svmls_f32_x(pred_full, v497, v435, v1057); svfloat32_t v500 = svmla_f32_x(pred_full, v499, v435, v1057); svfloat32_t v585 = svmla_f32_x(pred_full, v584, v523, v1065); svfloat32_t v587 = svmls_f32_x(pred_full, v586, v524, v1066); svfloat32_t v589 = svmla_f32_x(pred_full, v588, 
v524, v1066); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v675), "w"(v646)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v677), "w"(v653)); - svfloat32_t v680; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v679), "w"(v653)); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v646); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v677, v653); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v653); svst1_f64(pred_full, (double *)(v1096), svreinterpret_f64_f32(v695)); svst1_f64(pred_full, (double *)(v1105), svreinterpret_f64_f32(v694)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v496), "w"(v502)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v496), "w"(v502)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v498), "w"(v504)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v498), "w"(v504)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v500), "w"(v506)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v500), "w"(v506)); - svfloat32_t v596; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v585), "w"(v591)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v585), "w"(v591)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v587), "w"(v593)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v587), "w"(v593)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v589), "w"(v595)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v589), "w"(v595)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v676), "w"(v682)); - svfloat32_t v688; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v676), "w"(v682)); - svfloat32_t v689; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v678), "w"(v684)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), 
"w"(v684)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v680), "w"(v686)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v680), "w"(v686)); - svfloat32_t v717; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v508), "w"(v597)); - svfloat32_t v741; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v741) : "w"(v510), "w"(v599)); - svfloat32_t v765; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v511), "w"(v600)); - svfloat32_t v789; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v789) : "w"(v512), "w"(v601)); - svfloat32_t v813; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v813) : "w"(v509), "w"(v598)); - svfloat32_t v837; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v837) : "w"(v507), "w"(v596)); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v676, v682); + svfloat32_t v688 = svsub_f32_x(svptrue_b32(), v676, v682); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v680, v686); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v680, v686); + svfloat32_t v717 = svadd_f32_x(svptrue_b32(), v508, v597); + svfloat32_t v741 = svadd_f32_x(svptrue_b32(), v510, v599); + svfloat32_t v765 = svadd_f32_x(svptrue_b32(), v511, v600); + 
svfloat32_t v789 = svadd_f32_x(svptrue_b32(), v512, v601); + svfloat32_t v813 = svadd_f32_x(svptrue_b32(), v509, v598); + svfloat32_t v837 = svadd_f32_x(svptrue_b32(), v507, v596); svst1_f64(pred_full, (double *)(v1114), svreinterpret_f64_f32(v508)); svst1_f64(pred_full, (double *)(v1141), svreinterpret_f64_f32(v510)); svst1_f64(pred_full, (double *)(v1168), svreinterpret_f64_f32(v511)); svst1_f64(pred_full, (double *)(v1195), svreinterpret_f64_f32(v512)); svst1_f64(pred_full, (double *)(v1222), svreinterpret_f64_f32(v509)); svst1_f64(pred_full, (double *)(v1249), svreinterpret_f64_f32(v507)); - svfloat32_t v718; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v717), "w"(v688)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v717), "w"(v688)); - svfloat32_t v742; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v742) : "w"(v741), "w"(v690)); - svfloat32_t v743; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v743) : "w"(v741), "w"(v690)); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v765), "w"(v691)); - svfloat32_t v767; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v765), "w"(v691)); - svfloat32_t v790; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v790) : "w"(v789), "w"(v692)); - svfloat32_t v791; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v791) : "w"(v789), "w"(v692)); - svfloat32_t v814; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v814) : "w"(v813), "w"(v689)); - svfloat32_t v815; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v815) : "w"(v813), "w"(v689)); - svfloat32_t v838; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v837), "w"(v687)); - svfloat32_t v839; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v837), "w"(v687)); + svfloat32_t v718 = svadd_f32_x(svptrue_b32(), v717, v688); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v717, v688); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v741, v690); + svfloat32_t v743 = svsub_f32_x(svptrue_b32(), v741, v690); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v765, v691); + svfloat32_t v767 = 
svsub_f32_x(svptrue_b32(), v765, v691); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v789, v692); + svfloat32_t v791 = svsub_f32_x(svptrue_b32(), v789, v692); + svfloat32_t v814 = svadd_f32_x(svptrue_b32(), v813, v689); + svfloat32_t v815 = svsub_f32_x(svptrue_b32(), v813, v689); + svfloat32_t v838 = svadd_f32_x(svptrue_b32(), v837, v687); + svfloat32_t v839 = svsub_f32_x(svptrue_b32(), v837, v687); svst1_f64(pred_full, (double *)(v1123), svreinterpret_f64_f32(v719)); svst1_f64(pred_full, (double *)(v1132), svreinterpret_f64_f32(v718)); svst1_f64(pred_full, (double *)(v1150), svreinterpret_f64_f32(v743)); @@ -13196,8 +11685,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v246])); svfloat32_t v282 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v281])); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero290, v1176, v289, 0), v1176, v289, 90); @@ -13257,340 +11745,208 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1239), v1259)); svfloat32_t v1250 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1248), v1259)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1068, v37, 0), v1068, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1077, v72, 0), v1077, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v1086, 
v79, 0), v1086, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero115, v1095, v114, 0), v1095, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero122, v1104, v121, 0), v1104, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero157, v1113, v156, 0), v1113, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero164, v1122, v163, 0), v1122, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero199, v1131, v198, 0), v1131, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero206, v1140, v205, 0), v1140, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero241, v1149, v240, 0), v1149, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero248, v1158, v247, 0), v1158, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero283, 
v1167, v282, 0), v1167, v282, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1187, v324, 0), v1187, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero332, v1196, v331, 0), v1196, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero367, v1205, v366, 0), v1205, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1214, v373, 0), v1214, v373, 90); - svfloat32_t zero409; - asm volatile("mov %0.s, #0" : "=w"(zero409)); + svfloat32_t zero409 = svdup_n_f32(0); svfloat32_t v409 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero409, v1223, v408, 0), v1223, v408, 90); - svfloat32_t zero416; - asm volatile("mov %0.s, #0" : "=w"(zero416)); + svfloat32_t zero416 = svdup_n_f32(0); svfloat32_t v416 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero416, v1232, v415, 0), v1232, v415, 90); - svfloat32_t zero451; - asm volatile("mov %0.s, #0" : "=w"(zero451)); + svfloat32_t zero451 = svdup_n_f32(0); svfloat32_t v451 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero451, v1241, v450, 0), v1241, v450, 90); - svfloat32_t zero458; - asm volatile("mov %0.s, #0" : "=w"(zero458)); + svfloat32_t zero458 = svdup_n_f32(0); svfloat32_t v458 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero458, v1250, v457, 0), v1250, v457, 90); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v1260), "w"(v38)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v1260), "w"(v38)); - 
svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v73), "w"(v80)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v73), "w"(v80)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v115), "w"(v122)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v115), "w"(v122)); - svfloat32_t v472; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v157), "w"(v164)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v157), "w"(v164)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v199), "w"(v206)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v199), "w"(v206)); - svfloat32_t v476; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v241), "w"(v248)); - svfloat32_t v477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v241), "w"(v248)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v283), "w"(v290)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v283), "w"(v290)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v325), "w"(v332)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v325), "w"(v332)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v367), "w"(v374)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v367), "w"(v374)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v409), "w"(v416)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v409), "w"(v416)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v451), "w"(v458)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v451), "w"(v458)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v468), "w"(v486)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v470), "w"(v484)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : 
"w"(v472), "w"(v482)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v474), "w"(v480)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v476), "w"(v478)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v468), "w"(v486)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v470), "w"(v484)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v472), "w"(v482)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v474), "w"(v480)); - svfloat32_t v497; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v476), "w"(v478)); - svfloat32_t v697; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v697) : "w"(v469), "w"(v487)); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v471), "w"(v485)); - svfloat32_t v699; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v473), "w"(v483)); - svfloat32_t v700; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v475), "w"(v481)); - svfloat32_t v701; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v477), "w"(v479)); - svfloat32_t v702; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v469), "w"(v487)); - svfloat32_t v703; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v471), "w"(v485)); - svfloat32_t v704; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v473), "w"(v483)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v475), "w"(v481)); - svfloat32_t v706; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v477), "w"(v479)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v488), "w"(v489)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v490), "w"(v492)); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v494), "w"(v495)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v493), "w"(v497)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v489), "w"(v491)); - svfloat32_t v508; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v508) : "w"(v488), "w"(v491)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v489), "w"(v488)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v492), "w"(v491)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v490), "w"(v491)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v492), "w"(v490)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v489), "w"(v492)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v488), "w"(v490)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v494), "w"(v496)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v493), "w"(v496)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v493), "w"(v494)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v496), "w"(v497)); - svfloat32_t v520; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v495), "w"(v496)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v495), "w"(v497)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v494), "w"(v497)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v493), "w"(v495)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v697), "w"(v698)); - svfloat32_t v708; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v699), "w"(v701)); - svfloat32_t v710; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v710) : "w"(v703), "w"(v704)); - svfloat32_t v711; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v702), "w"(v706)); - svfloat32_t v716; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v698), "w"(v700)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v697), "w"(v700)); - svfloat32_t v718; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v698), "w"(v697)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v701), "w"(v700)); - svfloat32_t 
v720; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v720) : "w"(v699), "w"(v700)); - svfloat32_t v721; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v701), "w"(v699)); - svfloat32_t v722; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v698), "w"(v701)); - svfloat32_t v723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v697), "w"(v699)); - svfloat32_t v725; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v725) : "w"(v703), "w"(v705)); - svfloat32_t v726; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v726) : "w"(v702), "w"(v705)); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v702), "w"(v703)); - svfloat32_t v728; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v728) : "w"(v705), "w"(v706)); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v704), "w"(v705)); - svfloat32_t v730; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v704), "w"(v706)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v703), "w"(v706)); - svfloat32_t v732; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v702), "w"(v704)); - svfloat32_t v500; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v491), "w"(v498)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v501), "w"(v502)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v499), "w"(v498)); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v501), "w"(v502)); - svfloat32_t v551; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v508), "w"(v1286)); - svfloat32_t v556; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v509), "w"(v1287)); - svfloat32_t v566; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v511), "w"(v1289)); - svfloat32_t v571; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v512), "w"(v1290)); - svfloat32_t zero593; - asm volatile("mov %0.s, #0" : "=w"(zero593)); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v1260, v38); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v1260, v38); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v73, 
v80); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v497 = svsub_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v697 = svadd_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v700 = 
svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v703 = svsub_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v706 = svsub_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v490, v492); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v494, v495); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v493, v497); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v489, v491); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v488, v491); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v489, v488); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v492, v491); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v492, v490); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v489, v492); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v494, v496); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v493, v496); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v496, v497); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v495, v496); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v495, v497); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v494, v497); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v493, v495); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v697, v698); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v710 = svsub_f32_x(svptrue_b32(), v703, v704); + svfloat32_t v711 = svadd_f32_x(svptrue_b32(), v702, v706); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v697, v700); + 
svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v698, v697); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v701, v700); + svfloat32_t v720 = svsub_f32_x(svptrue_b32(), v699, v700); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v701, v699); + svfloat32_t v722 = svsub_f32_x(svptrue_b32(), v698, v701); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v725 = svadd_f32_x(svptrue_b32(), v703, v705); + svfloat32_t v726 = svsub_f32_x(svptrue_b32(), v702, v705); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v702, v703); + svfloat32_t v728 = svsub_f32_x(svptrue_b32(), v705, v706); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v704, v705); + svfloat32_t v730 = svsub_f32_x(svptrue_b32(), v704, v706); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v703, v706); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v702, v704); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v491, v498); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v499, v498); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v551 = svmul_f32_x(svptrue_b32(), v508, v1286); + svfloat32_t v556 = svmul_f32_x(svptrue_b32(), v509, v1287); + svfloat32_t v566 = svmul_f32_x(svptrue_b32(), v511, v1289); + svfloat32_t v571 = svmul_f32_x(svptrue_b32(), v512, v1290); + svfloat32_t zero593 = svdup_n_f32(0); svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v1294, v516, 90); - svfloat32_t zero607; - asm volatile("mov %0.s, #0" : "=w"(zero607)); + svfloat32_t zero607 = svdup_n_f32(0); svfloat32_t v607 = svcmla_f32_x(pred_full, zero607, v1296, v518, 90); - svfloat32_t zero614; - asm volatile("mov %0.s, #0" : "=w"(zero614)); + svfloat32_t zero614 = svdup_n_f32(0); svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1297, v519, 90); - svfloat32_t zero628; - asm volatile("mov %0.s, #0" : "=w"(zero628)); + svfloat32_t zero628 = svdup_n_f32(0); svfloat32_t v628 = svcmla_f32_x(pred_full, zero628, v1299, v521, 90); - 
svfloat32_t zero635; - asm volatile("mov %0.s, #0" : "=w"(zero635)); + svfloat32_t zero635 = svdup_n_f32(0); svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1300, v522, 90); - svfloat32_t v709; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v700), "w"(v707)); - svfloat32_t v714; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v710), "w"(v711)); - svfloat32_t v724; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v724) : "w"(v708), "w"(v707)); - svfloat32_t v733; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v733) : "w"(v710), "w"(v711)); - svfloat32_t v760; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v717), "w"(v1286)); - svfloat32_t v765; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v718), "w"(v1287)); - svfloat32_t v775; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v720), "w"(v1289)); - svfloat32_t v780; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v780) : "w"(v721), "w"(v1290)); - svfloat32_t zero802; - asm volatile("mov %0.s, #0" : "=w"(zero802)); + svfloat32_t v709 = svadd_f32_x(svptrue_b32(), v700, v707); + svfloat32_t v714 = svsub_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v724 = svsub_f32_x(svptrue_b32(), v708, v707); + svfloat32_t v733 = svadd_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v760 = svmul_f32_x(svptrue_b32(), v717, v1286); + svfloat32_t v765 = svmul_f32_x(svptrue_b32(), v718, v1287); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v720, v1289); + svfloat32_t v780 = svmul_f32_x(svptrue_b32(), v721, v1290); + svfloat32_t zero802 = svdup_n_f32(0); svfloat32_t v802 = svcmla_f32_x(pred_full, zero802, v1294, v725, 90); - svfloat32_t zero816; - asm volatile("mov %0.s, #0" : "=w"(zero816)); + svfloat32_t zero816 = svdup_n_f32(0); svfloat32_t v816 = svcmla_f32_x(pred_full, zero816, v1296, v727, 90); - svfloat32_t zero823; - asm volatile("mov %0.s, #0" : "=w"(zero823)); + svfloat32_t zero823 = svdup_n_f32(0); svfloat32_t v823 = svcmla_f32_x(pred_full, zero823, v1297, v728, 90); - svfloat32_t zero837; - asm volatile("mov %0.s, #0" : "=w"(zero837)); + 
svfloat32_t zero837 = svdup_n_f32(0); svfloat32_t v837 = svcmla_f32_x(pred_full, zero837, v1299, v730, 90); - svfloat32_t zero844; - asm volatile("mov %0.s, #0" : "=w"(zero844)); + svfloat32_t zero844 = svdup_n_f32(0); svfloat32_t v844 = svcmla_f32_x(pred_full, zero844, v1300, v731, 90); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v500), "w"(v499)); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v496)); - svfloat32_t v586; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v515), "w"(v1293)); - svfloat32_t zero649; - asm volatile("mov %0.s, #0" : "=w"(zero649)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v500, v499); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v505, v496); + svfloat32_t v586 = svmul_f32_x(svptrue_b32(), v515, v1293); + svfloat32_t zero649 = svdup_n_f32(0); svfloat32_t v649 = svcmla_f32_x(pred_full, zero649, v1302, v524, 90); svfloat32_t v651 = svmla_f32_x(pred_full, v551, v507, v1285); svfloat32_t v652 = svmla_f32_x(pred_full, v556, v508, v1286); @@ -13599,19 +11955,13 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v655 = svmla_f32_x(pred_full, v571, v511, v1289); svfloat32_t v656 = svnmls_f32_x(pred_full, v571, v510, v1288); svfloat32_t v659 = svcmla_f32_x(pred_full, v607, v1295, v517, 90); - svfloat32_t v660; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v593), "w"(v607)); + svfloat32_t v660 = svsub_f32_x(svptrue_b32(), v593, v607); svfloat32_t v661 = svcmla_f32_x(pred_full, v628, v1298, v520, 90); - svfloat32_t v662; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v614), "w"(v628)); - svfloat32_t v712; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v712) : "w"(v709), "w"(v708)); - svfloat32_t v715; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v715) : "w"(v714), "w"(v705)); - svfloat32_t v795; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v724), "w"(v1293)); - svfloat32_t zero858; - asm volatile("mov %0.s, #0" : "=w"(zero858)); + svfloat32_t v662 
= svsub_f32_x(svptrue_b32(), v614, v628); + svfloat32_t v712 = svadd_f32_x(svptrue_b32(), v709, v708); + svfloat32_t v715 = svsub_f32_x(svptrue_b32(), v714, v705); + svfloat32_t v795 = svmul_f32_x(svptrue_b32(), v724, v1293); + svfloat32_t zero858 = svdup_n_f32(0); svfloat32_t v858 = svcmla_f32_x(pred_full, zero858, v1302, v733, 90); svfloat32_t v860 = svmla_f32_x(pred_full, v760, v716, v1285); svfloat32_t v861 = svmla_f32_x(pred_full, v765, v717, v1286); @@ -13620,163 +11970,91 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v864 = svmla_f32_x(pred_full, v780, v720, v1289); svfloat32_t v865 = svnmls_f32_x(pred_full, v780, v719, v1288); svfloat32_t v868 = svcmla_f32_x(pred_full, v816, v1295, v726, 90); - svfloat32_t v869; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v869) : "w"(v802), "w"(v816)); + svfloat32_t v869 = svsub_f32_x(svptrue_b32(), v802, v816); svfloat32_t v870 = svcmla_f32_x(pred_full, v837, v1298, v729, 90); - svfloat32_t v871; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v871) : "w"(v823), "w"(v837)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v466), "w"(v503)); - svfloat32_t zero541; - asm volatile("mov %0.s, #0" : "=w"(zero541)); + svfloat32_t v871 = svsub_f32_x(svptrue_b32(), v823, v837); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v466, v503); + svfloat32_t zero541 = svdup_n_f32(0); svfloat32_t v541 = svcmla_f32_x(pred_full, zero541, v1284, v506, 90); svfloat32_t v657 = svmla_f32_x(pred_full, v586, v514, v1292); svfloat32_t v658 = svmla_f32_x(pred_full, v586, v513, v1291); svfloat32_t v663 = svcmla_f32_x(pred_full, v649, v1301, v523, 90); - svfloat32_t v664; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v635), "w"(v649)); - svfloat32_t v683; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v659), "w"(v660)); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v467), "w"(v712)); - svfloat32_t zero750; - asm volatile("mov %0.s, #0" : "=w"(zero750)); + svfloat32_t v664 = 
svsub_f32_x(svptrue_b32(), v635, v649); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v659, v660); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v467, v712); + svfloat32_t zero750 = svdup_n_f32(0); svfloat32_t v750 = svcmla_f32_x(pred_full, zero750, v1284, v715, 90); svfloat32_t v866 = svmla_f32_x(pred_full, v795, v723, v1292); svfloat32_t v867 = svmla_f32_x(pred_full, v795, v722, v1291); svfloat32_t v872 = svcmla_f32_x(pred_full, v858, v1301, v732, 90); - svfloat32_t v873; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v844), "w"(v858)); - svfloat32_t v892; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v892) : "w"(v868), "w"(v869)); + svfloat32_t v873 = svsub_f32_x(svptrue_b32(), v844, v858); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v868, v869); svfloat32_t v650 = svmls_f32_x(pred_full, v504, v503, v1283); - svfloat32_t v665; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v655), "w"(v657)); - svfloat32_t v675; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v541), "w"(v661)); - svfloat32_t v677; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v663), "w"(v659)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v541), "w"(v664)); - svfloat32_t v681; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v664), "w"(v660)); - svfloat32_t v684; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v684) : "w"(v683), "w"(v661)); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v655, v657); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v541, v661); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v663, v659); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v541, v664); + svfloat32_t v681 = svsub_f32_x(svptrue_b32(), v664, v660); + svfloat32_t v684 = svadd_f32_x(svptrue_b32(), v683, v661); svfloat32_t v859 = svmls_f32_x(pred_full, v713, v712, v1283); - svfloat32_t v874; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v874) : "w"(v864), "w"(v866)); - svfloat32_t v884; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v884) : "w"(v750), "w"(v870)); - svfloat32_t v886; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v886) : "w"(v872), "w"(v868)); - svfloat32_t v888; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v888) : "w"(v750), "w"(v873)); - svfloat32_t v890; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v890) : "w"(v873), "w"(v869)); - svfloat32_t v893; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v892), "w"(v870)); + svfloat32_t v874 = svadd_f32_x(svptrue_b32(), v864, v866); + svfloat32_t v884 = svadd_f32_x(svptrue_b32(), v750, v870); + svfloat32_t v886 = svsub_f32_x(svptrue_b32(), v872, v868); + svfloat32_t v888 = svadd_f32_x(svptrue_b32(), v750, v873); + svfloat32_t v890 = svsub_f32_x(svptrue_b32(), v873, v869); + svfloat32_t v893 = svadd_f32_x(svptrue_b32(), v892, v870); svst1_f64(pred_full, (double *)(v1310), svreinterpret_f64_f32(v504)); svst1_f64(pred_full, (double *)(v1319), svreinterpret_f64_f32(v713)); - svfloat32_t v666; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v665), "w"(v650)); - svfloat32_t v667; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v650), "w"(v652)); - svfloat32_t v669; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v650), "w"(v656)); - svfloat32_t v671; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v650), "w"(v653)); - svfloat32_t v673; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v650), "w"(v651)); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v675), "w"(v663)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v677), "w"(v541)); - svfloat32_t v680; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v679), "w"(v662)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v681), "w"(v541)); - svfloat32_t v685; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v685) : "w"(v684), "w"(v662)); - svfloat32_t v875; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v875) : "w"(v874), "w"(v859)); - svfloat32_t v876; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v876) : "w"(v859), "w"(v861)); - svfloat32_t v878; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v878) : "w"(v859), "w"(v865)); - svfloat32_t v880; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v880) : "w"(v859), "w"(v862)); - svfloat32_t v882; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v882) : "w"(v859), "w"(v860)); - svfloat32_t v885; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v885) : "w"(v884), "w"(v872)); - svfloat32_t v887; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v887) : "w"(v886), "w"(v750)); - svfloat32_t v889; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v889) : "w"(v888), "w"(v871)); - svfloat32_t v891; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v891) : "w"(v890), "w"(v750)); - svfloat32_t v894; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v893), "w"(v871)); - svfloat32_t v668; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v667), "w"(v657)); - svfloat32_t v670; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v670) : "w"(v669), "w"(v658)); - svfloat32_t v672; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v671), "w"(v658)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v673), "w"(v654)); - svfloat32_t v686; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v686) : "w"(v685), "w"(v541)); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v666), "w"(v676)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v666), "w"(v676)); - svfloat32_t v877; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v877) : "w"(v876), "w"(v866)); - svfloat32_t v879; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v879) : "w"(v878), "w"(v867)); - svfloat32_t v881; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v881) : "w"(v880), "w"(v867)); - svfloat32_t v883; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v883) : "w"(v882), "w"(v863)); - svfloat32_t v895; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v894), "w"(v750)); - svfloat32_t v897; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v897) : "w"(v875), "w"(v885)); - svfloat32_t v904; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v904) : "w"(v875), "w"(v885)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v674), "w"(v686)); - svfloat32_t v689; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v668), "w"(v678)); - svfloat32_t 
v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v670), "w"(v680)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v672), "w"(v682)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v672), "w"(v682)); - svfloat32_t v693; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v670), "w"(v680)); - svfloat32_t v694; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v668), "w"(v678)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v674), "w"(v686)); - svfloat32_t v896; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v896) : "w"(v883), "w"(v895)); - svfloat32_t v898; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v898) : "w"(v877), "w"(v887)); - svfloat32_t v899; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v899) : "w"(v879), "w"(v889)); - svfloat32_t v900; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v900) : "w"(v881), "w"(v891)); - svfloat32_t v901; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v901) : "w"(v881), "w"(v891)); - svfloat32_t v902; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v902) : "w"(v879), "w"(v889)); - svfloat32_t v903; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v903) : "w"(v877), "w"(v887)); - svfloat32_t v905; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v905) : "w"(v883), "w"(v895)); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v665, v650); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v650, v652); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v650, v656); + svfloat32_t v671 = svsub_f32_x(svptrue_b32(), v650, v653); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v650, v651); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v663); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v677, v541); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v662); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v681, v541); + svfloat32_t v685 = svadd_f32_x(svptrue_b32(), v684, v662); + svfloat32_t v875 = svadd_f32_x(svptrue_b32(), v874, v859); + svfloat32_t v876 = svsub_f32_x(svptrue_b32(), v859, v861); + svfloat32_t v878 = svadd_f32_x(svptrue_b32(), 
v859, v865); + svfloat32_t v880 = svsub_f32_x(svptrue_b32(), v859, v862); + svfloat32_t v882 = svadd_f32_x(svptrue_b32(), v859, v860); + svfloat32_t v885 = svadd_f32_x(svptrue_b32(), v884, v872); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v886, v750); + svfloat32_t v889 = svadd_f32_x(svptrue_b32(), v888, v871); + svfloat32_t v891 = svsub_f32_x(svptrue_b32(), v890, v750); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v893, v871); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v667, v657); + svfloat32_t v670 = svadd_f32_x(svptrue_b32(), v669, v658); + svfloat32_t v672 = svsub_f32_x(svptrue_b32(), v671, v658); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v673, v654); + svfloat32_t v686 = svsub_f32_x(svptrue_b32(), v685, v541); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v877 = svsub_f32_x(svptrue_b32(), v876, v866); + svfloat32_t v879 = svadd_f32_x(svptrue_b32(), v878, v867); + svfloat32_t v881 = svsub_f32_x(svptrue_b32(), v880, v867); + svfloat32_t v883 = svsub_f32_x(svptrue_b32(), v882, v863); + svfloat32_t v895 = svsub_f32_x(svptrue_b32(), v894, v750); + svfloat32_t v897 = svadd_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v904 = svsub_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v672, v682); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v672, v682); + svfloat32_t v693 = svadd_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v694 = svsub_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v896 = svadd_f32_x(svptrue_b32(), v883, v895); + svfloat32_t v898 = svadd_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v899 = svsub_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v900 = 
svadd_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v901 = svsub_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v902 = svadd_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v903 = svsub_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v883, v895); svst1_f64(pred_full, (double *)(v1346), svreinterpret_f64_f32(v695)); svst1_f64(pred_full, (double *)(v1355), svreinterpret_f64_f32(v904)); svst1_f64(pred_full, (double *)(v1472), svreinterpret_f64_f32(v688)); @@ -14445,8 +12723,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v169])); svfloat32_t v205 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v204])); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v984, v212, 0), v984, v212, 90); @@ -14520,115 +12797,83 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1092), v1112)); svfloat32_t v1103 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1101), v1112)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v903, v51, 0), v903, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v912, v58, 0), v912, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v921, v93, 0), v921, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = 
svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v930, v100, 0), v930, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v948, v149, 0), v948, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v957, v156, 0), v957, v156, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v975, v205, 0), v975, v205, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, v1004, v261, 0), v1004, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1013, v268, 0), v1013, v268, 90); - svfloat32_t zero318; - asm volatile("mov %0.s, #0" : "=w"(zero318)); + svfloat32_t zero318 = svdup_n_f32(0); svfloat32_t v318 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero318, v1031, v317, 0), v1031, v317, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1040, v324, 0), v1040, v324, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1058, v373, 0), v1058, v373, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = 
svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero381, v1067, v380, 0), v1067, v380, 90); - svfloat32_t zero430; - asm volatile("mov %0.s, #0" : "=w"(zero430)); + svfloat32_t zero430 = svdup_n_f32(0); svfloat32_t v430 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero430, v1085, v429, 0), v1085, v429, 90); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero437, v1094, v436, 0), v1094, v436, 90); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v52), "w"(v59)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v52), "w"(v59)); - svfloat32_t v462; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v94), "w"(v101)); - svfloat32_t v463; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v94), "w"(v101)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v150), "w"(v157)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v150), "w"(v157)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v206), "w"(v213)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v206), "w"(v213)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v262), "w"(v269)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v262), "w"(v269)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v318), "w"(v325)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v318), "w"(v325)); - svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v374), "w"(v381)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v374), "w"(v381)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v430), "w"(v437)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v430), "w"(v437)); - svfloat32_t v461; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v452), "w"(v1113)); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v452, v1113); svfloat32_t v464 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v462, v939, v114, 0), v939, v114, 90); @@ -14650,223 +12895,128 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svfloat32_t v482 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v480, v1103, v450, 0), v1103, v450, 90); - svfloat32_t v555; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v452), "w"(v471)); - svfloat32_t v556; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v452), "w"(v471)); - svfloat32_t v557; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v465), "w"(v477)); - svfloat32_t v558; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v465), "w"(v477)); - svfloat32_t v559; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v462), "w"(v474)); - svfloat32_t v560; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v462), "w"(v474)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v561) : "w"(v468), "w"(v480)); - svfloat32_t v562; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v468), "w"(v480)); - svfloat32_t v627; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v453), "w"(v472)); - svfloat32_t v628; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v628) : "w"(v453), "w"(v472)); - svfloat32_t v629; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v629) : "w"(v466), "w"(v478)); - svfloat32_t v630; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v630) : "w"(v466), "w"(v478)); - svfloat32_t v631; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v463), "w"(v475)); - svfloat32_t v632; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v632) : "w"(v463), "w"(v475)); - svfloat32_t v633; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v633) : "w"(v469), "w"(v481)); - svfloat32_t v634; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v469), "w"(v481)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v461), "w"(v473)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v461), "w"(v473)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v467), "w"(v479)); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v467), "w"(v479)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v464), "w"(v476)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v464), "w"(v476)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v470), "w"(v482)); - svfloat32_t v490; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v470), "w"(v482)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v555), "w"(v557)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v555), "w"(v557)); - svfloat32_t v565; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v559), "w"(v561)); - svfloat32_t v566; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v559), "w"(v561)); - svfloat32_t v569; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v560), "w"(v562)); - svfloat32_t v570; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v560), "w"(v562)); - svfloat32_t zero604; - asm volatile("mov %0.s, #0" : "=w"(zero604)); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v558 = svsub_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v462, v474); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v462, v474); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v630 = svsub_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v631 = svadd_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v634 = svsub_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v565 = svadd_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t zero604 = 
svdup_n_f32(0); svfloat32_t v604 = svcmla_f32_x(pred_full, zero604, v1127, v558, 90); - svfloat32_t v635; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v627), "w"(v629)); - svfloat32_t v636; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v627), "w"(v629)); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v631), "w"(v633)); - svfloat32_t v638; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v638) : "w"(v631), "w"(v633)); - svfloat32_t v641; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v641) : "w"(v632), "w"(v634)); - svfloat32_t v642; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v632), "w"(v634)); - svfloat32_t zero675; - asm volatile("mov %0.s, #0" : "=w"(zero675)); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v638 = svsub_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v641 = svadd_f32_x(svptrue_b32(), v632, v634); + svfloat32_t v642 = svsub_f32_x(svptrue_b32(), v632, v634); + svfloat32_t zero675 = svdup_n_f32(0); svfloat32_t v675 = svcmla_f32_x(pred_full, zero675, v1134, v628, 90); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v483), "w"(v485)); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v483), "w"(v485)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v487), "w"(v489)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v487), "w"(v489)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v488), "w"(v490)); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v488), "w"(v490)); - svfloat32_t zero532; - asm volatile("mov %0.s, #0" : "=w"(zero532)); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v487, 
v489); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t zero532 = svdup_n_f32(0); svfloat32_t v532 = svcmla_f32_x(pred_full, zero532, v1119, v486, 90); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v563), "w"(v565)); - svfloat32_t v568; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v563), "w"(v565)); - svfloat32_t zero592; - asm volatile("mov %0.s, #0" : "=w"(zero592)); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v563, v565); + svfloat32_t v568 = svsub_f32_x(svptrue_b32(), v563, v565); + svfloat32_t zero592 = svdup_n_f32(0); svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1127, v566, 90); - svfloat32_t zero611; - asm volatile("mov %0.s, #0" : "=w"(zero611)); + svfloat32_t zero611 = svdup_n_f32(0); svfloat32_t v611 = svcmla_f32_x(pred_full, zero611, v1128, v569, 90); - svfloat32_t v616; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v570), "w"(v1129)); - svfloat32_t v639; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v635), "w"(v637)); - svfloat32_t v640; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v640) : "w"(v635), "w"(v637)); - svfloat32_t zero663; - asm volatile("mov %0.s, #0" : "=w"(zero663)); + svfloat32_t v616 = svmul_f32_x(svptrue_b32(), v570, v1129); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v635, v637); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v635, v637); + svfloat32_t zero663 = svdup_n_f32(0); svfloat32_t v663 = svcmla_f32_x(pred_full, zero663, v1134, v636, 90); - svfloat32_t v685; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v685) : "w"(v641), "w"(v1136)); - svfloat32_t zero692; - asm volatile("mov %0.s, #0" : "=w"(zero692)); + svfloat32_t v685 = svmul_f32_x(svptrue_b32(), v641, v1136); + svfloat32_t zero692 = svdup_n_f32(0); svfloat32_t v692 = svcmla_f32_x(pred_full, zero692, v1137, v642, 90); - svfloat32_t v495; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v491), "w"(v493)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v496) : "w"(v491), "w"(v493)); - svfloat32_t zero520; - asm volatile("mov %0.s, #0" : "=w"(zero520)); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v491, v493); + svfloat32_t zero520 = svdup_n_f32(0); svfloat32_t v520 = svcmla_f32_x(pred_full, zero520, v1119, v494, 90); - svfloat32_t zero539; - asm volatile("mov %0.s, #0" : "=w"(zero539)); + svfloat32_t zero539 = svdup_n_f32(0); svfloat32_t v539 = svcmla_f32_x(pred_full, zero539, v1120, v497, 90); svfloat32_t v617 = svmla_f32_x(pred_full, v592, v564, v1126); svfloat32_t v618 = svnmls_f32_x(pred_full, v592, v564, v1126); svfloat32_t v619 = svmla_f32_x(pred_full, v616, v556, v1126); svfloat32_t v620 = svnmls_f32_x(pred_full, v616, v556, v1126); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v604), "w"(v611)); - svfloat32_t v622; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v604), "w"(v611)); - svfloat32_t zero649; - asm volatile("mov %0.s, #0" : "=w"(zero649)); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v604, v611); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v604, v611); + svfloat32_t zero649 = svdup_n_f32(0); svfloat32_t v649 = svcmla_f32_x(pred_full, zero649, v1134, v639, 90); - svfloat32_t zero656; - asm volatile("mov %0.s, #0" : "=w"(zero656)); + svfloat32_t zero656 = svdup_n_f32(0); svfloat32_t v656 = svcmla_f32_x(pred_full, zero656, v1134, v640, 90); svfloat32_t v693 = svmla_f32_x(pred_full, v663, v638, v1135); svfloat32_t v694 = svmls_f32_x(pred_full, v663, v638, v1135); - svfloat32_t v695; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v675), "w"(v692)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v675), "w"(v692)); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v675, v692); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v675, v692); svfloat32_t v697 = svmla_f32_x(pred_full, v685, v630, v1135); svfloat32_t v698 = svnmls_f32_x(pred_full, v685, v630, v1135); - svfloat32_t v545; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v492), "w"(v520)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v492), "w"(v520)); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v492, v520); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v492, v520); svfloat32_t v547 = svmla_f32_x(pred_full, v484, v498, v1121); svfloat32_t v548 = svmls_f32_x(pred_full, v484, v498, v1121); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v532), "w"(v539)); - svfloat32_t v550; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v532), "w"(v539)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v619), "w"(v621)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v619), "w"(v621)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v620), "w"(v622)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v620), "w"(v622)); - svfloat32_t v699; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v695), "w"(v697)); - svfloat32_t v700; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v695), "w"(v697)); - svfloat32_t v701; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v696), "w"(v698)); - svfloat32_t v702; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v696), "w"(v698)); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v700 = svsub_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v696, v698); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v696, v698); svfloat32_t v703 = svmla_f32_x(pred_full, v495, v567, v1126); svfloat32_t v799 = svmla_f32_x(pred_full, 
v496, v568, v1126); svst1_f64(pred_full, (double *)(v1145), svreinterpret_f64_f32(v495)); svst1_f64(pred_full, (double *)(v1253), svreinterpret_f64_f32(v496)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v547), "w"(v549)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v547), "w"(v549)); - svfloat32_t v553; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v548), "w"(v550)); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v548), "w"(v550)); - svfloat32_t v704; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v703), "w"(v649)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v703), "w"(v649)); - svfloat32_t v751; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v751) : "w"(v546), "w"(v618)); - svfloat32_t v800; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v800) : "w"(v799), "w"(v656)); - svfloat32_t v801; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v799), "w"(v656)); - svfloat32_t v847; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v847) : "w"(v545), "w"(v617)); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v751 = svadd_f32_x(svptrue_b32(), v546, v618); + svfloat32_t v800 = svadd_f32_x(svptrue_b32(), v799, v656); + svfloat32_t v801 = svsub_f32_x(svptrue_b32(), v799, v656); + svfloat32_t v847 = svadd_f32_x(svptrue_b32(), v545, v617); svst1_f64(pred_full, (double *)(v1199), svreinterpret_f64_f32(v546)); svst1_f64(pred_full, (double *)(v1307), svreinterpret_f64_f32(v545)); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v552), "w"(v624)); - svfloat32_t v752; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v752) : "w"(v751), "w"(v694)); - svfloat32_t v753; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v753) : "w"(v751), "w"(v694)); - svfloat32_t v775; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v553), "w"(v625)); - svfloat32_t v823; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v823) : "w"(v554), "w"(v626)); - svfloat32_t v848; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v847), "w"(v693)); - svfloat32_t v849; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v849) : "w"(v847), "w"(v693)); - svfloat32_t v871; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v871) : "w"(v551), "w"(v623)); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v552, v624); + svfloat32_t v752 = svadd_f32_x(svptrue_b32(), v751, v694); + svfloat32_t v753 = svsub_f32_x(svptrue_b32(), v751, v694); + svfloat32_t v775 = svadd_f32_x(svptrue_b32(), v553, v625); + svfloat32_t v823 = svadd_f32_x(svptrue_b32(), v554, v626); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v847, v693); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v847, v693); + svfloat32_t v871 = svadd_f32_x(svptrue_b32(), v551, v623); svst1_f64(pred_full, (double *)(v1154), svreinterpret_f64_f32(v705)); svst1_f64(pred_full, (double *)(v1163), svreinterpret_f64_f32(v704)); svst1_f64(pred_full, (double *)(v1172), svreinterpret_f64_f32(v552)); @@ -14875,22 +13025,14 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v1271), svreinterpret_f64_f32(v800)); svst1_f64(pred_full, (double *)(v1280), svreinterpret_f64_f32(v554)); svst1_f64(pred_full, (double *)(v1334), svreinterpret_f64_f32(v551)); - svfloat32_t v728; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v728) : "w"(v727), "w"(v700)); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v727), "w"(v700)); - svfloat32_t v776; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v775), "w"(v701)); - svfloat32_t v777; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v777) : "w"(v775), "w"(v701)); - svfloat32_t v824; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v823), "w"(v702)); - svfloat32_t v825; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v825) : "w"(v823), "w"(v702)); - svfloat32_t v872; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v872) : "w"(v871), "w"(v699)); - svfloat32_t v873; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v871), "w"(v699)); + svfloat32_t v728 = svadd_f32_x(svptrue_b32(), v727, v700); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v727, v700); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v775, v701); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v775, v701); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v823, v702); + svfloat32_t v825 = svsub_f32_x(svptrue_b32(), v823, v702); + svfloat32_t v872 = svadd_f32_x(svptrue_b32(), v871, v699); + svfloat32_t v873 = svsub_f32_x(svptrue_b32(), v871, v699); svst1_f64(pred_full, (double *)(v1208), svreinterpret_f64_f32(v753)); svst1_f64(pred_full, (double *)(v1217), svreinterpret_f64_f32(v752)); svst1_f64(pred_full, (double *)(v1316), svreinterpret_f64_f32(v849)); @@ -14921,7 +13063,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v7 = (const float32x2_t *)w; for (int j = 0; j < howmany; j += 1) { float32x2_t v92 = v5[istride]; - float v1070 = 0.0000000000000000e+00F; float v1163 = 9.6858316112863108e-01F; float v1166 = -2.4868988716485479e-01F; float v1167 = 2.4868988716485479e-01F; @@ -14956,7 +13097,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v98 = vtrn1_f32(v92, v92); float32x2_t v99 = vtrn2_f32(v92, v92); float32x2_t v452 = v5[0]; - float v1073 = dir * v1070; float32x2_t v1164 = (float32x2_t){v1163, v1163}; float32x2_t v1168 = (float32x2_t){v1166, v1167}; float32x2_t v1303 = (float32x2_t){v1302, v1302}; @@ -15027,7 +13167,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, int64_t v420 = 36 + j * 48; float32x2_t v434 = v5[istride * 24]; int64_t v438 = 46 + j * 48; - float32x2_t v1071 = (float32x2_t){v1070, v1073}; float32x2_t v1170 = vmul_f32(v1688, v1168); float32x2_t v1309 = 
vmul_f32(v1688, v1307); float32x2_t v1448 = vmul_f32(v1688, v1446); @@ -15202,86 +13341,26 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v411 = vfma_f32(v409, v405, v408); float32x2_t v429 = vfma_f32(v427, v423, v426); float32x2_t v447 = vfma_f32(v445, v441, v444); - float32x2_t v462 = vrev64_f32(v33); - float32x2_t v474 = vrev64_f32(v51); - float32x2_t v486 = vrev64_f32(v87); - float32x2_t v504 = vrev64_f32(v69); - float32x2_t v576 = vrev64_f32(v123); - float32x2_t v588 = vrev64_f32(v141); - float32x2_t v600 = vrev64_f32(v177); - float32x2_t v618 = vrev64_f32(v159); - float32x2_t v690 = vrev64_f32(v213); - float32x2_t v702 = vrev64_f32(v231); - float32x2_t v714 = vrev64_f32(v267); - float32x2_t v732 = vrev64_f32(v249); - float32x2_t v804 = vrev64_f32(v303); - float32x2_t v816 = vrev64_f32(v321); - float32x2_t v828 = vrev64_f32(v357); - float32x2_t v846 = vrev64_f32(v339); - float32x2_t v918 = vrev64_f32(v393); - float32x2_t v930 = vrev64_f32(v411); - float32x2_t v942 = vrev64_f32(v447); - float32x2_t v960 = vrev64_f32(v429); - float32x2_t v463 = vmul_f32(v462, v1071); - float32x2_t v475 = vmul_f32(v474, v1071); - float32x2_t v487 = vmul_f32(v486, v1071); - float32x2_t v505 = vmul_f32(v504, v1071); - float32x2_t v577 = vmul_f32(v576, v1071); - float32x2_t v589 = vmul_f32(v588, v1071); - float32x2_t v601 = vmul_f32(v600, v1071); - float32x2_t v619 = vmul_f32(v618, v1071); - float32x2_t v691 = vmul_f32(v690, v1071); - float32x2_t v703 = vmul_f32(v702, v1071); - float32x2_t v715 = vmul_f32(v714, v1071); - float32x2_t v733 = vmul_f32(v732, v1071); - float32x2_t v805 = vmul_f32(v804, v1071); - float32x2_t v817 = vmul_f32(v816, v1071); - float32x2_t v829 = vmul_f32(v828, v1071); - float32x2_t v847 = vmul_f32(v846, v1071); - float32x2_t v919 = vmul_f32(v918, v1071); - float32x2_t v931 = vmul_f32(v930, v1071); - float32x2_t v943 = vmul_f32(v942, v1071); - float32x2_t v961 = vmul_f32(v960, v1071); - float32x2_t v464 = 
vadd_f32(v463, v33); - float32x2_t v476 = vadd_f32(v475, v51); - float32x2_t v488 = vadd_f32(v487, v87); - float32x2_t v506 = vadd_f32(v505, v69); - float32x2_t v578 = vadd_f32(v577, v123); - float32x2_t v590 = vadd_f32(v589, v141); - float32x2_t v602 = vadd_f32(v601, v177); - float32x2_t v620 = vadd_f32(v619, v159); - float32x2_t v692 = vadd_f32(v691, v213); - float32x2_t v704 = vadd_f32(v703, v231); - float32x2_t v716 = vadd_f32(v715, v267); - float32x2_t v734 = vadd_f32(v733, v249); - float32x2_t v806 = vadd_f32(v805, v303); - float32x2_t v818 = vadd_f32(v817, v321); - float32x2_t v830 = vadd_f32(v829, v357); - float32x2_t v848 = vadd_f32(v847, v339); - float32x2_t v920 = vadd_f32(v919, v393); - float32x2_t v932 = vadd_f32(v931, v411); - float32x2_t v944 = vadd_f32(v943, v447); - float32x2_t v962 = vadd_f32(v961, v429); - float32x2_t v489 = vsub_f32(v464, v488); - float32x2_t v493 = vmul_f32(v464, v1710); - float32x2_t v507 = vsub_f32(v476, v506); - float32x2_t v511 = vmul_f32(v476, v1710); - float32x2_t v603 = vsub_f32(v578, v602); - float32x2_t v607 = vmul_f32(v578, v1710); - float32x2_t v621 = vsub_f32(v590, v620); - float32x2_t v625 = vmul_f32(v590, v1710); - float32x2_t v717 = vsub_f32(v692, v716); - float32x2_t v721 = vmul_f32(v692, v1710); - float32x2_t v735 = vsub_f32(v704, v734); - float32x2_t v739 = vmul_f32(v704, v1710); - float32x2_t v831 = vsub_f32(v806, v830); - float32x2_t v835 = vmul_f32(v806, v1710); - float32x2_t v849 = vsub_f32(v818, v848); - float32x2_t v853 = vmul_f32(v818, v1710); - float32x2_t v945 = vsub_f32(v920, v944); - float32x2_t v949 = vmul_f32(v920, v1710); - float32x2_t v963 = vsub_f32(v932, v962); - float32x2_t v967 = vmul_f32(v932, v1710); + float32x2_t v489 = vsub_f32(v33, v87); + float32x2_t v493 = vmul_f32(v33, v1710); + float32x2_t v507 = vsub_f32(v51, v69); + float32x2_t v511 = vmul_f32(v51, v1710); + float32x2_t v603 = vsub_f32(v123, v177); + float32x2_t v607 = vmul_f32(v123, v1710); + float32x2_t v621 = vsub_f32(v141, 
v159); + float32x2_t v625 = vmul_f32(v141, v1710); + float32x2_t v717 = vsub_f32(v213, v267); + float32x2_t v721 = vmul_f32(v213, v1710); + float32x2_t v735 = vsub_f32(v231, v249); + float32x2_t v739 = vmul_f32(v231, v1710); + float32x2_t v831 = vsub_f32(v303, v357); + float32x2_t v835 = vmul_f32(v303, v1710); + float32x2_t v849 = vsub_f32(v321, v339); + float32x2_t v853 = vmul_f32(v321, v1710); + float32x2_t v945 = vsub_f32(v393, v447); + float32x2_t v949 = vmul_f32(v393, v1710); + float32x2_t v963 = vsub_f32(v411, v429); + float32x2_t v967 = vmul_f32(v411, v1710); float32x2_t v494 = vsub_f32(v493, v489); float32x2_t v512 = vsub_f32(v511, v507); float32x2_t v523 = vmul_f32(v507, v1663); @@ -15362,10 +13441,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v975 = vsub_f32(v375, v974); float32x2_t v1003 = vmul_f32(v1002, v1689); float32x2_t v1011 = vmul_f32(v1010, v1689); - float32x2_t v1032 = vrev64_f32(v654); - float32x2_t v1044 = vrev64_f32(v768); - float32x2_t v1056 = vrev64_f32(v996); - float32x2_t v1074 = vrev64_f32(v882); + float32x2_t v1059 = vsub_f32(v654, v996); + float32x2_t v1063 = vmul_f32(v654, v1710); + float32x2_t v1077 = vsub_f32(v768, v882); + float32x2_t v1081 = vmul_f32(v768, v1710); float32x2_t v529 = vsub_f32(v519, v528); float32x2_t v533 = vmul_f32(v519, v1710); float32x2_t v643 = vsub_f32(v633, v642); @@ -15376,10 +13455,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v875 = vmul_f32(v861, v1710); float32x2_t v985 = vsub_f32(v975, v984); float32x2_t v989 = vmul_f32(v975, v1710); - float32x2_t v1033 = vmul_f32(v1032, v1071); - float32x2_t v1045 = vmul_f32(v1044, v1071); - float32x2_t v1057 = vmul_f32(v1056, v1071); - float32x2_t v1075 = vmul_f32(v1074, v1071); + float32x2_t v1064 = vsub_f32(v1063, v1059); + float32x2_t v1082 = vsub_f32(v1081, v1077); + float32x2_t v1093 = vmul_f32(v1077, v1663); + float32x2_t v1108 = vmul_f32(v1059, v1663); 
float32x2_t v534 = vsub_f32(v533, v529); float32x2_t v556 = vsub_f32(v529, v555); float32x2_t v560 = vmul_f32(v529, v1710); @@ -15395,10 +13474,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v990 = vsub_f32(v989, v985); float32x2_t v1012 = vsub_f32(v985, v1011); float32x2_t v1016 = vmul_f32(v985, v1710); - float32x2_t v1034 = vadd_f32(v1033, v654); - float32x2_t v1046 = vadd_f32(v1045, v768); - float32x2_t v1058 = vadd_f32(v1057, v996); - float32x2_t v1076 = vadd_f32(v1075, v882); + float32x2_t v1083 = vadd_f32(v1064, v1082); + float32x2_t v1084 = vsub_f32(v1064, v1082); + float32x2_t v1094 = vadd_f32(v1059, v1093); + float32x2_t v1109 = vsub_f32(v1108, v1077); float32x2_t v548 = vsub_f32(v534, v547); float32x2_t v561 = vsub_f32(v560, v556); float32x2_t v565 = vmul_f32(v534, v1710); @@ -15414,10 +13493,11 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1004 = vsub_f32(v990, v1003); float32x2_t v1017 = vsub_f32(v1016, v1012); float32x2_t v1021 = vmul_f32(v990, v1710); - float32x2_t v1059 = vsub_f32(v1034, v1058); - float32x2_t v1063 = vmul_f32(v1034, v1710); - float32x2_t v1077 = vsub_f32(v1046, v1076); - float32x2_t v1081 = vmul_f32(v1046, v1710); + float32x2_t v1088 = vmul_f32(v1083, v1643); + float32x2_t v1098 = vmul_f32(v1084, v1653); + float32x2_t v1110 = vadd_f32(v540, v1083); + float32x2_t v1121 = vrev64_f32(v1094); + float32x2_t v1134 = vrev64_f32(v1109); float32x2_t v1310 = vrev64_f32(v670); float32x2_t v1322 = vrev64_f32(v784); float32x2_t v1334 = vrev64_f32(v1012); @@ -15427,10 +13507,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v794 = vsub_f32(v793, v776); float32x2_t v908 = vsub_f32(v907, v890); float32x2_t v1022 = vsub_f32(v1021, v1004); - float32x2_t v1064 = vsub_f32(v1063, v1059); - float32x2_t v1082 = vsub_f32(v1081, v1077); - float32x2_t v1093 = vmul_f32(v1077, v1663); - float32x2_t v1108 = 
vmul_f32(v1059, v1663); + float32x2_t v1089 = vsub_f32(v540, v1088); + v6[0] = v1110; + float32x2_t v1122 = vmul_f32(v1121, v1689); + float32x2_t v1135 = vmul_f32(v1134, v1689); float32x2_t v1171 = vrev64_f32(v662); float32x2_t v1183 = vrev64_f32(v776); float32x2_t v1195 = vrev64_f32(v1004); @@ -15443,10 +13523,8 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1461 = vrev64_f32(v789); float32x2_t v1473 = vrev64_f32(v1017); float32x2_t v1491 = vrev64_f32(v903); - float32x2_t v1083 = vadd_f32(v1064, v1082); - float32x2_t v1084 = vsub_f32(v1064, v1082); - float32x2_t v1094 = vadd_f32(v1059, v1093); - float32x2_t v1109 = vsub_f32(v1108, v1077); + float32x2_t v1099 = vsub_f32(v1089, v1098); + float32x2_t v1103 = vmul_f32(v1089, v1710); float32x2_t v1172 = vmul_f32(v1171, v1170); float32x2_t v1184 = vmul_f32(v1183, v1309); float32x2_t v1196 = vmul_f32(v1195, v1587); @@ -15463,11 +13541,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1600 = vrev64_f32(v794); float32x2_t v1612 = vrev64_f32(v1022); float32x2_t v1630 = vrev64_f32(v908); - float32x2_t v1088 = vmul_f32(v1083, v1643); - float32x2_t v1098 = vmul_f32(v1084, v1653); - float32x2_t v1110 = vadd_f32(v540, v1083); - float32x2_t v1121 = vrev64_f32(v1094); - float32x2_t v1134 = vrev64_f32(v1109); + float32x2_t v1104 = vsub_f32(v1103, v1099); + float32x2_t v1136 = vsub_f32(v1099, v1135); + float32x2_t v1145 = vmul_f32(v1099, v1710); float32x2_t v1173 = vfma_f32(v1172, v662, v1164); float32x2_t v1185 = vfma_f32(v1184, v776, v1303); float32x2_t v1197 = vfma_f32(v1196, v1004, v1581); @@ -15484,10 +13560,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1601 = vmul_f32(v1600, v1599); float32x2_t v1613 = vmul_f32(v1612, v1611); float32x2_t v1631 = vmul_f32(v1630, v1629); - float32x2_t v1089 = vsub_f32(v540, v1088); - v6[0] = v1110; - float32x2_t v1122 = vmul_f32(v1121, v1689); - 
float32x2_t v1135 = vmul_f32(v1134, v1689); + float32x2_t v1123 = vsub_f32(v1104, v1122); + v6[ostride * 10] = v1136; + float32x2_t v1146 = vsub_f32(v1145, v1136); + float32x2_t v1155 = vmul_f32(v1104, v1710); float32x2_t v1198 = vsub_f32(v1173, v1197); float32x2_t v1202 = vmul_f32(v1173, v1710); float32x2_t v1216 = vsub_f32(v1185, v1215); @@ -15504,8 +13580,9 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1602 = vfma_f32(v1601, v794, v1593); float32x2_t v1614 = vfma_f32(v1613, v1022, v1605); float32x2_t v1632 = vfma_f32(v1631, v908, v1623); - float32x2_t v1099 = vsub_f32(v1089, v1098); - float32x2_t v1103 = vmul_f32(v1089, v1710); + v6[ostride * 5] = v1123; + v6[ostride * 15] = v1146; + float32x2_t v1156 = vsub_f32(v1155, v1123); float32x2_t v1203 = vsub_f32(v1202, v1198); float32x2_t v1221 = vsub_f32(v1220, v1216); float32x2_t v1232 = vmul_f32(v1216, v1663); @@ -15522,9 +13599,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1619 = vmul_f32(v1590, v1710); float32x2_t v1633 = vsub_f32(v1602, v1632); float32x2_t v1637 = vmul_f32(v1602, v1710); - float32x2_t v1104 = vsub_f32(v1103, v1099); - float32x2_t v1136 = vsub_f32(v1099, v1135); - float32x2_t v1145 = vmul_f32(v1099, v1710); + v6[ostride * 20] = v1156; float32x2_t v1222 = vadd_f32(v1203, v1221); float32x2_t v1223 = vsub_f32(v1203, v1221); float32x2_t v1233 = vadd_f32(v1198, v1232); @@ -15542,10 +13617,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1638 = vsub_f32(v1637, v1633); float32x2_t v1649 = vmul_f32(v1633, v1663); float32x2_t v1664 = vmul_f32(v1615, v1663); - float32x2_t v1123 = vsub_f32(v1104, v1122); - v6[ostride * 10] = v1136; - float32x2_t v1146 = vsub_f32(v1145, v1136); - float32x2_t v1155 = vmul_f32(v1104, v1710); float32x2_t v1227 = vmul_f32(v1222, v1643); float32x2_t v1237 = vmul_f32(v1223, v1653); float32x2_t v1249 = vadd_f32(v548, v1222); @@ -15564,9 
+13635,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1640 = vsub_f32(v1620, v1638); float32x2_t v1650 = vadd_f32(v1615, v1649); float32x2_t v1665 = vsub_f32(v1664, v1633); - v6[ostride * 5] = v1123; - v6[ostride * 15] = v1146; - float32x2_t v1156 = vsub_f32(v1155, v1123); float32x2_t v1228 = vsub_f32(v548, v1227); v6[ostride] = v1249; float32x2_t v1261 = vmul_f32(v1260, v1689); @@ -15582,7 +13650,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1666 = vadd_f32(v566, v1639); float32x2_t v1677 = vrev64_f32(v1650); float32x2_t v1690 = vrev64_f32(v1665); - v6[ostride * 20] = v1156; float32x2_t v1238 = vsub_f32(v1228, v1237); float32x2_t v1242 = vmul_f32(v1228, v1710); float32x2_t v1382 = vsub_f32(v1381, v1377); @@ -15683,7 +13750,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float v1713 = 2.5000000000000000e-01F; float v1725 = 5.5901699437494745e-01F; float v1737 = 6.1803398874989490e-01F; - float v1765 = 0.0000000000000000e+00F; float v1766 = -9.5105651629515353e-01F; float v1794 = 2.0000000000000000e+00F; const float32x2_t *v1848 = &v5[v0]; @@ -15734,7 +13800,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, int64_t v341 = v0 * 24; int64_t v349 = v10 * 23; int64_t v350 = v13 * 24; - float v1051 = v4 * v1765; int64_t v1111 = v2 * 5; int64_t v1126 = v2 * 10; int64_t v1139 = v2 * 15; @@ -15770,6 +13835,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, int64_t v1800 = v2 * 24; const float32x2_t *v2030 = &v5[0]; svint64_t v2031 = svindex_s64(0, v1); + svfloat32_t v2136 = svdup_n_f32(0); float32x2_t *v2150 = &v6[0]; svfloat32_t v2193 = svdup_n_f32(v1159); svfloat32_t v2257 = svdup_n_f32(v1321); @@ -15835,7 +13901,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v2020 = &v5[v341]; svfloat32_t v2032 = 
svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v2030), v2031)); - svfloat32_t v2136 = svdup_n_f32(v1051); float32x2_t *v2160 = &v6[v1111]; float32x2_t *v2170 = &v6[v1126]; float32x2_t *v2180 = &v6[v1139]; @@ -15877,8 +13942,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); svfloat32_t v72 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); - svfloat32_t zero87; - asm volatile("mov %0.s, #0" : "=w"(zero87)); + svfloat32_t zero87 = svdup_n_f32(0); svfloat32_t v87 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero87, v1850, v86, 0), v1850, v86, 90); @@ -15966,118 +14030,95 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v2011), v2031)); svfloat32_t v2022 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v2020), v2031)); - svfloat32_t zero31; - asm volatile("mov %0.s, #0" : "=w"(zero31)); + svfloat32_t zero31 = svdup_n_f32(0); svfloat32_t v31 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero31, v1814, v30, 0), v1814, v30, 90); - svfloat32_t zero45; - asm volatile("mov %0.s, #0" : "=w"(zero45)); + svfloat32_t zero45 = svdup_n_f32(0); svfloat32_t v45 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero45, v1823, v44, 0), v1823, v44, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v1832, v58, 0), v1832, v58, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1841, v72, 0), v1841, v72, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = 
svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero101, v1860, v100, 0), v1860, v100, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero115, v1869, v114, 0), v1869, v114, 90); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero129, v1878, v128, 0), v1878, v128, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1887, v142, 0), v1887, v142, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero157, v1896, v156, 0), v1896, v156, 90); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero171, v1905, v170, 0), v1905, v170, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1914, v184, 0), v1914, v184, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero199, v1923, v198, 0), v1923, v198, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero213, v1932, v212, 0), v1932, v212, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t 
v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1941, v226, 0), v1941, v226, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero241, v1950, v240, 0), v1950, v240, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero255, v1959, v254, 0), v1959, v254, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1968, v268, 0), v1968, v268, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero283, v1977, v282, 0), v1977, v282, 90); - svfloat32_t zero297; - asm volatile("mov %0.s, #0" : "=w"(zero297)); + svfloat32_t zero297 = svdup_n_f32(0); svfloat32_t v297 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero297, v1986, v296, 0), v1986, v296, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero311, v1995, v310, 0), v1995, v310, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v2004, v324, 0), v2004, v324, 90); - svfloat32_t zero339; - asm volatile("mov %0.s, #0" : "=w"(zero339)); + svfloat32_t zero339 = svdup_n_f32(0); svfloat32_t v339 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero339, v2013, v338, 0), v2013, v338, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); 
svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v2022, v352, 0), v2022, v352, 90); @@ -16101,26 +14142,16 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v894 = svcmla_f32_x(pred_full, v325, v2136, v325, 90); svfloat32_t v907 = svcmla_f32_x(pred_full, v353, v2136, v353, 90); svfloat32_t v927 = svcmla_f32_x(pred_full, v339, v2136, v339, 90); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v373), "w"(v399)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v386), "w"(v419)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v500), "w"(v526)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v513), "w"(v546)); - svfloat32_t v654; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v654) : "w"(v627), "w"(v653)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v640), "w"(v673)); - svfloat32_t v781; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v754), "w"(v780)); - svfloat32_t v801; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v767), "w"(v800)); - svfloat32_t v908; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v908) : "w"(v881), "w"(v907)); - svfloat32_t v928; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v928) : "w"(v894), "w"(v927)); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v373, v399); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v386, v419); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v500, v526); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v513, v546); + svfloat32_t v654 = svsub_f32_x(svptrue_b32(), v627, v653); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v640, v673); + svfloat32_t v781 = svsub_f32_x(svptrue_b32(), v754, v780); + svfloat32_t v801 = svsub_f32_x(svptrue_b32(), v767, v800); + svfloat32_t v908 = svsub_f32_x(svptrue_b32(), v881, v907); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v894, v927); svfloat32_t v406 = svnmls_f32_x(pred_full, v400, v373, v2439); svfloat32_t v426 = 
svnmls_f32_x(pred_full, v420, v386, v2439); svfloat32_t v533 = svnmls_f32_x(pred_full, v527, v500, v2439); @@ -16131,75 +14162,50 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v807 = svnmls_f32_x(pred_full, v801, v767, v2439); svfloat32_t v914 = svnmls_f32_x(pred_full, v908, v881, v2439); svfloat32_t v934 = svnmls_f32_x(pred_full, v928, v894, v2439); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v406), "w"(v426)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v406), "w"(v426)); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v406, v426); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v406, v426); svfloat32_t v440 = svmla_f32_x(pred_full, v400, v420, v2399); svfloat32_t v458 = svnmls_f32_x(pred_full, v420, v400, v2399); - svfloat32_t v554; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v533), "w"(v553)); - svfloat32_t v555; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v533), "w"(v553)); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v533, v553); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v533, v553); svfloat32_t v567 = svmla_f32_x(pred_full, v527, v547, v2399); svfloat32_t v585 = svnmls_f32_x(pred_full, v547, v527, v2399); - svfloat32_t v681; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v660), "w"(v680)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v660), "w"(v680)); + svfloat32_t v681 = svadd_f32_x(svptrue_b32(), v660, v680); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v660, v680); svfloat32_t v694 = svmla_f32_x(pred_full, v654, v674, v2399); svfloat32_t v712 = svnmls_f32_x(pred_full, v674, v654, v2399); - svfloat32_t v808; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v808) : "w"(v787), "w"(v807)); - svfloat32_t v809; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v809) : "w"(v787), "w"(v807)); + svfloat32_t v808 = svadd_f32_x(svptrue_b32(), v787, v807); + svfloat32_t v809 = svsub_f32_x(svptrue_b32(), v787, v807); svfloat32_t v821 = 
svmla_f32_x(pred_full, v781, v801, v2399); svfloat32_t v839 = svnmls_f32_x(pred_full, v801, v781, v2399); - svfloat32_t v935; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v935) : "w"(v914), "w"(v934)); - svfloat32_t v936; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v936) : "w"(v914), "w"(v934)); + svfloat32_t v935 = svadd_f32_x(svptrue_b32(), v914, v934); + svfloat32_t v936 = svsub_f32_x(svptrue_b32(), v914, v934); svfloat32_t v948 = svmla_f32_x(pred_full, v908, v928, v2399); svfloat32_t v966 = svnmls_f32_x(pred_full, v928, v908, v2399); - svfloat32_t v459; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v2032), "w"(v427)); - svfloat32_t zero466; - asm volatile("mov %0.s, #0" : "=w"(zero466)); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v2032, v427); + svfloat32_t zero466 = svdup_n_f32(0); svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v2419, v440, 90); - svfloat32_t zero474; - asm volatile("mov %0.s, #0" : "=w"(zero474)); + svfloat32_t zero474 = svdup_n_f32(0); svfloat32_t v474 = svcmla_f32_x(pred_full, zero474, v2419, v458, 90); - svfloat32_t v586; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v87), "w"(v554)); - svfloat32_t zero593; - asm volatile("mov %0.s, #0" : "=w"(zero593)); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v87, v554); + svfloat32_t zero593 = svdup_n_f32(0); svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v2419, v567, 90); - svfloat32_t zero601; - asm volatile("mov %0.s, #0" : "=w"(zero601)); + svfloat32_t zero601 = svdup_n_f32(0); svfloat32_t v601 = svcmla_f32_x(pred_full, zero601, v2419, v585, 90); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v157), "w"(v681)); - svfloat32_t zero720; - asm volatile("mov %0.s, #0" : "=w"(zero720)); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v157, v681); + svfloat32_t zero720 = svdup_n_f32(0); svfloat32_t v720 = svcmla_f32_x(pred_full, zero720, v2419, v694, 90); - svfloat32_t zero728; - asm volatile("mov %0.s, #0" : "=w"(zero728)); + svfloat32_t zero728 = svdup_n_f32(0); 
svfloat32_t v728 = svcmla_f32_x(pred_full, zero728, v2419, v712, 90); - svfloat32_t v840; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v227), "w"(v808)); - svfloat32_t zero847; - asm volatile("mov %0.s, #0" : "=w"(zero847)); + svfloat32_t v840 = svadd_f32_x(svptrue_b32(), v227, v808); + svfloat32_t zero847 = svdup_n_f32(0); svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v2419, v821, 90); - svfloat32_t zero855; - asm volatile("mov %0.s, #0" : "=w"(zero855)); + svfloat32_t zero855 = svdup_n_f32(0); svfloat32_t v855 = svcmla_f32_x(pred_full, zero855, v2419, v839, 90); - svfloat32_t v967; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v297), "w"(v935)); - svfloat32_t zero974; - asm volatile("mov %0.s, #0" : "=w"(zero974)); + svfloat32_t v967 = svadd_f32_x(svptrue_b32(), v297, v935); + svfloat32_t zero974 = svdup_n_f32(0); svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2419, v948, 90); - svfloat32_t zero982; - asm volatile("mov %0.s, #0" : "=w"(zero982)); + svfloat32_t zero982 = svdup_n_f32(0); svfloat32_t v982 = svcmla_f32_x(pred_full, zero982, v2419, v966, 90); svfloat32_t v434 = svmls_f32_x(pred_full, v2032, v427, v2395); svfloat32_t v561 = svmls_f32_x(pred_full, v87, v554, v2395); @@ -16216,119 +14222,80 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1034 = svcmla_f32_x(pred_full, v967, v2136, v967, 90); svfloat32_t v1054 = svcmla_f32_x(pred_full, v840, v2136, v840, 90); svfloat32_t v452 = svnmls_f32_x(pred_full, v446, v434, v2439); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v446), "w"(v474)); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v446, v474); svfloat32_t v579 = svnmls_f32_x(pred_full, v573, v561, v2439); - svfloat32_t v602; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v573), "w"(v601)); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v573, v601); svfloat32_t v706 = svnmls_f32_x(pred_full, v700, v688, v2439); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v729) : "w"(v700), "w"(v728)); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v700, v728); svfloat32_t v833 = svnmls_f32_x(pred_full, v827, v815, v2439); - svfloat32_t v856; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v827), "w"(v855)); + svfloat32_t v856 = svsub_f32_x(svptrue_b32(), v827, v855); svfloat32_t v960 = svnmls_f32_x(pred_full, v954, v942, v2439); - svfloat32_t v983; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v983) : "w"(v954), "w"(v982)); - svfloat32_t v1035; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1035) : "w"(v1008), "w"(v1034)); - svfloat32_t v1055; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1055) : "w"(v1021), "w"(v1054)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v452), "w"(v466)); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v954, v982); + svfloat32_t v1035 = svsub_f32_x(svptrue_b32(), v1008, v1034); + svfloat32_t v1055 = svsub_f32_x(svptrue_b32(), v1021, v1054); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v452, v466); svfloat32_t v481 = svnmls_f32_x(pred_full, v475, v446, v2439); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v579), "w"(v593)); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v579, v593); svfloat32_t v608 = svnmls_f32_x(pred_full, v602, v573, v2439); - svfloat32_t v721; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v706), "w"(v720)); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v706, v720); svfloat32_t v735 = svnmls_f32_x(pred_full, v729, v700, v2439); - svfloat32_t v848; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v833), "w"(v847)); + svfloat32_t v848 = svsub_f32_x(svptrue_b32(), v833, v847); svfloat32_t v862 = svnmls_f32_x(pred_full, v856, v827, v2439); - svfloat32_t v975; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v975) : "w"(v960), "w"(v974)); + svfloat32_t v975 = svsub_f32_x(svptrue_b32(), v960, v974); svfloat32_t v989 = svnmls_f32_x(pred_full, v983, v954, v2439); svfloat32_t v1041 = svnmls_f32_x(pred_full, v1035, v1008, v2439); svfloat32_t v1061 = 
svnmls_f32_x(pred_full, v1055, v1021, v2439); - svfloat32_t v1324; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1324) : "w"(v602), "w"(v2257)); - svfloat32_t v1337; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1337) : "w"(v729), "w"(v2385)); - svfloat32_t v1350; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1350) : "w"(v983), "w"(v2387)); - svfloat32_t v1370; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1370) : "w"(v856), "w"(v2323)); + svfloat32_t v1324 = svmul_f32_x(svptrue_b32(), v602, v2257); + svfloat32_t v1337 = svmul_f32_x(svptrue_b32(), v729, v2385); + svfloat32_t v1350 = svmul_f32_x(svptrue_b32(), v983, v2387); + svfloat32_t v1370 = svmul_f32_x(svptrue_b32(), v856, v2323); svfloat32_t v487 = svnmls_f32_x(pred_full, v467, v452, v2439); svfloat32_t v614 = svnmls_f32_x(pred_full, v594, v579, v2439); svfloat32_t v741 = svnmls_f32_x(pred_full, v721, v706, v2439); svfloat32_t v868 = svnmls_f32_x(pred_full, v848, v833, v2439); svfloat32_t v995 = svnmls_f32_x(pred_full, v975, v960, v2439); - svfloat32_t v1062; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1062) : "w"(v1041), "w"(v1061)); - svfloat32_t v1063; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1063) : "w"(v1041), "w"(v1061)); + svfloat32_t v1062 = svadd_f32_x(svptrue_b32(), v1041, v1061); + svfloat32_t v1063 = svsub_f32_x(svptrue_b32(), v1041, v1061); svfloat32_t v1075 = svmla_f32_x(pred_full, v1035, v1055, v2399); svfloat32_t v1093 = svnmls_f32_x(pred_full, v1055, v1035, v2399); - svfloat32_t v1162; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1162) : "w"(v594), "w"(v2193)); - svfloat32_t v1175; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1175) : "w"(v721), "w"(v2257)); - svfloat32_t v1188; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1188) : "w"(v975), "w"(v2385)); - svfloat32_t v1208; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1208) : "w"(v848), "w"(v2321)); + svfloat32_t v1162 = svmul_f32_x(svptrue_b32(), v594, v2193); + svfloat32_t v1175 = svmul_f32_x(svptrue_b32(), v721, v2257); + svfloat32_t v1188 = svmul_f32_x(svptrue_b32(), v975, v2385); + svfloat32_t v1208 = 
svmul_f32_x(svptrue_b32(), v848, v2321); svfloat32_t v1332 = svcmla_f32_x(pred_full, v1324, v2258, v602, 90); svfloat32_t v1345 = svcmla_f32_x(pred_full, v1337, v2386, v729, 90); svfloat32_t v1358 = svcmla_f32_x(pred_full, v1350, v2388, v983, 90); svfloat32_t v1378 = svcmla_f32_x(pred_full, v1370, v2324, v856, 90); - svfloat32_t v1486; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1486) : "w"(v608), "w"(v2321)); - svfloat32_t v1499; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1499) : "w"(v735), "w"(v2323)); - svfloat32_t v1512; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1512) : "w"(v989), "w"(v2392)); - svfloat32_t v1532; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1532) : "w"(v862), "w"(v2389)); - svfloat32_t v1094; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1094) : "w"(v459), "w"(v1062)); - svfloat32_t zero1108; - asm volatile("mov %0.s, #0" : "=w"(zero1108)); + svfloat32_t v1486 = svmul_f32_x(svptrue_b32(), v608, v2321); + svfloat32_t v1499 = svmul_f32_x(svptrue_b32(), v735, v2323); + svfloat32_t v1512 = svmul_f32_x(svptrue_b32(), v989, v2392); + svfloat32_t v1532 = svmul_f32_x(svptrue_b32(), v862, v2389); + svfloat32_t v1094 = svadd_f32_x(svptrue_b32(), v459, v1062); + svfloat32_t zero1108 = svdup_n_f32(0); svfloat32_t v1108 = svcmla_f32_x(pred_full, zero1108, v2419, v1075, 90); - svfloat32_t zero1123; - asm volatile("mov %0.s, #0" : "=w"(zero1123)); + svfloat32_t zero1123 = svdup_n_f32(0); svfloat32_t v1123 = svcmla_f32_x(pred_full, zero1123, v2419, v1093, 90); svfloat32_t v1170 = svcmla_f32_x(pred_full, v1162, v2194, v594, 90); svfloat32_t v1183 = svcmla_f32_x(pred_full, v1175, v2258, v721, 90); svfloat32_t v1196 = svcmla_f32_x(pred_full, v1188, v2386, v975, 90); svfloat32_t v1216 = svcmla_f32_x(pred_full, v1208, v2322, v848, 90); - svfloat32_t v1359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1359) : "w"(v1332), "w"(v1358)); - svfloat32_t v1379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1379) : "w"(v1345), "w"(v1378)); + svfloat32_t v1359 = svsub_f32_x(svptrue_b32(), v1332, v1358); + svfloat32_t 
v1379 = svsub_f32_x(svptrue_b32(), v1345, v1378); svfloat32_t v1494 = svcmla_f32_x(pred_full, v1486, v2322, v608, 90); svfloat32_t v1507 = svcmla_f32_x(pred_full, v1499, v2324, v735, 90); svfloat32_t v1520 = svcmla_f32_x(pred_full, v1512, v2393, v989, 90); svfloat32_t v1540 = svcmla_f32_x(pred_full, v1532, v2329, v862, 90); - svfloat32_t v1648; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1648) : "w"(v614), "w"(v2385)); - svfloat32_t v1661; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1661) : "w"(v741), "w"(v2387)); - svfloat32_t v1674; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1674) : "w"(v995), "w"(v2389)); - svfloat32_t v1694; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1694) : "w"(v868), "w"(v2392)); + svfloat32_t v1648 = svmul_f32_x(svptrue_b32(), v614, v2385); + svfloat32_t v1661 = svmul_f32_x(svptrue_b32(), v741, v2387); + svfloat32_t v1674 = svmul_f32_x(svptrue_b32(), v995, v2389); + svfloat32_t v1694 = svmul_f32_x(svptrue_b32(), v868, v2392); svfloat32_t v1069 = svmls_f32_x(pred_full, v459, v1062, v2395); - svfloat32_t v1197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1197) : "w"(v1170), "w"(v1196)); - svfloat32_t v1217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1217) : "w"(v1183), "w"(v1216)); + svfloat32_t v1197 = svsub_f32_x(svptrue_b32(), v1170, v1196); + svfloat32_t v1217 = svsub_f32_x(svptrue_b32(), v1183, v1216); svfloat32_t v1365 = svnmls_f32_x(pred_full, v1359, v1332, v2439); svfloat32_t v1385 = svnmls_f32_x(pred_full, v1379, v1345, v2439); - svfloat32_t v1521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1521) : "w"(v1494), "w"(v1520)); - svfloat32_t v1541; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1541) : "w"(v1507), "w"(v1540)); + svfloat32_t v1521 = svsub_f32_x(svptrue_b32(), v1494, v1520); + svfloat32_t v1541 = svsub_f32_x(svptrue_b32(), v1507, v1540); svfloat32_t v1656 = svcmla_f32_x(pred_full, v1648, v2386, v614, 90); svfloat32_t v1669 = svcmla_f32_x(pred_full, v1661, v2388, v741, 90); svfloat32_t v1682 = svcmla_f32_x(pred_full, v1674, v2390, v995, 90); @@ -16337,67 +14304,46 @@ void 
armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1081 = svmls_f32_x(pred_full, v1069, v1063, v2397); svfloat32_t v1203 = svnmls_f32_x(pred_full, v1197, v1170, v2439); svfloat32_t v1223 = svnmls_f32_x(pred_full, v1217, v1183, v2439); - svfloat32_t v1386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1386) : "w"(v1365), "w"(v1385)); - svfloat32_t v1387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1387) : "w"(v1365), "w"(v1385)); + svfloat32_t v1386 = svadd_f32_x(svptrue_b32(), v1365, v1385); + svfloat32_t v1387 = svsub_f32_x(svptrue_b32(), v1365, v1385); svfloat32_t v1399 = svmla_f32_x(pred_full, v1359, v1379, v2399); svfloat32_t v1417 = svnmls_f32_x(pred_full, v1379, v1359, v2399); svfloat32_t v1527 = svnmls_f32_x(pred_full, v1521, v1494, v2439); svfloat32_t v1547 = svnmls_f32_x(pred_full, v1541, v1507, v2439); - svfloat32_t v1683; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1683) : "w"(v1656), "w"(v1682)); - svfloat32_t v1703; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1703) : "w"(v1669), "w"(v1702)); + svfloat32_t v1683 = svsub_f32_x(svptrue_b32(), v1656, v1682); + svfloat32_t v1703 = svsub_f32_x(svptrue_b32(), v1669, v1702); svfloat32_t v1087 = svnmls_f32_x(pred_full, v1081, v1069, v2439); - svfloat32_t v1124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1124) : "w"(v1081), "w"(v1123)); - svfloat32_t v1224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1224) : "w"(v1203), "w"(v1223)); - svfloat32_t v1225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1225) : "w"(v1203), "w"(v1223)); + svfloat32_t v1124 = svsub_f32_x(svptrue_b32(), v1081, v1123); + svfloat32_t v1224 = svadd_f32_x(svptrue_b32(), v1203, v1223); + svfloat32_t v1225 = svsub_f32_x(svptrue_b32(), v1203, v1223); svfloat32_t v1237 = svmla_f32_x(pred_full, v1197, v1217, v2399); svfloat32_t v1255 = svnmls_f32_x(pred_full, v1217, v1197, v2399); - svfloat32_t v1418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1418) : "w"(v475), "w"(v1386)); - svfloat32_t zero1432; - asm volatile("mov %0.s, #0" : "=w"(zero1432)); + 
svfloat32_t v1418 = svadd_f32_x(svptrue_b32(), v475, v1386); + svfloat32_t zero1432 = svdup_n_f32(0); svfloat32_t v1432 = svcmla_f32_x(pred_full, zero1432, v2419, v1399, 90); - svfloat32_t zero1447; - asm volatile("mov %0.s, #0" : "=w"(zero1447)); + svfloat32_t zero1447 = svdup_n_f32(0); svfloat32_t v1447 = svcmla_f32_x(pred_full, zero1447, v2419, v1417, 90); - svfloat32_t v1548; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1548) : "w"(v1527), "w"(v1547)); - svfloat32_t v1549; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1549) : "w"(v1527), "w"(v1547)); + svfloat32_t v1548 = svadd_f32_x(svptrue_b32(), v1527, v1547); + svfloat32_t v1549 = svsub_f32_x(svptrue_b32(), v1527, v1547); svfloat32_t v1561 = svmla_f32_x(pred_full, v1521, v1541, v2399); svfloat32_t v1579 = svnmls_f32_x(pred_full, v1541, v1521, v2399); svfloat32_t v1689 = svnmls_f32_x(pred_full, v1683, v1656, v2439); svfloat32_t v1709 = svnmls_f32_x(pred_full, v1703, v1669, v2439); - svfloat32_t v1109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1109) : "w"(v1087), "w"(v1108)); + svfloat32_t v1109 = svsub_f32_x(svptrue_b32(), v1087, v1108); svfloat32_t v1137 = svnmls_f32_x(pred_full, v1124, v1081, v2439); - svfloat32_t v1256; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1256) : "w"(v467), "w"(v1224)); - svfloat32_t zero1270; - asm volatile("mov %0.s, #0" : "=w"(zero1270)); + svfloat32_t v1256 = svadd_f32_x(svptrue_b32(), v467, v1224); + svfloat32_t zero1270 = svdup_n_f32(0); svfloat32_t v1270 = svcmla_f32_x(pred_full, zero1270, v2419, v1237, 90); - svfloat32_t zero1285; - asm volatile("mov %0.s, #0" : "=w"(zero1285)); + svfloat32_t zero1285 = svdup_n_f32(0); svfloat32_t v1285 = svcmla_f32_x(pred_full, zero1285, v2419, v1255, 90); svfloat32_t v1393 = svmls_f32_x(pred_full, v475, v1386, v2395); - svfloat32_t v1580; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1580) : "w"(v481), "w"(v1548)); - svfloat32_t zero1594; - asm volatile("mov %0.s, #0" : "=w"(zero1594)); + svfloat32_t v1580 = svadd_f32_x(svptrue_b32(), v481, v1548); + svfloat32_t 
zero1594 = svdup_n_f32(0); svfloat32_t v1594 = svcmla_f32_x(pred_full, zero1594, v2419, v1561, 90); - svfloat32_t zero1609; - asm volatile("mov %0.s, #0" : "=w"(zero1609)); + svfloat32_t zero1609 = svdup_n_f32(0); svfloat32_t v1609 = svcmla_f32_x(pred_full, zero1609, v2419, v1579, 90); - svfloat32_t v1710; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1710) : "w"(v1689), "w"(v1709)); - svfloat32_t v1711; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1711) : "w"(v1689), "w"(v1709)); + svfloat32_t v1710 = svadd_f32_x(svptrue_b32(), v1689, v1709); + svfloat32_t v1711 = svsub_f32_x(svptrue_b32(), v1689, v1709); svfloat32_t v1723 = svmla_f32_x(pred_full, v1683, v1703, v2399); svfloat32_t v1741 = svnmls_f32_x(pred_full, v1703, v1683, v2399); svst1_f64(pred_full, (double *)(v2170), svreinterpret_f64_f32(v1124)); @@ -16406,13 +14352,10 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1231 = svmls_f32_x(pred_full, v467, v1224, v2395); svfloat32_t v1405 = svmls_f32_x(pred_full, v1393, v1387, v2397); svfloat32_t v1555 = svmls_f32_x(pred_full, v481, v1548, v2395); - svfloat32_t v1742; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1742) : "w"(v487), "w"(v1710)); - svfloat32_t zero1756; - asm volatile("mov %0.s, #0" : "=w"(zero1756)); + svfloat32_t v1742 = svadd_f32_x(svptrue_b32(), v487, v1710); + svfloat32_t zero1756 = svdup_n_f32(0); svfloat32_t v1756 = svcmla_f32_x(pred_full, zero1756, v2419, v1723, 90); - svfloat32_t zero1771; - asm volatile("mov %0.s, #0" : "=w"(zero1771)); + svfloat32_t zero1771 = svdup_n_f32(0); svfloat32_t v1771 = svcmla_f32_x(pred_full, zero1771, v2419, v1741, 90); svst1_f64(pred_full, (double *)(v2160), svreinterpret_f64_f32(v1109)); svst1_f64(pred_full, (double *)(v2180), svreinterpret_f64_f32(v1137)); @@ -16420,41 +14363,33 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v2342), svreinterpret_f64_f32(v1580)); svfloat32_t v1243 = svmls_f32_x(pred_full, v1231, 
v1225, v2397); svfloat32_t v1411 = svnmls_f32_x(pred_full, v1405, v1393, v2439); - svfloat32_t v1448; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1448) : "w"(v1405), "w"(v1447)); + svfloat32_t v1448 = svsub_f32_x(svptrue_b32(), v1405, v1447); svfloat32_t v1567 = svmls_f32_x(pred_full, v1555, v1549, v2397); svfloat32_t v1717 = svmls_f32_x(pred_full, v487, v1710, v2395); svst1_f64(pred_full, (double *)(v2190), svreinterpret_f64_f32(v1150)); svst1_f64(pred_full, (double *)(v2406), svreinterpret_f64_f32(v1742)); svfloat32_t v1249 = svnmls_f32_x(pred_full, v1243, v1231, v2439); - svfloat32_t v1286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1286) : "w"(v1243), "w"(v1285)); - svfloat32_t v1433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1433) : "w"(v1411), "w"(v1432)); + svfloat32_t v1286 = svsub_f32_x(svptrue_b32(), v1243, v1285); + svfloat32_t v1433 = svsub_f32_x(svptrue_b32(), v1411, v1432); svfloat32_t v1461 = svnmls_f32_x(pred_full, v1448, v1405, v2439); svfloat32_t v1573 = svnmls_f32_x(pred_full, v1567, v1555, v2439); - svfloat32_t v1610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1610) : "w"(v1567), "w"(v1609)); + svfloat32_t v1610 = svsub_f32_x(svptrue_b32(), v1567, v1609); svfloat32_t v1729 = svmls_f32_x(pred_full, v1717, v1711, v2397); svst1_f64(pred_full, (double *)(v2298), svreinterpret_f64_f32(v1448)); - svfloat32_t v1271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1271) : "w"(v1249), "w"(v1270)); + svfloat32_t v1271 = svsub_f32_x(svptrue_b32(), v1249, v1270); svfloat32_t v1299 = svnmls_f32_x(pred_full, v1286, v1243, v2439); svfloat32_t v1474 = svnmls_f32_x(pred_full, v1433, v1411, v2439); - svfloat32_t v1595; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1595) : "w"(v1573), "w"(v1594)); + svfloat32_t v1595 = svsub_f32_x(svptrue_b32(), v1573, v1594); svfloat32_t v1623 = svnmls_f32_x(pred_full, v1610, v1567, v2439); svfloat32_t v1735 = svnmls_f32_x(pred_full, v1729, v1717, v2439); - svfloat32_t v1772; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1772) : "w"(v1729), "w"(v1771)); + svfloat32_t v1772 = 
svsub_f32_x(svptrue_b32(), v1729, v1771); svst1_f64(pred_full, (double *)(v2234), svreinterpret_f64_f32(v1286)); svst1_f64(pred_full, (double *)(v2288), svreinterpret_f64_f32(v1433)); svst1_f64(pred_full, (double *)(v2308), svreinterpret_f64_f32(v1461)); svst1_f64(pred_full, (double *)(v2362), svreinterpret_f64_f32(v1610)); svfloat32_t v1312 = svnmls_f32_x(pred_full, v1271, v1249, v2439); svfloat32_t v1636 = svnmls_f32_x(pred_full, v1595, v1573, v2439); - svfloat32_t v1757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1757) : "w"(v1735), "w"(v1756)); + svfloat32_t v1757 = svsub_f32_x(svptrue_b32(), v1735, v1756); svfloat32_t v1785 = svnmls_f32_x(pred_full, v1772, v1729, v2439); svst1_f64(pred_full, (double *)(v2224), svreinterpret_f64_f32(v1271)); svst1_f64(pred_full, (double *)(v2244), svreinterpret_f64_f32(v1299)); @@ -17180,7 +15115,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, float v1428 = 9.8078528040323043e-01F; float v1435 = -5.5557023301960218e-01F; float v1440 = -8.3146961230254524e-01F; - float v1451 = 1.0000000000000000e+00F; const float32x2_t *v1630 = &v5[v0]; float32x2_t *v1868 = &v6[v2]; int64_t v19 = v0 * 16; @@ -17278,7 +15212,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, int64_t v1416 = v2 * 30; float v1431 = v4 * v1428; float v1443 = v4 * v1440; - float v1454 = v4 * v1451; int64_t v1462 = v2 * 7; int64_t v1469 = v2 * 15; int64_t v1476 = v2 * 23; @@ -17298,6 +15231,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v2064 = svdup_n_f32(v1368); svfloat32_t v2103 = svdup_n_f32(v1423); svfloat32_t v2105 = svdup_n_f32(v1435); + svfloat32_t v2107 = svdup_n_f32(v4); int64_t v36 = v34 + v595; int64_t v50 = v48 + v595; int64_t v64 = v62 + v595; @@ -17399,7 +15333,6 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, float32x2_t *v2100 = &v6[v1416]; svfloat32_t v2104 = svdup_n_f32(v1431); svfloat32_t v2106 = 
svdup_n_f32(v1443); - svfloat32_t v2107 = svdup_n_f32(v1454); float32x2_t *v2114 = &v6[v1462]; float32x2_t *v2123 = &v6[v1469]; float32x2_t *v2132 = &v6[v1476]; @@ -17434,8 +15367,7 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v274])); svfloat32_t v289 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v288])); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1632, v324, 0), v1632, v324, 90); @@ -17529,443 +15461,282 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1758), v1778)); svfloat32_t v1769 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1767), v1778)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1497, v37, 0), v1497, v37, 90); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v1506, v51, 0), v1506, v51, 90); - svfloat32_t zero66; - asm volatile("mov %0.s, #0" : "=w"(zero66)); + svfloat32_t zero66 = svdup_n_f32(0); svfloat32_t v66 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero66, v1515, v65, 0), v1515, v65, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero101, v1524, v100, 0), v1524, v100, 90); - svfloat32_t zero108; - asm volatile("mov %0.s, #0" : "=w"(zero108)); + svfloat32_t zero108 = svdup_n_f32(0); svfloat32_t v108 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero108, v1533, v107, 0), 
v1533, v107, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1542, v142, 0), v1542, v142, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero150, v1551, v149, 0), v1551, v149, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1560, v184, 0), v1560, v184, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero192, v1569, v191, 0), v1569, v191, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero206, v1578, v205, 0), v1578, v205, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero220, v1587, v219, 0), v1587, v219, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero255, v1596, v254, 0), v1596, v254, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, v1605, v261, 0), v1605, v261, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); + svfloat32_t zero276 = svdup_n_f32(0); svfloat32_t v276 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero276, v1614, 
v275, 0), v1614, v275, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero290, v1623, v289, 0), v1623, v289, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero332, v1642, v331, 0), v1642, v331, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero346, v1652, v345, 0), v1652, v345, 90); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); + svfloat32_t zero360 = svdup_n_f32(0); svfloat32_t v360 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero360, v1661, v359, 0), v1661, v359, 90); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero395, v1670, v394, 0), v1670, v394, 90); - svfloat32_t zero402; - asm volatile("mov %0.s, #0" : "=w"(zero402)); + svfloat32_t zero402 = svdup_n_f32(0); svfloat32_t v402 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero402, v1679, v401, 0), v1679, v401, 90); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero437, v1688, v436, 0), v1688, v436, 90); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero444, v1697, v443, 0), v1697, v443, 90); - svfloat32_t zero479; - asm volatile("mov %0.s, #0" : "=w"(zero479)); + svfloat32_t zero479 = svdup_n_f32(0); svfloat32_t v479 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero479, 
v1706, v478, 0), v1706, v478, 90); - svfloat32_t zero486; - asm volatile("mov %0.s, #0" : "=w"(zero486)); + svfloat32_t zero486 = svdup_n_f32(0); svfloat32_t v486 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero486, v1715, v485, 0), v1715, v485, 90); - svfloat32_t zero500; - asm volatile("mov %0.s, #0" : "=w"(zero500)); + svfloat32_t zero500 = svdup_n_f32(0); svfloat32_t v500 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero500, v1724, v499, 0), v1724, v499, 90); - svfloat32_t zero514; - asm volatile("mov %0.s, #0" : "=w"(zero514)); + svfloat32_t zero514 = svdup_n_f32(0); svfloat32_t v514 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero514, v1733, v513, 0), v1733, v513, 90); - svfloat32_t zero549; - asm volatile("mov %0.s, #0" : "=w"(zero549)); + svfloat32_t zero549 = svdup_n_f32(0); svfloat32_t v549 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero549, v1742, v548, 0), v1742, v548, 90); - svfloat32_t zero556; - asm volatile("mov %0.s, #0" : "=w"(zero556)); + svfloat32_t zero556 = svdup_n_f32(0); svfloat32_t v556 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero556, v1751, v555, 0), v1751, v555, 90); - svfloat32_t zero591; - asm volatile("mov %0.s, #0" : "=w"(zero591)); + svfloat32_t zero591 = svdup_n_f32(0); svfloat32_t v591 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero591, v1760, v590, 0), v1760, v590, 90); - svfloat32_t zero598; - asm volatile("mov %0.s, #0" : "=w"(zero598)); + svfloat32_t zero598 = svdup_n_f32(0); svfloat32_t v598 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero598, v1769, v597, 0), v1769, v597, 90); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v1779), "w"(v38)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v1779), "w"(v38)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v52), "w"(v66)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v52), "w"(v66)); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v621) : "w"(v101), "w"(v108)); - svfloat32_t v622; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v101), "w"(v108)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v143), "w"(v150)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v143), "w"(v150)); - svfloat32_t v677; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v185), "w"(v192)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v185), "w"(v192)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v206), "w"(v220)); - svfloat32_t v680; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v206), "w"(v220)); - svfloat32_t v692; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v255), "w"(v262)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v255), "w"(v262)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v276), "w"(v290)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v276), "w"(v290)); - svfloat32_t v837; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v837) : "w"(v325), "w"(v332)); - svfloat32_t v838; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v325), "w"(v332)); - svfloat32_t v839; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v346), "w"(v360)); - svfloat32_t v840; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v346), "w"(v360)); - svfloat32_t v852; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v852) : "w"(v395), "w"(v402)); - svfloat32_t v853; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v853) : "w"(v395), "w"(v402)); - svfloat32_t v854; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v854) : "w"(v437), "w"(v444)); - svfloat32_t v855; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v855) : "w"(v437), "w"(v444)); - svfloat32_t v908; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v908) : "w"(v479), "w"(v486)); - svfloat32_t v909; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v909) : "w"(v479), "w"(v486)); - svfloat32_t v910; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v910) : "w"(v500), "w"(v514)); - svfloat32_t v911; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v911) : "w"(v500), "w"(v514)); - svfloat32_t v923; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v923) : "w"(v549), "w"(v556)); - svfloat32_t v924; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v924) : "w"(v549), "w"(v556)); - svfloat32_t v925; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v925) : "w"(v591), "w"(v598)); - svfloat32_t v926; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v926) : "w"(v591), "w"(v598)); - svfloat32_t zero616; - asm volatile("mov %0.s, #0" : "=w"(zero616)); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v1779, v38); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v1779, v38); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v677 = svadd_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v206, v220); + svfloat32_t v680 = svsub_f32_x(svptrue_b32(), v206, v220); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v837 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v838 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v852 = svadd_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v853 = svsub_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v854 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v855 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v908 = svadd_f32_x(svptrue_b32(), 
v479, v486); + svfloat32_t v909 = svsub_f32_x(svptrue_b32(), v479, v486); + svfloat32_t v910 = svadd_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v911 = svsub_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v923 = svadd_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v924 = svsub_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v925 = svadd_f32_x(svptrue_b32(), v591, v598); + svfloat32_t v926 = svsub_f32_x(svptrue_b32(), v591, v598); + svfloat32_t zero616 = svdup_n_f32(0); svfloat32_t v616 = svcmla_f32_x(pred_full, zero616, v1983, v609, 90); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v606), "w"(v608)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v606), "w"(v608)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v621), "w"(v623)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v621), "w"(v623)); - svfloat32_t v642; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v622), "w"(v1980)); - svfloat32_t v654; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v654) : "w"(v624), "w"(v1982)); - svfloat32_t zero687; - asm volatile("mov %0.s, #0" : "=w"(zero687)); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v642 = svmul_f32_x(svptrue_b32(), v622, v1980); + svfloat32_t v654 = svmul_f32_x(svptrue_b32(), v624, v1982); + svfloat32_t zero687 = svdup_n_f32(0); svfloat32_t v687 = svcmla_f32_x(pred_full, zero687, v1983, v680, 90); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v677), "w"(v679)); - svfloat32_t v689; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v677), "w"(v679)); - svfloat32_t zero702; - asm volatile("mov %0.s, #0" : "=w"(zero702)); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + 
svfloat32_t zero702 = svdup_n_f32(0); svfloat32_t v702 = svcmla_f32_x(pred_full, zero702, v1983, v695, 90); - svfloat32_t v703; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v692), "w"(v694)); - svfloat32_t v704; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v692), "w"(v694)); - svfloat32_t zero847; - asm volatile("mov %0.s, #0" : "=w"(zero847)); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v692, v694); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v692, v694); + svfloat32_t zero847 = svdup_n_f32(0); svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v1983, v840, 90); - svfloat32_t v848; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v837), "w"(v839)); - svfloat32_t v849; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v849) : "w"(v837), "w"(v839)); - svfloat32_t v856; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v852), "w"(v854)); - svfloat32_t v857; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v857) : "w"(v852), "w"(v854)); - svfloat32_t v873; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v853), "w"(v1980)); - svfloat32_t v885; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v885) : "w"(v855), "w"(v1982)); - svfloat32_t zero918; - asm volatile("mov %0.s, #0" : "=w"(zero918)); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v856 = svadd_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v873 = svmul_f32_x(svptrue_b32(), v853, v1980); + svfloat32_t v885 = svmul_f32_x(svptrue_b32(), v855, v1982); + svfloat32_t zero918 = svdup_n_f32(0); svfloat32_t v918 = svcmla_f32_x(pred_full, zero918, v1983, v911, 90); - svfloat32_t v919; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v919) : "w"(v908), "w"(v910)); - svfloat32_t v920; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v920) : "w"(v908), "w"(v910)); - svfloat32_t v927; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v927) : "w"(v923), "w"(v925)); - svfloat32_t v928; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v928) : 
"w"(v923), "w"(v925)); - svfloat32_t v944; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v944) : "w"(v924), "w"(v1980)); - svfloat32_t v956; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v956) : "w"(v926), "w"(v1982)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v607), "w"(v616)); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v607), "w"(v616)); - svfloat32_t zero633; - asm volatile("mov %0.s, #0" : "=w"(zero633)); + svfloat32_t v919 = svadd_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v920 = svsub_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v927 = svadd_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v944 = svmul_f32_x(svptrue_b32(), v924, v1980); + svfloat32_t v956 = svmul_f32_x(svptrue_b32(), v926, v1982); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v616); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v607, v616); + svfloat32_t zero633 = svdup_n_f32(0); svfloat32_t v633 = svcmla_f32_x(pred_full, zero633, v1983, v626, 90); - svfloat32_t v634; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v617), "w"(v625)); - svfloat32_t v635; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v617), "w"(v625)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v687)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v678), "w"(v687)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v693), "w"(v702)); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v693), "w"(v702)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v688), "w"(v703)); - svfloat32_t v708; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v688), "w"(v703)); - svfloat32_t v763; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v689), "w"(v1980)); - svfloat32_t v775; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v704), "w"(v1982)); - svfloat32_t v850; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v850) : 
"w"(v838), "w"(v847)); - svfloat32_t v851; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v851) : "w"(v838), "w"(v847)); - svfloat32_t zero864; - asm volatile("mov %0.s, #0" : "=w"(zero864)); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v708 = svsub_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v763 = svmul_f32_x(svptrue_b32(), v689, v1980); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v704, v1982); + svfloat32_t v850 = svsub_f32_x(svptrue_b32(), v838, v847); + svfloat32_t v851 = svadd_f32_x(svptrue_b32(), v838, v847); + svfloat32_t zero864 = svdup_n_f32(0); svfloat32_t v864 = svcmla_f32_x(pred_full, zero864, v1983, v857, 90); - svfloat32_t v865; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v865) : "w"(v848), "w"(v856)); - svfloat32_t v866; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v866) : "w"(v848), "w"(v856)); - svfloat32_t v921; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v921) : "w"(v909), "w"(v918)); - svfloat32_t v922; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v922) : "w"(v909), "w"(v918)); - svfloat32_t zero935; - asm volatile("mov %0.s, #0" : "=w"(zero935)); + svfloat32_t v865 = svadd_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v866 = svsub_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v921 = svsub_f32_x(svptrue_b32(), v909, v918); + svfloat32_t v922 = svadd_f32_x(svptrue_b32(), v909, v918); + svfloat32_t zero935 = svdup_n_f32(0); svfloat32_t v935 = svcmla_f32_x(pred_full, zero935, v1983, v928, 90); - svfloat32_t v936; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v936) : "w"(v919), "w"(v927)); - svfloat32_t v937; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v937) : "w"(v919), "w"(v927)); - svfloat32_t v636; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v618), "w"(v633)); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v618), "w"(v633)); + svfloat32_t v936 = svadd_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v937 = svsub_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v618, v633); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v618, v633); svfloat32_t v662 = svcmla_f32_x(pred_full, v642, v2107, v642, 90); svfloat32_t v663 = svcmla_f32_x(pred_full, v654, v1983, v654, 90); - svfloat32_t zero715; - asm volatile("mov %0.s, #0" : "=w"(zero715)); + svfloat32_t zero715 = svdup_n_f32(0); svfloat32_t v715 = svcmla_f32_x(pred_full, zero715, v1983, v708, 90); - svfloat32_t v716; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v634), "w"(v707)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v634), "w"(v707)); - svfloat32_t v724; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v724) : "w"(v690), "w"(v1898)); - svfloat32_t v736; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v736) : "w"(v705), "w"(v2062)); - svfloat32_t v802; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v691), "w"(v2062)); - svfloat32_t v814; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v814) : "w"(v706), "w"(v2064)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v849), "w"(v864)); - svfloat32_t v868; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v868) : "w"(v849), "w"(v864)); + svfloat32_t v716 = svadd_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v724 = svmul_f32_x(svptrue_b32(), v690, v1898); + svfloat32_t v736 = svmul_f32_x(svptrue_b32(), v705, v2062); + svfloat32_t v802 = svmul_f32_x(svptrue_b32(), v691, v2062); + svfloat32_t v814 = svmul_f32_x(svptrue_b32(), v706, v2064); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v849, v864); + svfloat32_t v868 = svadd_f32_x(svptrue_b32(), v849, v864); svfloat32_t v893 = svcmla_f32_x(pred_full, v873, v2107, v873, 90); svfloat32_t v894 
= svcmla_f32_x(pred_full, v885, v1983, v885, 90); - svfloat32_t v938; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v938) : "w"(v920), "w"(v935)); - svfloat32_t v939; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v939) : "w"(v920), "w"(v935)); + svfloat32_t v938 = svsub_f32_x(svptrue_b32(), v920, v935); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v920, v935); svfloat32_t v964 = svcmla_f32_x(pred_full, v944, v2107, v944, 90); svfloat32_t v965 = svcmla_f32_x(pred_full, v956, v1983, v956, 90); - svfloat32_t v979; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v979) : "w"(v865), "w"(v936)); - svfloat32_t v980; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v980) : "w"(v865), "w"(v936)); - svfloat32_t v1225; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1225) : "w"(v866), "w"(v1980)); - svfloat32_t v1237; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1237) : "w"(v937), "w"(v1982)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v662), "w"(v663)); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v663), "w"(v662)); - svfloat32_t v718; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v635), "w"(v715)); - svfloat32_t v719; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v635), "w"(v715)); + svfloat32_t v979 = svadd_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v980 = svsub_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v1225 = svmul_f32_x(svptrue_b32(), v866, v1980); + svfloat32_t v1237 = svmul_f32_x(svptrue_b32(), v937, v1982); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v662, v663); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v663, v662); + svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v635, v715); + svfloat32_t v719 = svadd_f32_x(svptrue_b32(), v635, v715); svfloat32_t v744 = svcmla_f32_x(pred_full, v724, v1899, v690, 90); svfloat32_t v745 = svcmla_f32_x(pred_full, v736, v2063, v705, 90); svfloat32_t v783 = svcmla_f32_x(pred_full, v763, v2107, v763, 90); svfloat32_t v784 = svcmla_f32_x(pred_full, v775, v1983, v775, 90); svfloat32_t v822 = svcmla_f32_x(pred_full, 
v802, v2063, v691, 90); svfloat32_t v823 = svcmla_f32_x(pred_full, v814, v2065, v706, 90); - svfloat32_t v895; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v893), "w"(v894)); - svfloat32_t v896; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v896) : "w"(v894), "w"(v893)); - svfloat32_t v966; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v966) : "w"(v964), "w"(v965)); - svfloat32_t v967; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v965), "w"(v964)); - svfloat32_t zero987; - asm volatile("mov %0.s, #0" : "=w"(zero987)); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v893, v894); + svfloat32_t v896 = svsub_f32_x(svptrue_b32(), v894, v893); + svfloat32_t v966 = svadd_f32_x(svptrue_b32(), v964, v965); + svfloat32_t v967 = svsub_f32_x(svptrue_b32(), v965, v964); + svfloat32_t zero987 = svdup_n_f32(0); svfloat32_t v987 = svcmla_f32_x(pred_full, zero987, v1983, v980, 90); - svfloat32_t v988; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v988) : "w"(v716), "w"(v979)); - svfloat32_t v989; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v989) : "w"(v716), "w"(v979)); - svfloat32_t v1091; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1091) : "w"(v867), "w"(v1898)); - svfloat32_t v1103; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1103) : "w"(v938), "w"(v2062)); - svfloat32_t v1359; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1359) : "w"(v868), "w"(v2062)); - svfloat32_t v1371; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1371) : "w"(v939), "w"(v2064)); - svfloat32_t zero672; - asm volatile("mov %0.s, #0" : "=w"(zero672)); + svfloat32_t v988 = svadd_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v989 = svsub_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v1091 = svmul_f32_x(svptrue_b32(), v867, v1898); + svfloat32_t v1103 = svmul_f32_x(svptrue_b32(), v938, v2062); + svfloat32_t v1359 = svmul_f32_x(svptrue_b32(), v868, v2062); + svfloat32_t v1371 = svmul_f32_x(svptrue_b32(), v939, v2064); + svfloat32_t zero672 = svdup_n_f32(0); svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2107, v665, 90); - svfloat32_t v673; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v673) : "w"(v619), "w"(v664)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v619), "w"(v664)); - svfloat32_t v746; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v746) : "w"(v744), "w"(v745)); - svfloat32_t v747; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v745), "w"(v744)); - svfloat32_t v785; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v783), "w"(v784)); - svfloat32_t v786; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v784), "w"(v783)); - svfloat32_t v824; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v822), "w"(v823)); - svfloat32_t v825; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v825) : "w"(v823), "w"(v822)); - svfloat32_t zero903; - asm volatile("mov %0.s, #0" : "=w"(zero903)); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v744, v745); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v744); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v783, v784); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v784, v783); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v822, v823); + svfloat32_t v825 = svsub_f32_x(svptrue_b32(), v823, v822); + svfloat32_t zero903 = svdup_n_f32(0); svfloat32_t v903 = svcmla_f32_x(pred_full, zero903, v2107, v896, 90); - svfloat32_t v904; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v904) : "w"(v850), "w"(v895)); - svfloat32_t v905; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v905) : "w"(v850), "w"(v895)); - svfloat32_t zero974; - asm volatile("mov %0.s, #0" : "=w"(zero974)); + svfloat32_t v904 = svadd_f32_x(svptrue_b32(), v850, v895); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v850, v895); + svfloat32_t zero974 = svdup_n_f32(0); svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2107, v967, 90); - svfloat32_t v975; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v975) : "w"(v921), "w"(v966)); - svfloat32_t v976; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v976) : "w"(v921), "w"(v966)); - 
svfloat32_t v990; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v990) : "w"(v717), "w"(v987)); - svfloat32_t v991; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v991) : "w"(v717), "w"(v987)); + svfloat32_t v975 = svadd_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v976 = svsub_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v990 = svsub_f32_x(svptrue_b32(), v717, v987); + svfloat32_t v991 = svadd_f32_x(svptrue_b32(), v717, v987); svfloat32_t v1111 = svcmla_f32_x(pred_full, v1091, v1899, v867, 90); svfloat32_t v1112 = svcmla_f32_x(pred_full, v1103, v2063, v938, 90); svfloat32_t v1245 = svcmla_f32_x(pred_full, v1225, v2107, v1225, 90); @@ -17974,156 +15745,92 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1380 = svcmla_f32_x(pred_full, v1371, v2065, v939, 90); svst1_f64(pred_full, (double *)(v1827), svreinterpret_f64_f32(v988)); svst1_f64(pred_full, (double *)(v1845), svreinterpret_f64_f32(v989)); - svfloat32_t v675; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v620), "w"(v672)); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v620), "w"(v672)); - svfloat32_t zero754; - asm volatile("mov %0.s, #0" : "=w"(zero754)); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v620, v672); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v620, v672); + svfloat32_t zero754 = svdup_n_f32(0); svfloat32_t v754 = svcmla_f32_x(pred_full, zero754, v2107, v747, 90); - svfloat32_t v755; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v673), "w"(v746)); - svfloat32_t v756; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v756) : "w"(v673), "w"(v746)); - svfloat32_t zero793; - asm volatile("mov %0.s, #0" : "=w"(zero793)); + svfloat32_t v755 = svadd_f32_x(svptrue_b32(), v673, v746); + svfloat32_t v756 = svsub_f32_x(svptrue_b32(), v673, v746); + svfloat32_t zero793 = svdup_n_f32(0); svfloat32_t v793 = svcmla_f32_x(pred_full, zero793, v2107, v786, 90); - svfloat32_t v794; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v636), "w"(v785)); - 
svfloat32_t v795; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v636), "w"(v785)); - svfloat32_t zero832; - asm volatile("mov %0.s, #0" : "=w"(zero832)); + svfloat32_t v794 = svadd_f32_x(svptrue_b32(), v636, v785); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v636, v785); + svfloat32_t zero832 = svdup_n_f32(0); svfloat32_t v832 = svcmla_f32_x(pred_full, zero832, v2107, v825, 90); - svfloat32_t v906; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v906) : "w"(v851), "w"(v903)); - svfloat32_t v907; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v907) : "w"(v851), "w"(v903)); - svfloat32_t v977; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v977) : "w"(v922), "w"(v974)); - svfloat32_t v978; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v978) : "w"(v922), "w"(v974)); - svfloat32_t v1024; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1024) : "w"(v904), "w"(v1857)); - svfloat32_t v1036; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1036) : "w"(v975), "w"(v1939)); - svfloat32_t v1113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1113) : "w"(v1111), "w"(v1112)); - svfloat32_t v1114; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1114) : "w"(v1112), "w"(v1111)); - svfloat32_t v1247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1247) : "w"(v1245), "w"(v1246)); - svfloat32_t v1248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1248) : "w"(v1246), "w"(v1245)); - svfloat32_t v1292; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1292) : "w"(v905), "w"(v2021)); - svfloat32_t v1304; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1304) : "w"(v976), "w"(v2023)); - svfloat32_t v1381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1381) : "w"(v1379), "w"(v1380)); - svfloat32_t v1382; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1382) : "w"(v1380), "w"(v1379)); + svfloat32_t v906 = svsub_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v907 = svadd_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v977 = svsub_f32_x(svptrue_b32(), v922, v974); + svfloat32_t v978 = svadd_f32_x(svptrue_b32(), v922, v974); + svfloat32_t v1024 = svmul_f32_x(svptrue_b32(), v904, v1857); + svfloat32_t v1036 = 
svmul_f32_x(svptrue_b32(), v975, v1939); + svfloat32_t v1113 = svadd_f32_x(svptrue_b32(), v1111, v1112); + svfloat32_t v1114 = svsub_f32_x(svptrue_b32(), v1112, v1111); + svfloat32_t v1247 = svadd_f32_x(svptrue_b32(), v1245, v1246); + svfloat32_t v1248 = svsub_f32_x(svptrue_b32(), v1246, v1245); + svfloat32_t v1292 = svmul_f32_x(svptrue_b32(), v905, v2021); + svfloat32_t v1304 = svmul_f32_x(svptrue_b32(), v976, v2023); + svfloat32_t v1381 = svadd_f32_x(svptrue_b32(), v1379, v1380); + svfloat32_t v1382 = svsub_f32_x(svptrue_b32(), v1380, v1379); svst1_f64(pred_full, (double *)(v1836), svreinterpret_f64_f32(v990)); svst1_f64(pred_full, (double *)(v1854), svreinterpret_f64_f32(v991)); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v674), "w"(v754)); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v674), "w"(v754)); - svfloat32_t v796; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v796) : "w"(v637), "w"(v793)); - svfloat32_t v797; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v797) : "w"(v637), "w"(v793)); - svfloat32_t v833; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v833) : "w"(v675), "w"(v824)); - svfloat32_t v834; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v834) : "w"(v675), "w"(v824)); - svfloat32_t v835; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v676), "w"(v832)); - svfloat32_t v836; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v836) : "w"(v676), "w"(v832)); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v833 = svadd_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v834 = svsub_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v835 = svsub_f32_x(svptrue_b32(), v676, v832); + svfloat32_t v836 = svadd_f32_x(svptrue_b32(), v676, v832); svfloat32_t v1044 = svcmla_f32_x(pred_full, v1024, v2024, v904, 90); svfloat32_t v1045 = svcmla_f32_x(pred_full, 
v1036, v1940, v975, 90); - svfloat32_t zero1121; - asm volatile("mov %0.s, #0" : "=w"(zero1121)); + svfloat32_t zero1121 = svdup_n_f32(0); svfloat32_t v1121 = svcmla_f32_x(pred_full, zero1121, v2107, v1114, 90); - svfloat32_t v1122; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1122) : "w"(v794), "w"(v1113)); - svfloat32_t v1123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1123) : "w"(v794), "w"(v1113)); - svfloat32_t v1158; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1158) : "w"(v906), "w"(v1939)); - svfloat32_t v1170; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1170) : "w"(v977), "w"(v1941)); - svfloat32_t zero1255; - asm volatile("mov %0.s, #0" : "=w"(zero1255)); + svfloat32_t v1122 = svadd_f32_x(svptrue_b32(), v794, v1113); + svfloat32_t v1123 = svsub_f32_x(svptrue_b32(), v794, v1113); + svfloat32_t v1158 = svmul_f32_x(svptrue_b32(), v906, v1939); + svfloat32_t v1170 = svmul_f32_x(svptrue_b32(), v977, v1941); + svfloat32_t zero1255 = svdup_n_f32(0); svfloat32_t v1255 = svcmla_f32_x(pred_full, zero1255, v2107, v1248, 90); - svfloat32_t v1256; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1256) : "w"(v718), "w"(v1247)); - svfloat32_t v1257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1257) : "w"(v718), "w"(v1247)); + svfloat32_t v1256 = svadd_f32_x(svptrue_b32(), v718, v1247); + svfloat32_t v1257 = svsub_f32_x(svptrue_b32(), v718, v1247); svfloat32_t v1312 = svcmla_f32_x(pred_full, v1292, v2022, v905, 90); svfloat32_t v1313 = svcmla_f32_x(pred_full, v1304, v2024, v976, 90); - svfloat32_t zero1389; - asm volatile("mov %0.s, #0" : "=w"(zero1389)); + svfloat32_t zero1389 = svdup_n_f32(0); svfloat32_t v1389 = svcmla_f32_x(pred_full, zero1389, v2107, v1382, 90); - svfloat32_t v1426; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1426) : "w"(v907), "w"(v2103)); - svfloat32_t v1438; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1438) : "w"(v978), "w"(v2105)); - svfloat32_t v1046; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1046) : "w"(v1044), "w"(v1045)); - svfloat32_t v1047; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1047) : 
"w"(v1045), "w"(v1044)); - svfloat32_t v1124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1124) : "w"(v795), "w"(v1121)); - svfloat32_t v1125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1125) : "w"(v795), "w"(v1121)); + svfloat32_t v1426 = svmul_f32_x(svptrue_b32(), v907, v2103); + svfloat32_t v1438 = svmul_f32_x(svptrue_b32(), v978, v2105); + svfloat32_t v1046 = svadd_f32_x(svptrue_b32(), v1044, v1045); + svfloat32_t v1047 = svsub_f32_x(svptrue_b32(), v1045, v1044); + svfloat32_t v1124 = svsub_f32_x(svptrue_b32(), v795, v1121); + svfloat32_t v1125 = svadd_f32_x(svptrue_b32(), v795, v1121); svfloat32_t v1178 = svcmla_f32_x(pred_full, v1158, v1940, v906, 90); svfloat32_t v1179 = svcmla_f32_x(pred_full, v1170, v2104, v977, 90); - svfloat32_t v1258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1258) : "w"(v719), "w"(v1255)); - svfloat32_t v1259; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1259) : "w"(v719), "w"(v1255)); - svfloat32_t v1314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1314) : "w"(v1312), "w"(v1313)); - svfloat32_t v1315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1315) : "w"(v1313), "w"(v1312)); - svfloat32_t v1390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1390) : "w"(v796), "w"(v1381)); - svfloat32_t v1391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1391) : "w"(v796), "w"(v1381)); - svfloat32_t v1392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1392) : "w"(v797), "w"(v1389)); - svfloat32_t v1393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1393) : "w"(v797), "w"(v1389)); + svfloat32_t v1258 = svsub_f32_x(svptrue_b32(), v719, v1255); + svfloat32_t v1259 = svadd_f32_x(svptrue_b32(), v719, v1255); + svfloat32_t v1314 = svadd_f32_x(svptrue_b32(), v1312, v1313); + svfloat32_t v1315 = svsub_f32_x(svptrue_b32(), v1313, v1312); + svfloat32_t v1390 = svadd_f32_x(svptrue_b32(), v796, v1381); + svfloat32_t v1391 = svsub_f32_x(svptrue_b32(), v796, v1381); + svfloat32_t v1392 = svsub_f32_x(svptrue_b32(), v797, v1389); + svfloat32_t v1393 = svadd_f32_x(svptrue_b32(), v797, v1389); svfloat32_t v1446 = 
svcmla_f32_x(pred_full, v1426, v2104, v907, 90); svfloat32_t v1447 = svcmla_f32_x(pred_full, v1438, v2106, v978, 90); svst1_f64(pred_full, (double *)(v1909), svreinterpret_f64_f32(v1122)); svst1_f64(pred_full, (double *)(v1927), svreinterpret_f64_f32(v1123)); svst1_f64(pred_full, (double *)(v1991), svreinterpret_f64_f32(v1256)); svst1_f64(pred_full, (double *)(v2009), svreinterpret_f64_f32(v1257)); - svfloat32_t zero1054; - asm volatile("mov %0.s, #0" : "=w"(zero1054)); + svfloat32_t zero1054 = svdup_n_f32(0); svfloat32_t v1054 = svcmla_f32_x(pred_full, zero1054, v2107, v1047, 90); - svfloat32_t v1055; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1055) : "w"(v755), "w"(v1046)); - svfloat32_t v1056; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1056) : "w"(v755), "w"(v1046)); - svfloat32_t v1180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1180) : "w"(v1178), "w"(v1179)); - svfloat32_t v1181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1181) : "w"(v1179), "w"(v1178)); - svfloat32_t zero1322; - asm volatile("mov %0.s, #0" : "=w"(zero1322)); + svfloat32_t v1055 = svadd_f32_x(svptrue_b32(), v755, v1046); + svfloat32_t v1056 = svsub_f32_x(svptrue_b32(), v755, v1046); + svfloat32_t v1180 = svadd_f32_x(svptrue_b32(), v1178, v1179); + svfloat32_t v1181 = svsub_f32_x(svptrue_b32(), v1179, v1178); + svfloat32_t zero1322 = svdup_n_f32(0); svfloat32_t v1322 = svcmla_f32_x(pred_full, zero1322, v2107, v1315, 90); - svfloat32_t v1323; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1323) : "w"(v757), "w"(v1314)); - svfloat32_t v1324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1324) : "w"(v757), "w"(v1314)); - svfloat32_t v1448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1448) : "w"(v1446), "w"(v1447)); - svfloat32_t v1449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1449) : "w"(v1447), "w"(v1446)); + svfloat32_t v1323 = svadd_f32_x(svptrue_b32(), v757, v1314); + svfloat32_t v1324 = svsub_f32_x(svptrue_b32(), v757, v1314); + svfloat32_t v1448 = svadd_f32_x(svptrue_b32(), v1446, v1447); + svfloat32_t v1449 = 
svsub_f32_x(svptrue_b32(), v1447, v1446); svst1_f64(pred_full, (double *)(v1918), svreinterpret_f64_f32(v1124)); svst1_f64(pred_full, (double *)(v1936), svreinterpret_f64_f32(v1125)); svst1_f64(pred_full, (double *)(v2000), svreinterpret_f64_f32(v1258)); @@ -18132,40 +15839,26 @@ void armral_fft_cf32_cf32_cf32_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v2082), svreinterpret_f64_f32(v1392)); svst1_f64(pred_full, (double *)(v2091), svreinterpret_f64_f32(v1391)); svst1_f64(pred_full, (double *)(v2100), svreinterpret_f64_f32(v1393)); - svfloat32_t v1057; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1057) : "w"(v756), "w"(v1054)); - svfloat32_t v1058; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1058) : "w"(v756), "w"(v1054)); - svfloat32_t zero1188; - asm volatile("mov %0.s, #0" : "=w"(zero1188)); + svfloat32_t v1057 = svsub_f32_x(svptrue_b32(), v756, v1054); + svfloat32_t v1058 = svadd_f32_x(svptrue_b32(), v756, v1054); + svfloat32_t zero1188 = svdup_n_f32(0); svfloat32_t v1188 = svcmla_f32_x(pred_full, zero1188, v2107, v1181, 90); - svfloat32_t v1189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1189) : "w"(v833), "w"(v1180)); - svfloat32_t v1190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1190) : "w"(v833), "w"(v1180)); - svfloat32_t v1325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1325) : "w"(v758), "w"(v1322)); - svfloat32_t v1326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1326) : "w"(v758), "w"(v1322)); - svfloat32_t zero1456; - asm volatile("mov %0.s, #0" : "=w"(zero1456)); + svfloat32_t v1189 = svadd_f32_x(svptrue_b32(), v833, v1180); + svfloat32_t v1190 = svsub_f32_x(svptrue_b32(), v833, v1180); + svfloat32_t v1325 = svsub_f32_x(svptrue_b32(), v758, v1322); + svfloat32_t v1326 = svadd_f32_x(svptrue_b32(), v758, v1322); + svfloat32_t zero1456 = svdup_n_f32(0); svfloat32_t v1456 = svcmla_f32_x(pred_full, zero1456, v2107, v1449, 90); - svfloat32_t v1457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1457) : "w"(v835), "w"(v1448)); - svfloat32_t v1458; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v1458) : "w"(v835), "w"(v1448)); + svfloat32_t v1457 = svadd_f32_x(svptrue_b32(), v835, v1448); + svfloat32_t v1458 = svsub_f32_x(svptrue_b32(), v835, v1448); svst1_f64(pred_full, (double *)(v1868), svreinterpret_f64_f32(v1055)); svst1_f64(pred_full, (double *)(v1886), svreinterpret_f64_f32(v1056)); svst1_f64(pred_full, (double *)(v2032), svreinterpret_f64_f32(v1323)); svst1_f64(pred_full, (double *)(v2050), svreinterpret_f64_f32(v1324)); - svfloat32_t v1191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1191) : "w"(v834), "w"(v1188)); - svfloat32_t v1192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1192) : "w"(v834), "w"(v1188)); - svfloat32_t v1459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1459) : "w"(v836), "w"(v1456)); - svfloat32_t v1460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1460) : "w"(v836), "w"(v1456)); + svfloat32_t v1191 = svsub_f32_x(svptrue_b32(), v834, v1188); + svfloat32_t v1192 = svadd_f32_x(svptrue_b32(), v834, v1188); + svfloat32_t v1459 = svsub_f32_x(svptrue_b32(), v836, v1456); + svfloat32_t v1460 = svadd_f32_x(svptrue_b32(), v836, v1456); svst1_f64(pred_full, (double *)(v1877), svreinterpret_f64_f32(v1057)); svst1_f64(pred_full, (double *)(v1895), svreinterpret_f64_f32(v1058)); svst1_f64(pred_full, (double *)(v1950), svreinterpret_f64_f32(v1189)); diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h index f6bf005a4178e065142091c854080bad190eb17c..c67dd8b207056e7487c454308366663b591eeccc 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c index 
1b42288739ad34d7eeb7b6d5ec8eb88a592e8999..da66b3f292873af7bbca7297a14bd4f942eea34b 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_cf32_cf32_ac_n_gu.h" @@ -9,2603 +11,6 @@ #include #endif -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu2(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v25 = v5[istride]; - float32x2_t v20 = v5[0]; - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - v6[0] = v26; - v6[ostride] = v27; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu2(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - const float32x2_t *v74 = &v5[v0]; - float32x2_t *v95 = &v6[v2]; - const float32x2_t *v65 = &v5[0]; - svint64_t v75 = svindex_s64(0, v1); - float32x2_t *v86 = &v6[0]; - svfloat32_t v67 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v65), v75)); - svfloat32_t v76 = svreinterpret_f32_f64( - 
svld1_gather_s64index_f64(pred_full, (const double *)(v74), v75)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v67), "w"(v76)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v67), "w"(v76)); - svst1_f64(pred_full, (double *)(v86), svreinterpret_f64_f32(v32)); - svst1_f64(pred_full, (double *)(v95), svreinterpret_f64_f32(v33)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu3(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v20 = v5[istride]; - float v39 = -1.4999999999999998e+00F; - float v42 = 8.6602540378443871e-01F; - float v43 = -8.6602540378443871e-01F; - float32x2_t v45 = (float32x2_t){v4, v4}; - float32x2_t v32 = v5[0]; - float32x2_t v40 = (float32x2_t){v39, v39}; - float32x2_t v44 = (float32x2_t){v42, v43}; - float32x2_t v25 = v5[istride * 2]; - float32x2_t v46 = vmul_f32(v45, v44); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v33 = vadd_f32(v26, v32); - float32x2_t v41 = vmul_f32(v26, v40); - float32x2_t v47 = vrev64_f32(v27); - float32x2_t v48 = vmul_f32(v47, v46); - float32x2_t v49 = vadd_f32(v33, v41); - v6[0] = v33; - float32x2_t v50 = vadd_f32(v49, v48); - float32x2_t v51 = vsub_f32(v49, v48); - v6[ostride] = v51; - v6[ostride * 2] = v50; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu3(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = 
(float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v48 = -1.4999999999999998e+00F; - float v53 = -8.6602540378443871e-01F; - const float32x2_t *v89 = &v5[v0]; - float32x2_t *v130 = &v6[v2]; - int64_t v26 = v0 * 2; - float v56 = v4 * v53; - int64_t v77 = v2 * 2; - const float32x2_t *v108 = &v5[0]; - svint64_t v109 = svindex_s64(0, v1); - svfloat32_t v112 = svdup_n_f32(v48); - float32x2_t *v121 = &v6[0]; - svfloat32_t v91 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v89), v109)); - const float32x2_t *v98 = &v5[v26]; - svfloat32_t v110 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v108), v109)); - svfloat32_t v113 = svdup_n_f32(v56); - float32x2_t *v139 = &v6[v77]; - svfloat32_t v100 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v98), v109)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v91), "w"(v100)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v91), "w"(v100)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v110)); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); - svfloat32_t v58 = svcmla_f32_x(pred_full, zero58, v113, v33, 90); - svfloat32_t v59 = svmla_f32_x(pred_full, v41, v32, v112); - svst1_f64(pred_full, (double *)(v121), svreinterpret_f64_f32(v41)); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v59), "w"(v58)); - svfloat32_t v61; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v59), "w"(v58)); - svst1_f64(pred_full, (double *)(v130), svreinterpret_f64_f32(v61)); - svst1_f64(pred_full, (double *)(v139), svreinterpret_f64_f32(v60)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu4(const armral_cmplx_f32_t 
*restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v32 = v5[istride]; - float v54 = 1.0000000000000000e+00F; - float v55 = -1.0000000000000000e+00F; - float32x2_t v57 = (float32x2_t){v4, v4}; - float32x2_t v20 = v5[0]; - float32x2_t v56 = (float32x2_t){v54, v55}; - float32x2_t v25 = v5[istride * 2]; - float32x2_t v37 = v5[istride * 3]; - float32x2_t v58 = vmul_f32(v57, v56); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v38 = vadd_f32(v32, v37); - float32x2_t v39 = vsub_f32(v32, v37); - float32x2_t v40 = vadd_f32(v26, v38); - float32x2_t v41 = vsub_f32(v26, v38); - float32x2_t v59 = vrev64_f32(v39); - float32x2_t v60 = vmul_f32(v59, v58); - v6[0] = v40; - v6[ostride * 2] = v41; - float32x2_t v61 = vadd_f32(v27, v60); - float32x2_t v62 = vsub_f32(v27, v60); - v6[ostride] = v62; - v6[ostride * 3] = v61; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu4(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v68 = -1.0000000000000000e+00F; - const float32x2_t *v129 = &v5[v0]; - float32x2_t *v161 = &v6[v2]; - int64_t v26 = v0 * 2; - int64_t v42 = v0 * 3; - float v71 = v4 * v68; - int64_t v91 = v2 * 2; - int64_t v98 = v2 * 3; - const float32x2_t *v111 = &v5[0]; - svint64_t v139 = 
svindex_s64(0, v1); - float32x2_t *v152 = &v6[0]; - svfloat32_t v113 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v111), v139)); - const float32x2_t *v120 = &v5[v26]; - svfloat32_t v131 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v129), v139)); - const float32x2_t *v138 = &v5[v42]; - svfloat32_t v144 = svdup_n_f32(v71); - float32x2_t *v170 = &v6[v91]; - float32x2_t *v179 = &v6[v98]; - svfloat32_t v122 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v120), v139)); - svfloat32_t v140 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v138), v139)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v113), "w"(v122)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v113), "w"(v122)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v131), "w"(v140)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v131), "w"(v140)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); - svfloat32_t v73 = svcmla_f32_x(pred_full, zero73, v144, v49, 90); - svfloat32_t v74; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v74) : "w"(v33), "w"(v73)); - svfloat32_t v75; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v33), "w"(v73)); - svst1_f64(pred_full, (double *)(v152), svreinterpret_f64_f32(v50)); - svst1_f64(pred_full, (double *)(v170), svreinterpret_f64_f32(v51)); - svst1_f64(pred_full, (double *)(v161), svreinterpret_f64_f32(v75)); - svst1_f64(pred_full, (double *)(v179), svreinterpret_f64_f32(v74)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu5(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int 
howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v20 = v5[istride]; - float v54 = -1.2500000000000000e+00F; - float v58 = 5.5901699437494745e-01F; - float v61 = 1.5388417685876268e+00F; - float v62 = -1.5388417685876268e+00F; - float v68 = 5.8778525229247325e-01F; - float v69 = -5.8778525229247325e-01F; - float v75 = 3.6327126400268028e-01F; - float v76 = -3.6327126400268028e-01F; - float32x2_t v78 = (float32x2_t){v4, v4}; - float32x2_t v47 = v5[0]; - float32x2_t v55 = (float32x2_t){v54, v54}; - float32x2_t v59 = (float32x2_t){v58, v58}; - float32x2_t v63 = (float32x2_t){v61, v62}; - float32x2_t v70 = (float32x2_t){v68, v69}; - float32x2_t v77 = (float32x2_t){v75, v76}; - float32x2_t v25 = v5[istride * 4]; - float32x2_t v32 = v5[istride * 3]; - float32x2_t v37 = v5[istride * 2]; - float32x2_t v65 = vmul_f32(v78, v63); - float32x2_t v72 = vmul_f32(v78, v70); - float32x2_t v79 = vmul_f32(v78, v77); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v38 = vadd_f32(v32, v37); - float32x2_t v39 = vsub_f32(v32, v37); - float32x2_t v40 = vadd_f32(v26, v38); - float32x2_t v41 = vsub_f32(v26, v38); - float32x2_t v42 = vadd_f32(v27, v39); - float32x2_t v66 = vrev64_f32(v27); - float32x2_t v80 = vrev64_f32(v39); - float32x2_t v48 = vadd_f32(v40, v47); - float32x2_t v56 = vmul_f32(v40, v55); - float32x2_t v60 = vmul_f32(v41, v59); - float32x2_t v67 = vmul_f32(v66, v65); - float32x2_t v73 = vrev64_f32(v42); - float32x2_t v81 = vmul_f32(v80, v79); - float32x2_t v74 = vmul_f32(v73, v72); - float32x2_t v82 = vadd_f32(v48, v56); - v6[0] = v48; - float32x2_t v83 = vadd_f32(v82, v60); - float32x2_t v84 = vsub_f32(v82, v60); - float32x2_t v85 = vsub_f32(v67, v74); - float32x2_t v86 = vadd_f32(v74, v81); - float32x2_t v87 = vadd_f32(v83, v85); - float32x2_t v88 = vsub_f32(v83, v85); - float32x2_t 
v89 = vadd_f32(v84, v86); - float32x2_t v90 = vsub_f32(v84, v86); - v6[ostride] = v88; - v6[ostride * 2] = v90; - v6[ostride * 3] = v89; - v6[ostride * 4] = v87; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu5(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v67 = -1.2500000000000000e+00F; - float v72 = 5.5901699437494745e-01F; - float v77 = -1.5388417685876268e+00F; - float v84 = -5.8778525229247325e-01F; - float v91 = -3.6327126400268028e-01F; - const float32x2_t *v147 = &v5[v0]; - float32x2_t *v209 = &v6[v2]; - int64_t v26 = v0 * 4; - int64_t v35 = v0 * 3; - int64_t v42 = v0 * 2; - float v80 = v4 * v77; - float v87 = v4 * v84; - float v94 = v4 * v91; - int64_t v121 = v2 * 2; - int64_t v128 = v2 * 3; - int64_t v135 = v2 * 4; - const float32x2_t *v184 = &v5[0]; - svint64_t v185 = svindex_s64(0, v1); - svfloat32_t v188 = svdup_n_f32(v67); - svfloat32_t v189 = svdup_n_f32(v72); - float32x2_t *v200 = &v6[0]; - svfloat32_t v149 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v147), v185)); - const float32x2_t *v156 = &v5[v26]; - const float32x2_t *v165 = &v5[v35]; - const float32x2_t *v174 = &v5[v42]; - svfloat32_t v186 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v184), v185)); - svfloat32_t v190 = svdup_n_f32(v80); - svfloat32_t v191 = svdup_n_f32(v87); - svfloat32_t v192 = svdup_n_f32(v94); - float32x2_t *v218 = &v6[v121]; - float32x2_t *v227 = &v6[v128]; - float32x2_t 
*v236 = &v6[v135]; - svfloat32_t v158 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v156), v185)); - svfloat32_t v167 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v165), v185)); - svfloat32_t v176 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v174), v185)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v149), "w"(v158)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v149), "w"(v158)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v167), "w"(v176)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v167), "w"(v176)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v33), "w"(v49)); - svfloat32_t zero82; - asm volatile("mov %0.s, #0" : "=w"(zero82)); - svfloat32_t v82 = svcmla_f32_x(pred_full, zero82, v190, v33, 90); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v50), "w"(v186)); - svfloat32_t zero89; - asm volatile("mov %0.s, #0" : "=w"(zero89)); - svfloat32_t v89 = svcmla_f32_x(pred_full, zero89, v191, v52, 90); - svfloat32_t v97 = svmla_f32_x(pred_full, v60, v50, v188); - svfloat32_t v100; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v82), "w"(v89)); - svfloat32_t v101 = svcmla_f32_x(pred_full, v89, v192, v49, 90); - svst1_f64(pred_full, (double *)(v200), svreinterpret_f64_f32(v60)); - svfloat32_t v98 = svmla_f32_x(pred_full, v97, v51, v189); - svfloat32_t v99 = svmls_f32_x(pred_full, v97, v51, v189); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v98), "w"(v100)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v98), "w"(v100)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v99), "w"(v101)); - svfloat32_t 
v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v99), "w"(v101)); - svst1_f64(pred_full, (double *)(v209), svreinterpret_f64_f32(v103)); - svst1_f64(pred_full, (double *)(v218), svreinterpret_f64_f32(v105)); - svst1_f64(pred_full, (double *)(v227), svreinterpret_f64_f32(v104)); - svst1_f64(pred_full, (double *)(v236), svreinterpret_f64_f32(v102)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu6(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v49 = v5[istride]; - float v81 = -1.4999999999999998e+00F; - float v84 = 8.6602540378443871e-01F; - float v85 = -8.6602540378443871e-01F; - float32x2_t v87 = (float32x2_t){v4, v4}; - float32x2_t v20 = v5[0]; - float32x2_t v82 = (float32x2_t){v81, v81}; - float32x2_t v86 = (float32x2_t){v84, v85}; - float32x2_t v25 = v5[istride * 3]; - float32x2_t v32 = v5[istride * 2]; - float32x2_t v37 = v5[istride * 5]; - float32x2_t v44 = v5[istride * 4]; - float32x2_t v88 = vmul_f32(v87, v86); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v38 = vadd_f32(v32, v37); - float32x2_t v39 = vsub_f32(v32, v37); - float32x2_t v50 = vadd_f32(v44, v49); - float32x2_t v51 = vsub_f32(v44, v49); - float32x2_t v52 = vadd_f32(v38, v50); - float32x2_t v53 = vsub_f32(v38, v50); - float32x2_t v73 = vadd_f32(v39, v51); - float32x2_t v74 = vsub_f32(v39, v51); - float32x2_t v54 = vadd_f32(v52, v26); - float32x2_t v62 = vmul_f32(v52, v82); - float32x2_t v68 = vrev64_f32(v53); - float32x2_t v75 = vadd_f32(v73, v27); - float32x2_t v83 = vmul_f32(v73, v82); - float32x2_t v89 = vrev64_f32(v74); - float32x2_t v69 = vmul_f32(v68, v88); - float32x2_t v70 = vadd_f32(v54, v62); - float32x2_t v90 = vmul_f32(v89, 
v88); - float32x2_t v91 = vadd_f32(v75, v83); - v6[0] = v54; - v6[ostride * 3] = v75; - float32x2_t v71 = vadd_f32(v70, v69); - float32x2_t v72 = vsub_f32(v70, v69); - float32x2_t v92 = vadd_f32(v91, v90); - float32x2_t v93 = vsub_f32(v91, v90); - v6[ostride * 4] = v72; - v6[ostride] = v93; - v6[ostride * 2] = v71; - v6[ostride * 5] = v92; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu6(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v98 = -1.4999999999999998e+00F; - float v103 = -8.6602540378443871e-01F; - const float32x2_t *v206 = &v5[v0]; - float32x2_t *v249 = &v6[v2]; - int64_t v26 = v0 * 3; - int64_t v35 = v0 * 2; - int64_t v42 = v0 * 5; - int64_t v51 = v0 * 4; - float v106 = v4 * v103; - int64_t v120 = v2 * 3; - int64_t v127 = v2 * 4; - int64_t v141 = v2 * 2; - int64_t v148 = v2 * 5; - const float32x2_t *v161 = &v5[0]; - svint64_t v207 = svindex_s64(0, v1); - svfloat32_t v213 = svdup_n_f32(v98); - float32x2_t *v222 = &v6[0]; - svfloat32_t v163 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v161), v207)); - const float32x2_t *v170 = &v5[v26]; - const float32x2_t *v179 = &v5[v35]; - const float32x2_t *v188 = &v5[v42]; - const float32x2_t *v197 = &v5[v51]; - svfloat32_t v208 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v206), v207)); - svfloat32_t v214 = svdup_n_f32(v106); - float32x2_t *v231 = &v6[v120]; - float32x2_t *v240 = &v6[v127]; - float32x2_t *v258 = 
&v6[v141]; - float32x2_t *v267 = &v6[v148]; - svfloat32_t v172 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v170), v207)); - svfloat32_t v181 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v179), v207)); - svfloat32_t v190 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v188), v207)); - svfloat32_t v199 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v197), v207)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v163), "w"(v172)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v163), "w"(v172)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v181), "w"(v190)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v181), "w"(v190)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v199), "w"(v208)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v199), "w"(v208)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v48), "w"(v64)); - svfloat32_t v67; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v48), "w"(v64)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v49), "w"(v65)); - svfloat32_t v90; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v49), "w"(v65)); - svfloat32_t v68; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v68) : "w"(v66), "w"(v32)); - svfloat32_t zero85; - asm volatile("mov %0.s, #0" : "=w"(zero85)); - svfloat32_t v85 = svcmla_f32_x(pred_full, zero85, v214, v67, 90); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v89), "w"(v33)); - svfloat32_t zero108; - asm volatile("mov %0.s, #0" : "=w"(zero108)); - svfloat32_t v108 = svcmla_f32_x(pred_full, zero108, v214, v90, 90); - svfloat32_t v86 = svmla_f32_x(pred_full, v68, v66, v213); - svfloat32_t v109 = svmla_f32_x(pred_full, v91, v89, v213); - svst1_f64(pred_full, (double *)(v222), svreinterpret_f64_f32(v68)); - 
svst1_f64(pred_full, (double *)(v231), svreinterpret_f64_f32(v91)); - svfloat32_t v87; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v86), "w"(v85)); - svfloat32_t v88; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v86), "w"(v85)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v109), "w"(v108)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v109), "w"(v108)); - svst1_f64(pred_full, (double *)(v240), svreinterpret_f64_f32(v88)); - svst1_f64(pred_full, (double *)(v249), svreinterpret_f64_f32(v111)); - svst1_f64(pred_full, (double *)(v258), svreinterpret_f64_f32(v87)); - svst1_f64(pred_full, (double *)(v267), svreinterpret_f64_f32(v110)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu7(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v20 = v5[istride]; - float v73 = -1.1666666666666665e+00F; - float v77 = 7.9015646852540022e-01F; - float v81 = 5.5854267289647742e-02F; - float v85 = 7.3430220123575241e-01F; - float v88 = 4.4095855184409838e-01F; - float v89 = -4.4095855184409838e-01F; - float v95 = 3.4087293062393137e-01F; - float v96 = -3.4087293062393137e-01F; - float v102 = -5.3396936033772524e-01F; - float v103 = 5.3396936033772524e-01F; - float v109 = 8.7484229096165667e-01F; - float v110 = -8.7484229096165667e-01F; - float32x2_t v112 = (float32x2_t){v4, v4}; - float32x2_t v58 = v5[0]; - float32x2_t v74 = (float32x2_t){v73, v73}; - float32x2_t v78 = (float32x2_t){v77, v77}; - float32x2_t v82 = (float32x2_t){v81, v81}; - float32x2_t v86 = (float32x2_t){v85, v85}; - float32x2_t v90 = (float32x2_t){v88, v89}; - float32x2_t v97 = (float32x2_t){v95, v96}; - float32x2_t v104 = (float32x2_t){v102, v103}; - 
float32x2_t v111 = (float32x2_t){v109, v110}; - float32x2_t v25 = v5[istride * 6]; - float32x2_t v32 = v5[istride * 4]; - float32x2_t v37 = v5[istride * 3]; - float32x2_t v44 = v5[istride * 2]; - float32x2_t v49 = v5[istride * 5]; - float32x2_t v92 = vmul_f32(v112, v90); - float32x2_t v99 = vmul_f32(v112, v97); - float32x2_t v106 = vmul_f32(v112, v104); - float32x2_t v113 = vmul_f32(v112, v111); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v38 = vadd_f32(v32, v37); - float32x2_t v39 = vsub_f32(v32, v37); - float32x2_t v50 = vadd_f32(v44, v49); - float32x2_t v51 = vsub_f32(v44, v49); - float32x2_t v52 = vadd_f32(v26, v38); - float32x2_t v60 = vsub_f32(v26, v38); - float32x2_t v61 = vsub_f32(v38, v50); - float32x2_t v62 = vsub_f32(v50, v26); - float32x2_t v63 = vadd_f32(v27, v39); - float32x2_t v65 = vsub_f32(v27, v39); - float32x2_t v66 = vsub_f32(v39, v51); - float32x2_t v67 = vsub_f32(v51, v27); - float32x2_t v53 = vadd_f32(v52, v50); - float32x2_t v64 = vadd_f32(v63, v51); - float32x2_t v79 = vmul_f32(v60, v78); - float32x2_t v83 = vmul_f32(v61, v82); - float32x2_t v87 = vmul_f32(v62, v86); - float32x2_t v100 = vrev64_f32(v65); - float32x2_t v107 = vrev64_f32(v66); - float32x2_t v114 = vrev64_f32(v67); - float32x2_t v59 = vadd_f32(v53, v58); - float32x2_t v75 = vmul_f32(v53, v74); - float32x2_t v93 = vrev64_f32(v64); - float32x2_t v101 = vmul_f32(v100, v99); - float32x2_t v108 = vmul_f32(v107, v106); - float32x2_t v115 = vmul_f32(v114, v113); - float32x2_t v94 = vmul_f32(v93, v92); - float32x2_t v116 = vadd_f32(v59, v75); - v6[0] = v59; - float32x2_t v117 = vadd_f32(v116, v79); - float32x2_t v119 = vsub_f32(v116, v79); - float32x2_t v121 = vsub_f32(v116, v83); - float32x2_t v123 = vadd_f32(v94, v101); - float32x2_t v125 = vsub_f32(v94, v101); - float32x2_t v127 = vsub_f32(v94, v108); - float32x2_t v118 = vadd_f32(v117, v83); - float32x2_t v120 = vsub_f32(v119, v87); - float32x2_t v122 = vadd_f32(v121, v87); - 
float32x2_t v124 = vadd_f32(v123, v108); - float32x2_t v126 = vsub_f32(v125, v115); - float32x2_t v128 = vadd_f32(v127, v115); - float32x2_t v129 = vadd_f32(v118, v124); - float32x2_t v130 = vsub_f32(v118, v124); - float32x2_t v131 = vadd_f32(v120, v126); - float32x2_t v132 = vsub_f32(v120, v126); - float32x2_t v133 = vadd_f32(v122, v128); - float32x2_t v134 = vsub_f32(v122, v128); - v6[ostride] = v130; - v6[ostride * 2] = v132; - v6[ostride * 3] = v133; - v6[ostride * 4] = v134; - v6[ostride * 5] = v131; - v6[ostride * 6] = v129; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu7(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v90 = -1.1666666666666665e+00F; - float v95 = 7.9015646852540022e-01F; - float v100 = 5.5854267289647742e-02F; - float v105 = 7.3430220123575241e-01F; - float v110 = -4.4095855184409838e-01F; - float v117 = -3.4087293062393137e-01F; - float v124 = 5.3396936033772524e-01F; - float v131 = -8.7484229096165667e-01F; - const float32x2_t *v211 = &v5[v0]; - float32x2_t *v294 = &v6[v2]; - int64_t v26 = v0 * 6; - int64_t v35 = v0 * 4; - int64_t v42 = v0 * 3; - int64_t v51 = v0 * 2; - int64_t v58 = v0 * 5; - float v113 = v4 * v110; - float v120 = v4 * v117; - float v127 = v4 * v124; - float v134 = v4 * v131; - int64_t v171 = v2 * 2; - int64_t v178 = v2 * 3; - int64_t v185 = v2 * 4; - int64_t v192 = v2 * 5; - int64_t v199 = v2 * 6; - const float32x2_t *v266 = &v5[0]; - svint64_t v267 = svindex_s64(0, v1); - svfloat32_t v270 = 
svdup_n_f32(v90); - svfloat32_t v271 = svdup_n_f32(v95); - svfloat32_t v272 = svdup_n_f32(v100); - svfloat32_t v273 = svdup_n_f32(v105); - float32x2_t *v285 = &v6[0]; - svfloat32_t v213 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v211), v267)); - const float32x2_t *v220 = &v5[v26]; - const float32x2_t *v229 = &v5[v35]; - const float32x2_t *v238 = &v5[v42]; - const float32x2_t *v247 = &v5[v51]; - const float32x2_t *v256 = &v5[v58]; - svfloat32_t v268 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v266), v267)); - svfloat32_t v274 = svdup_n_f32(v113); - svfloat32_t v275 = svdup_n_f32(v120); - svfloat32_t v276 = svdup_n_f32(v127); - svfloat32_t v277 = svdup_n_f32(v134); - float32x2_t *v303 = &v6[v171]; - float32x2_t *v312 = &v6[v178]; - float32x2_t *v321 = &v6[v185]; - float32x2_t *v330 = &v6[v192]; - float32x2_t *v339 = &v6[v199]; - svfloat32_t v222 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v220), v267)); - svfloat32_t v231 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v229), v267)); - svfloat32_t v240 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v238), v267)); - svfloat32_t v249 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v247), v267)); - svfloat32_t v258 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v256), v267)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v213), "w"(v222)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v213), "w"(v222)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v231), "w"(v240)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v231), "w"(v240)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v249), "w"(v258)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v249), 
"w"(v258)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v32), "w"(v48)); - svfloat32_t v76; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v76) : "w"(v32), "w"(v48)); - svfloat32_t v77; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v48), "w"(v64)); - svfloat32_t v78; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v78) : "w"(v64), "w"(v32)); - svfloat32_t v79; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v79) : "w"(v33), "w"(v49)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v33), "w"(v49)); - svfloat32_t v82; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v49), "w"(v65)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v65), "w"(v33)); - svfloat32_t v67; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v66), "w"(v64)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v79), "w"(v65)); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); - svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v275, v81, 90); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); - svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v276, v82, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); - svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v277, v83, 90); - svfloat32_t v75; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v67), "w"(v268)); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); - svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v274, v80, 90); - svfloat32_t v137 = svmla_f32_x(pred_full, v75, v67, v270); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v115), "w"(v122)); - svfloat32_t v146; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v115), "w"(v122)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v115), "w"(v129)); - svst1_f64(pred_full, (double *)(v285), svreinterpret_f64_f32(v75)); - svfloat32_t v138 = svmla_f32_x(pred_full, v137, v76, v271); - svfloat32_t v140 = 
svmls_f32_x(pred_full, v137, v76, v271); - svfloat32_t v142 = svmls_f32_x(pred_full, v137, v77, v272); - svfloat32_t v145; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v144), "w"(v129)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v146), "w"(v136)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v148), "w"(v136)); - svfloat32_t v139 = svmla_f32_x(pred_full, v138, v77, v272); - svfloat32_t v141 = svmls_f32_x(pred_full, v140, v78, v273); - svfloat32_t v143 = svmla_f32_x(pred_full, v142, v78, v273); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v139), "w"(v145)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v139), "w"(v145)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v141), "w"(v147)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v141), "w"(v147)); - svfloat32_t v154; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v143), "w"(v149)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v143), "w"(v149)); - svst1_f64(pred_full, (double *)(v294), svreinterpret_f64_f32(v151)); - svst1_f64(pred_full, (double *)(v303), svreinterpret_f64_f32(v153)); - svst1_f64(pred_full, (double *)(v312), svreinterpret_f64_f32(v154)); - svst1_f64(pred_full, (double *)(v321), svreinterpret_f64_f32(v155)); - svst1_f64(pred_full, (double *)(v330), svreinterpret_f64_f32(v152)); - svst1_f64(pred_full, (double *)(v339), svreinterpret_f64_f32(v150)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu8(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v44 = v5[istride]; - float v95 = 1.0000000000000000e+00F; - float v96 = 
-1.0000000000000000e+00F; - float v103 = -7.0710678118654746e-01F; - float32x2_t v105 = (float32x2_t){v4, v4}; - float v110 = 7.0710678118654757e-01F; - float32x2_t v20 = v5[0]; - float32x2_t v97 = (float32x2_t){v95, v96}; - float32x2_t v104 = (float32x2_t){v110, v103}; - float32x2_t v111 = (float32x2_t){v110, v110}; - float32x2_t v25 = v5[istride * 4]; - float32x2_t v32 = v5[istride * 2]; - float32x2_t v37 = v5[istride * 6]; - float32x2_t v49 = v5[istride * 5]; - float32x2_t v56 = v5[istride * 3]; - float32x2_t v61 = v5[istride * 7]; - float32x2_t v99 = vmul_f32(v105, v97); - float32x2_t v106 = vmul_f32(v105, v104); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v38 = vadd_f32(v32, v37); - float32x2_t v39 = vsub_f32(v32, v37); - float32x2_t v50 = vadd_f32(v44, v49); - float32x2_t v51 = vsub_f32(v44, v49); - float32x2_t v62 = vadd_f32(v56, v61); - float32x2_t v63 = vsub_f32(v56, v61); - float32x2_t v64 = vadd_f32(v26, v38); - float32x2_t v65 = vsub_f32(v26, v38); - float32x2_t v66 = vadd_f32(v50, v62); - float32x2_t v67 = vsub_f32(v50, v62); - float32x2_t v70 = vadd_f32(v51, v63); - float32x2_t v71 = vsub_f32(v51, v63); - float32x2_t v100 = vrev64_f32(v39); - float32x2_t v68 = vadd_f32(v64, v66); - float32x2_t v69 = vsub_f32(v64, v66); - float32x2_t v89 = vrev64_f32(v67); - float32x2_t v101 = vmul_f32(v100, v99); - float32x2_t v107 = vrev64_f32(v70); - float32x2_t v112 = vmul_f32(v71, v111); - float32x2_t v90 = vmul_f32(v89, v99); - float32x2_t v108 = vmul_f32(v107, v106); - float32x2_t v115 = vadd_f32(v27, v112); - float32x2_t v116 = vsub_f32(v27, v112); - v6[0] = v68; - v6[ostride * 4] = v69; - float32x2_t v113 = vadd_f32(v65, v90); - float32x2_t v114 = vsub_f32(v65, v90); - float32x2_t v117 = vadd_f32(v101, v108); - float32x2_t v118 = vsub_f32(v101, v108); - float32x2_t v119 = vadd_f32(v115, v117); - float32x2_t v120 = vsub_f32(v115, v117); - float32x2_t v121 = vadd_f32(v116, v118); - float32x2_t v122 = 
vsub_f32(v116, v118); - v6[ostride * 2] = v114; - v6[ostride * 6] = v113; - v6[ostride] = v120; - v6[ostride * 3] = v121; - v6[ostride * 5] = v122; - v6[ostride * 7] = v119; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu8(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v118 = -1.0000000000000000e+00F; - float v125 = -7.0710678118654746e-01F; - float v132 = 7.0710678118654757e-01F; - const float32x2_t *v245 = &v5[v0]; - float32x2_t *v299 = &v6[v2]; - int64_t v26 = v0 * 4; - int64_t v35 = v0 * 2; - int64_t v42 = v0 * 6; - int64_t v58 = v0 * 5; - int64_t v67 = v0 * 3; - int64_t v74 = v0 * 7; - float v121 = v4 * v118; - float v128 = v4 * v125; - int64_t v161 = v2 * 2; - int64_t v168 = v2 * 3; - int64_t v175 = v2 * 4; - int64_t v182 = v2 * 5; - int64_t v189 = v2 * 6; - int64_t v196 = v2 * 7; - const float32x2_t *v209 = &v5[0]; - svint64_t v273 = svindex_s64(0, v1); - svfloat32_t v282 = svdup_n_f32(v132); - float32x2_t *v290 = &v6[0]; - svfloat32_t v211 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v209), v273)); - const float32x2_t *v218 = &v5[v26]; - const float32x2_t *v227 = &v5[v35]; - const float32x2_t *v236 = &v5[v42]; - svfloat32_t v247 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v245), v273)); - const float32x2_t *v254 = &v5[v58]; - const float32x2_t *v263 = &v5[v67]; - const float32x2_t *v272 = &v5[v74]; - svfloat32_t v280 = svdup_n_f32(v121); - svfloat32_t v281 
= svdup_n_f32(v128); - float32x2_t *v308 = &v6[v161]; - float32x2_t *v317 = &v6[v168]; - float32x2_t *v326 = &v6[v175]; - float32x2_t *v335 = &v6[v182]; - float32x2_t *v344 = &v6[v189]; - float32x2_t *v353 = &v6[v196]; - svfloat32_t v220 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v218), v273)); - svfloat32_t v229 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v227), v273)); - svfloat32_t v238 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v236), v273)); - svfloat32_t v256 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v254), v273)); - svfloat32_t v265 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v263), v273)); - svfloat32_t v274 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v272), v273)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v211), "w"(v220)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v211), "w"(v220)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v229), "w"(v238)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v229), "w"(v238)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v247), "w"(v256)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v247), "w"(v256)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v265), "w"(v274)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v265), "w"(v274)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v32), "w"(v48)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v32), "w"(v48)); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v64), "w"(v80)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v64), "w"(v80)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : 
"w"(v65), "w"(v81)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v65), "w"(v81)); - svfloat32_t zero123; - asm volatile("mov %0.s, #0" : "=w"(zero123)); - svfloat32_t v123 = svcmla_f32_x(pred_full, zero123, v280, v49, 90); - svfloat32_t v86; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v86) : "w"(v82), "w"(v84)); - svfloat32_t v87; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v82), "w"(v84)); - svfloat32_t zero111; - asm volatile("mov %0.s, #0" : "=w"(zero111)); - svfloat32_t v111 = svcmla_f32_x(pred_full, zero111, v280, v85, 90); - svfloat32_t zero130; - asm volatile("mov %0.s, #0" : "=w"(zero130)); - svfloat32_t v130 = svcmla_f32_x(pred_full, zero130, v281, v88, 90); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v83), "w"(v111)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v83), "w"(v111)); - svfloat32_t v138 = svmla_f32_x(pred_full, v33, v89, v282); - svfloat32_t v139 = svmls_f32_x(pred_full, v33, v89, v282); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v123), "w"(v130)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v123), "w"(v130)); - svst1_f64(pred_full, (double *)(v290), svreinterpret_f64_f32(v86)); - svst1_f64(pred_full, (double *)(v326), svreinterpret_f64_f32(v87)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v138), "w"(v140)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v138), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v139), "w"(v141)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v139), "w"(v141)); - svst1_f64(pred_full, (double *)(v308), svreinterpret_f64_f32(v137)); - svst1_f64(pred_full, (double *)(v344), svreinterpret_f64_f32(v136)); - svst1_f64(pred_full, (double *)(v299), svreinterpret_f64_f32(v143)); - svst1_f64(pred_full, (double *)(v317), svreinterpret_f64_f32(v144)); - svst1_f64(pred_full, (double *)(v335), 
svreinterpret_f64_f32(v145)); - svst1_f64(pred_full, (double *)(v353), svreinterpret_f64_f32(v142)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu9(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v20 = v5[istride]; - float v86 = -5.0000000000000000e-01F; - float v97 = -1.4999999999999998e+00F; - float v100 = 8.6602540378443871e-01F; - float v101 = -8.6602540378443871e-01F; - float v108 = 7.6604444311897801e-01F; - float v112 = 9.3969262078590832e-01F; - float v116 = -1.7364817766693039e-01F; - float v119 = 6.4278760968653925e-01F; - float v120 = -6.4278760968653925e-01F; - float v126 = -3.4202014332566888e-01F; - float v127 = 3.4202014332566888e-01F; - float v133 = 9.8480775301220802e-01F; - float v134 = -9.8480775301220802e-01F; - float32x2_t v136 = (float32x2_t){v4, v4}; - float32x2_t v71 = v5[0]; - float32x2_t v87 = (float32x2_t){v86, v86}; - float32x2_t v98 = (float32x2_t){v97, v97}; - float32x2_t v102 = (float32x2_t){v100, v101}; - float32x2_t v109 = (float32x2_t){v108, v108}; - float32x2_t v113 = (float32x2_t){v112, v112}; - float32x2_t v117 = (float32x2_t){v116, v116}; - float32x2_t v121 = (float32x2_t){v119, v120}; - float32x2_t v128 = (float32x2_t){v126, v127}; - float32x2_t v135 = (float32x2_t){v133, v134}; - float32x2_t v25 = v5[istride * 8]; - float32x2_t v32 = v5[istride * 7]; - float32x2_t v37 = v5[istride * 2]; - float32x2_t v44 = v5[istride * 3]; - float32x2_t v49 = v5[istride * 6]; - float32x2_t v56 = v5[istride * 4]; - float32x2_t v61 = v5[istride * 5]; - float32x2_t v104 = vmul_f32(v136, v102); - float32x2_t v123 = vmul_f32(v136, v121); - float32x2_t v130 = vmul_f32(v136, v128); - float32x2_t v137 = vmul_f32(v136, v135); - float32x2_t v26 
= vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v38 = vadd_f32(v32, v37); - float32x2_t v39 = vsub_f32(v32, v37); - float32x2_t v50 = vadd_f32(v44, v49); - float32x2_t v51 = vsub_f32(v44, v49); - float32x2_t v62 = vadd_f32(v56, v61); - float32x2_t v63 = vsub_f32(v56, v61); - float32x2_t v64 = vadd_f32(v26, v38); - float32x2_t v73 = vadd_f32(v27, v39); - float32x2_t v75 = vsub_f32(v26, v38); - float32x2_t v76 = vsub_f32(v38, v62); - float32x2_t v77 = vsub_f32(v62, v26); - float32x2_t v78 = vsub_f32(v27, v39); - float32x2_t v79 = vsub_f32(v39, v63); - float32x2_t v80 = vsub_f32(v63, v27); - float32x2_t v99 = vmul_f32(v50, v98); - float32x2_t v105 = vrev64_f32(v51); - float32x2_t v65 = vadd_f32(v64, v62); - float32x2_t v74 = vadd_f32(v73, v63); - float32x2_t v106 = vmul_f32(v105, v104); - float32x2_t v110 = vmul_f32(v75, v109); - float32x2_t v114 = vmul_f32(v76, v113); - float32x2_t v118 = vmul_f32(v77, v117); - float32x2_t v124 = vrev64_f32(v78); - float32x2_t v131 = vrev64_f32(v79); - float32x2_t v138 = vrev64_f32(v80); - float32x2_t v66 = vadd_f32(v65, v50); - float32x2_t v88 = vmul_f32(v65, v87); - float32x2_t v94 = vrev64_f32(v74); - float32x2_t v125 = vmul_f32(v124, v123); - float32x2_t v132 = vmul_f32(v131, v130); - float32x2_t v139 = vmul_f32(v138, v137); - float32x2_t v72 = vadd_f32(v66, v71); - float32x2_t v95 = vmul_f32(v94, v104); - float32x2_t v140 = vadd_f32(v88, v88); - float32x2_t v153 = vadd_f32(v106, v125); - float32x2_t v155 = vsub_f32(v106, v132); - float32x2_t v157 = vsub_f32(v106, v125); - float32x2_t v141 = vadd_f32(v140, v88); - float32x2_t v145 = vadd_f32(v72, v99); - float32x2_t v154 = vadd_f32(v153, v132); - float32x2_t v156 = vadd_f32(v155, v139); - float32x2_t v158 = vsub_f32(v157, v139); - v6[0] = v72; - float32x2_t v142 = vadd_f32(v72, v141); - float32x2_t v146 = vadd_f32(v145, v140); - float32x2_t v143 = vadd_f32(v142, v95); - float32x2_t v144 = vsub_f32(v142, v95); - float32x2_t v147 = vadd_f32(v146, v110); 
- float32x2_t v149 = vsub_f32(v146, v114); - float32x2_t v151 = vsub_f32(v146, v110); - float32x2_t v148 = vadd_f32(v147, v114); - float32x2_t v150 = vadd_f32(v149, v118); - float32x2_t v152 = vsub_f32(v151, v118); - v6[ostride * 3] = v144; - v6[ostride * 6] = v143; - float32x2_t v159 = vadd_f32(v148, v154); - float32x2_t v160 = vsub_f32(v148, v154); - float32x2_t v161 = vadd_f32(v150, v156); - float32x2_t v162 = vsub_f32(v150, v156); - float32x2_t v163 = vadd_f32(v152, v158); - float32x2_t v164 = vsub_f32(v152, v158); - v6[ostride] = v160; - v6[ostride * 2] = v161; - v6[ostride * 4] = v164; - v6[ostride * 5] = v163; - v6[ostride * 7] = v162; - v6[ostride * 8] = v159; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu9(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v107 = -5.0000000000000000e-01F; - float v119 = -1.4999999999999998e+00F; - float v124 = -8.6602540378443871e-01F; - float v131 = 7.6604444311897801e-01F; - float v136 = 9.3969262078590832e-01F; - float v141 = -1.7364817766693039e-01F; - float v146 = -6.4278760968653925e-01F; - float v153 = 3.4202014332566888e-01F; - float v160 = -9.8480775301220802e-01F; - const float32x2_t *v260 = &v5[v0]; - float32x2_t *v363 = &v6[v2]; - int64_t v26 = v0 * 8; - int64_t v35 = v0 * 7; - int64_t v42 = v0 * 2; - int64_t v51 = v0 * 3; - int64_t v58 = v0 * 6; - int64_t v67 = v0 * 4; - int64_t v74 = v0 * 5; - float v127 = v4 * v124; - float v149 = v4 * v146; - float v156 = v4 * v153; - float v163 = 
v4 * v160; - int64_t v206 = v2 * 2; - int64_t v213 = v2 * 3; - int64_t v220 = v2 * 4; - int64_t v227 = v2 * 5; - int64_t v234 = v2 * 6; - int64_t v241 = v2 * 7; - int64_t v248 = v2 * 8; - const float32x2_t *v333 = &v5[0]; - svint64_t v334 = svindex_s64(0, v1); - svfloat32_t v337 = svdup_n_f32(v107); - svfloat32_t v339 = svdup_n_f32(v119); - svfloat32_t v341 = svdup_n_f32(v131); - svfloat32_t v342 = svdup_n_f32(v136); - svfloat32_t v343 = svdup_n_f32(v141); - float32x2_t *v354 = &v6[0]; - svfloat32_t v262 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v260), v334)); - const float32x2_t *v269 = &v5[v26]; - const float32x2_t *v278 = &v5[v35]; - const float32x2_t *v287 = &v5[v42]; - const float32x2_t *v296 = &v5[v51]; - const float32x2_t *v305 = &v5[v58]; - const float32x2_t *v314 = &v5[v67]; - const float32x2_t *v323 = &v5[v74]; - svfloat32_t v335 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v333), v334)); - svfloat32_t v340 = svdup_n_f32(v127); - svfloat32_t v344 = svdup_n_f32(v149); - svfloat32_t v345 = svdup_n_f32(v156); - svfloat32_t v346 = svdup_n_f32(v163); - float32x2_t *v372 = &v6[v206]; - float32x2_t *v381 = &v6[v213]; - float32x2_t *v390 = &v6[v220]; - float32x2_t *v399 = &v6[v227]; - float32x2_t *v408 = &v6[v234]; - float32x2_t *v417 = &v6[v241]; - float32x2_t *v426 = &v6[v248]; - svfloat32_t v271 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v269), v334)); - svfloat32_t v280 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v278), v334)); - svfloat32_t v289 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v287), v334)); - svfloat32_t v298 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v296), v334)); - svfloat32_t v307 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v305), v334)); - svfloat32_t v316 = svreinterpret_f32_f64( - 
svld1_gather_s64index_f64(pred_full, (const double *)(v314), v334)); - svfloat32_t v325 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v323), v334)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v262), "w"(v271)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v262), "w"(v271)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v280), "w"(v289)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v280), "w"(v289)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v298), "w"(v307)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v298), "w"(v307)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v316), "w"(v325)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v316), "w"(v325)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v32), "w"(v48)); - svfloat32_t v93; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v33), "w"(v49)); - svfloat32_t v95; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v95) : "w"(v32), "w"(v48)); - svfloat32_t v96; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v48), "w"(v80)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v80), "w"(v32)); - svfloat32_t v98; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v33), "w"(v49)); - svfloat32_t v99; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v49), "w"(v81)); - svfloat32_t v100; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v81), "w"(v33)); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); - svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v340, v65, 90); - svfloat32_t v83; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v82), "w"(v80)); - svfloat32_t v94; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v93), "w"(v81)); - svfloat32_t zero151; - asm volatile("mov %0.s, #0" : "=w"(zero151)); - svfloat32_t v151 = svcmla_f32_x(pred_full, zero151, v344, v98, 90); - 
svfloat32_t zero158; - asm volatile("mov %0.s, #0" : "=w"(zero158)); - svfloat32_t v158 = svcmla_f32_x(pred_full, zero158, v345, v99, 90); - svfloat32_t zero165; - asm volatile("mov %0.s, #0" : "=w"(zero165)); - svfloat32_t v165 = svcmla_f32_x(pred_full, zero165, v346, v100, 90); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v83), "w"(v64)); - svfloat32_t v110; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v83), "w"(v337)); - svfloat32_t zero117; - asm volatile("mov %0.s, #0" : "=w"(zero117)); - svfloat32_t v117 = svcmla_f32_x(pred_full, zero117, v340, v94, 90); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v129), "w"(v151)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v129), "w"(v158)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v129), "w"(v151)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v84), "w"(v335)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v110), "w"(v110)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v179), "w"(v158)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v181), "w"(v165)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v183), "w"(v165)); - svfloat32_t v167 = svmla_f32_x(pred_full, v166, v83, v337); - svfloat32_t v171 = svmla_f32_x(pred_full, v92, v64, v339); - svst1_f64(pred_full, (double *)(v354), svreinterpret_f64_f32(v92)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v92), "w"(v167)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v171), "w"(v166)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v168), "w"(v117)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v168), "w"(v117)); - svfloat32_t v173 = svmla_f32_x(pred_full, v172, v95, v341); - svfloat32_t v175 = svmls_f32_x(pred_full, v172, v96, v342); - svfloat32_t v177 = 
svmls_f32_x(pred_full, v172, v95, v341); - svfloat32_t v174 = svmla_f32_x(pred_full, v173, v96, v342); - svfloat32_t v176 = svmla_f32_x(pred_full, v175, v97, v343); - svfloat32_t v178 = svmls_f32_x(pred_full, v177, v97, v343); - svst1_f64(pred_full, (double *)(v381), svreinterpret_f64_f32(v170)); - svst1_f64(pred_full, (double *)(v408), svreinterpret_f64_f32(v169)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v174), "w"(v180)); - svfloat32_t v186; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v174), "w"(v180)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v176), "w"(v182)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v176), "w"(v182)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v178), "w"(v184)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v178), "w"(v184)); - svst1_f64(pred_full, (double *)(v363), svreinterpret_f64_f32(v186)); - svst1_f64(pred_full, (double *)(v372), svreinterpret_f64_f32(v187)); - svst1_f64(pred_full, (double *)(v390), svreinterpret_f64_f32(v190)); - svst1_f64(pred_full, (double *)(v399), svreinterpret_f64_f32(v189)); - svst1_f64(pred_full, (double *)(v417), svreinterpret_f64_f32(v188)); - svst1_f64(pred_full, (double *)(v426), svreinterpret_f64_f32(v185)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu10(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v61 = v5[istride]; - float v139 = -1.2500000000000000e+00F; - float v143 = 5.5901699437494745e-01F; - float v146 = 1.5388417685876268e+00F; - float v147 = -1.5388417685876268e+00F; - float v153 = 5.8778525229247325e-01F; - float v154 = -5.8778525229247325e-01F; - 
float v160 = 3.6327126400268028e-01F; - float v161 = -3.6327126400268028e-01F; - float32x2_t v163 = (float32x2_t){v4, v4}; - float32x2_t v20 = v5[0]; - float32x2_t v140 = (float32x2_t){v139, v139}; - float32x2_t v144 = (float32x2_t){v143, v143}; - float32x2_t v148 = (float32x2_t){v146, v147}; - float32x2_t v155 = (float32x2_t){v153, v154}; - float32x2_t v162 = (float32x2_t){v160, v161}; - float32x2_t v25 = v5[istride * 5]; - float32x2_t v32 = v5[istride * 2]; - float32x2_t v37 = v5[istride * 7]; - float32x2_t v44 = v5[istride * 4]; - float32x2_t v49 = v5[istride * 9]; - float32x2_t v56 = v5[istride * 6]; - float32x2_t v68 = v5[istride * 8]; - float32x2_t v73 = v5[istride * 3]; - float32x2_t v150 = vmul_f32(v163, v148); - float32x2_t v157 = vmul_f32(v163, v155); - float32x2_t v164 = vmul_f32(v163, v162); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v38 = vadd_f32(v32, v37); - float32x2_t v39 = vsub_f32(v32, v37); - float32x2_t v50 = vadd_f32(v44, v49); - float32x2_t v51 = vsub_f32(v44, v49); - float32x2_t v62 = vadd_f32(v56, v61); - float32x2_t v63 = vsub_f32(v56, v61); - float32x2_t v74 = vadd_f32(v68, v73); - float32x2_t v75 = vsub_f32(v68, v73); - float32x2_t v76 = vadd_f32(v38, v74); - float32x2_t v77 = vsub_f32(v38, v74); - float32x2_t v78 = vadd_f32(v62, v50); - float32x2_t v79 = vsub_f32(v62, v50); - float32x2_t v126 = vadd_f32(v39, v75); - float32x2_t v127 = vsub_f32(v39, v75); - float32x2_t v128 = vadd_f32(v63, v51); - float32x2_t v129 = vsub_f32(v63, v51); - float32x2_t v80 = vadd_f32(v76, v78); - float32x2_t v81 = vsub_f32(v76, v78); - float32x2_t v82 = vadd_f32(v77, v79); - float32x2_t v101 = vrev64_f32(v77); - float32x2_t v115 = vrev64_f32(v79); - float32x2_t v130 = vadd_f32(v126, v128); - float32x2_t v131 = vsub_f32(v126, v128); - float32x2_t v132 = vadd_f32(v127, v129); - float32x2_t v151 = vrev64_f32(v127); - float32x2_t v165 = vrev64_f32(v129); - float32x2_t v83 = vadd_f32(v80, v26); - float32x2_t v91 
= vmul_f32(v80, v140); - float32x2_t v95 = vmul_f32(v81, v144); - float32x2_t v102 = vmul_f32(v101, v150); - float32x2_t v108 = vrev64_f32(v82); - float32x2_t v116 = vmul_f32(v115, v164); - float32x2_t v133 = vadd_f32(v130, v27); - float32x2_t v141 = vmul_f32(v130, v140); - float32x2_t v145 = vmul_f32(v131, v144); - float32x2_t v152 = vmul_f32(v151, v150); - float32x2_t v158 = vrev64_f32(v132); - float32x2_t v166 = vmul_f32(v165, v164); - float32x2_t v109 = vmul_f32(v108, v157); - float32x2_t v117 = vadd_f32(v83, v91); - float32x2_t v159 = vmul_f32(v158, v157); - float32x2_t v167 = vadd_f32(v133, v141); - v6[0] = v83; - v6[ostride * 5] = v133; - float32x2_t v118 = vadd_f32(v117, v95); - float32x2_t v119 = vsub_f32(v117, v95); - float32x2_t v120 = vsub_f32(v102, v109); - float32x2_t v121 = vadd_f32(v109, v116); - float32x2_t v168 = vadd_f32(v167, v145); - float32x2_t v169 = vsub_f32(v167, v145); - float32x2_t v170 = vsub_f32(v152, v159); - float32x2_t v171 = vadd_f32(v159, v166); - float32x2_t v122 = vadd_f32(v118, v120); - float32x2_t v123 = vsub_f32(v118, v120); - float32x2_t v124 = vadd_f32(v119, v121); - float32x2_t v125 = vsub_f32(v119, v121); - float32x2_t v172 = vadd_f32(v168, v170); - float32x2_t v173 = vsub_f32(v168, v170); - float32x2_t v174 = vadd_f32(v169, v171); - float32x2_t v175 = vsub_f32(v169, v171); - v6[ostride * 6] = v123; - v6[ostride] = v173; - v6[ostride * 2] = v125; - v6[ostride * 7] = v175; - v6[ostride * 8] = v124; - v6[ostride * 3] = v174; - v6[ostride * 4] = v122; - v6[ostride * 9] = v172; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu10(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - 
int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v165 = -1.2500000000000000e+00F; - float v170 = 5.5901699437494745e-01F; - float v175 = -1.5388417685876268e+00F; - float v182 = -5.8778525229247325e-01F; - float v189 = -3.6327126400268028e-01F; - const float32x2_t *v344 = &v5[v0]; - float32x2_t *v411 = &v6[v2]; - int64_t v26 = v0 * 5; - int64_t v35 = v0 * 2; - int64_t v42 = v0 * 7; - int64_t v51 = v0 * 4; - int64_t v58 = v0 * 9; - int64_t v67 = v0 * 6; - int64_t v83 = v0 * 8; - int64_t v90 = v0 * 3; - float v178 = v4 * v175; - float v185 = v4 * v182; - float v192 = v4 * v189; - int64_t v212 = v2 * 5; - int64_t v219 = v2 * 6; - int64_t v233 = v2 * 2; - int64_t v240 = v2 * 7; - int64_t v247 = v2 * 8; - int64_t v254 = v2 * 3; - int64_t v261 = v2 * 4; - int64_t v268 = v2 * 9; - const float32x2_t *v281 = &v5[0]; - svint64_t v363 = svindex_s64(0, v1); - svfloat32_t v372 = svdup_n_f32(v165); - svfloat32_t v373 = svdup_n_f32(v170); - float32x2_t *v384 = &v6[0]; - svfloat32_t v283 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v281), v363)); - const float32x2_t *v290 = &v5[v26]; - const float32x2_t *v299 = &v5[v35]; - const float32x2_t *v308 = &v5[v42]; - const float32x2_t *v317 = &v5[v51]; - const float32x2_t *v326 = &v5[v58]; - const float32x2_t *v335 = &v5[v67]; - svfloat32_t v346 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v344), v363)); - const float32x2_t *v353 = &v5[v83]; - const float32x2_t *v362 = &v5[v90]; - svfloat32_t v374 = svdup_n_f32(v178); - svfloat32_t v375 = svdup_n_f32(v185); - svfloat32_t v376 = svdup_n_f32(v192); - float32x2_t *v393 = &v6[v212]; - float32x2_t *v402 = &v6[v219]; - float32x2_t *v420 = &v6[v233]; - float32x2_t *v429 = &v6[v240]; - float32x2_t *v438 = &v6[v247]; - float32x2_t *v447 = &v6[v254]; - float32x2_t *v456 = &v6[v261]; - float32x2_t 
*v465 = &v6[v268]; - svfloat32_t v292 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v290), v363)); - svfloat32_t v301 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v299), v363)); - svfloat32_t v310 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v308), v363)); - svfloat32_t v319 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v317), v363)); - svfloat32_t v328 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v326), v363)); - svfloat32_t v337 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v335), v363)); - svfloat32_t v355 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v353), v363)); - svfloat32_t v364 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v362), v363)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v283), "w"(v292)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v283), "w"(v292)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v301), "w"(v310)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v301), "w"(v310)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v319), "w"(v328)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v319), "w"(v328)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v337), "w"(v346)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v337), "w"(v346)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v355), "w"(v364)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v355), "w"(v364)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v48), "w"(v96)); - svfloat32_t v99; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v48), "w"(v96)); - svfloat32_t v100; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v100) : "w"(v80), "w"(v64)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v80), "w"(v64)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v49), "w"(v97)); - svfloat32_t v152; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v49), "w"(v97)); - svfloat32_t v153; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v81), "w"(v65)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v81), "w"(v65)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v98), "w"(v100)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v98), "w"(v100)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v99), "w"(v101)); - svfloat32_t zero127; - asm volatile("mov %0.s, #0" : "=w"(zero127)); - svfloat32_t v127 = svcmla_f32_x(pred_full, zero127, v374, v99, 90); - svfloat32_t v155; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v151), "w"(v153)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v151), "w"(v153)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v152), "w"(v154)); - svfloat32_t zero180; - asm volatile("mov %0.s, #0" : "=w"(zero180)); - svfloat32_t v180 = svcmla_f32_x(pred_full, zero180, v374, v152, 90); - svfloat32_t v105; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v102), "w"(v32)); - svfloat32_t zero134; - asm volatile("mov %0.s, #0" : "=w"(zero134)); - svfloat32_t v134 = svcmla_f32_x(pred_full, zero134, v375, v104, 90); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v155), "w"(v33)); - svfloat32_t zero187; - asm volatile("mov %0.s, #0" : "=w"(zero187)); - svfloat32_t v187 = svcmla_f32_x(pred_full, zero187, v375, v157, 90); - svfloat32_t v142 = svmla_f32_x(pred_full, v105, v102, v372); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v127), "w"(v134)); - svfloat32_t v146 = svcmla_f32_x(pred_full, v134, v376, v101, 90); - svfloat32_t v195 = 
svmla_f32_x(pred_full, v158, v155, v372); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v180), "w"(v187)); - svfloat32_t v199 = svcmla_f32_x(pred_full, v187, v376, v154, 90); - svst1_f64(pred_full, (double *)(v384), svreinterpret_f64_f32(v105)); - svst1_f64(pred_full, (double *)(v393), svreinterpret_f64_f32(v158)); - svfloat32_t v143 = svmla_f32_x(pred_full, v142, v103, v373); - svfloat32_t v144 = svmls_f32_x(pred_full, v142, v103, v373); - svfloat32_t v196 = svmla_f32_x(pred_full, v195, v156, v373); - svfloat32_t v197 = svmls_f32_x(pred_full, v195, v156, v373); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v143), "w"(v145)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v143), "w"(v145)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v144), "w"(v146)); - svfloat32_t v150; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v196), "w"(v198)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v196), "w"(v198)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v197), "w"(v199)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v197), "w"(v199)); - svst1_f64(pred_full, (double *)(v402), svreinterpret_f64_f32(v148)); - svst1_f64(pred_full, (double *)(v411), svreinterpret_f64_f32(v201)); - svst1_f64(pred_full, (double *)(v420), svreinterpret_f64_f32(v150)); - svst1_f64(pred_full, (double *)(v429), svreinterpret_f64_f32(v203)); - svst1_f64(pred_full, (double *)(v438), svreinterpret_f64_f32(v149)); - svst1_f64(pred_full, (double *)(v447), svreinterpret_f64_f32(v202)); - svst1_f64(pred_full, (double *)(v456), svreinterpret_f64_f32(v147)); - svst1_f64(pred_full, (double *)(v465), svreinterpret_f64_f32(v200)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu11(const armral_cmplx_f32_t 
*restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v20 = v5[istride]; - float v113 = 1.1000000000000001e+00F; - float v116 = 3.3166247903554003e-01F; - float v117 = -3.3166247903554003e-01F; - float v124 = 5.1541501300188641e-01F; - float v128 = 9.4125353283118118e-01F; - float v132 = 1.4143537075597825e+00F; - float v136 = 8.5949297361449750e-01F; - float v140 = 4.2314838273285138e-02F; - float v144 = 3.8639279888589606e-01F; - float v148 = 5.1254589567200015e-01F; - float v152 = 1.0702757469471715e+00F; - float v156 = 5.5486073394528512e-01F; - float v159 = 1.2412944743900585e+00F; - float v160 = -1.2412944743900585e+00F; - float v166 = 2.0897833842005756e-01F; - float v167 = -2.0897833842005756e-01F; - float v173 = 3.7415717312460811e-01F; - float v174 = -3.7415717312460811e-01F; - float v180 = 4.9929922194110327e-02F; - float v181 = -4.9929922194110327e-02F; - float v187 = 6.5815896284539266e-01F; - float v188 = -6.5815896284539266e-01F; - float v194 = 6.3306543373877577e-01F; - float v195 = -6.3306543373877577e-01F; - float v201 = 1.0822460581641109e+00F; - float v202 = -1.0822460581641109e+00F; - float v208 = 8.1720737907134022e-01F; - float v209 = -8.1720737907134022e-01F; - float v215 = 4.2408709531871824e-01F; - float v216 = -4.2408709531871824e-01F; - float32x2_t v218 = (float32x2_t){v4, v4}; - float32x2_t v86 = v5[0]; - float32x2_t v114 = (float32x2_t){v113, v113}; - float32x2_t v118 = (float32x2_t){v116, v117}; - float32x2_t v125 = (float32x2_t){v124, v124}; - float32x2_t v129 = (float32x2_t){v128, v128}; - float32x2_t v133 = (float32x2_t){v132, v132}; - float32x2_t v137 = (float32x2_t){v136, v136}; - float32x2_t v141 = (float32x2_t){v140, v140}; - float32x2_t v145 = (float32x2_t){v144, v144}; - float32x2_t v149 = (float32x2_t){v148, 
v148}; - float32x2_t v153 = (float32x2_t){v152, v152}; - float32x2_t v157 = (float32x2_t){v156, v156}; - float32x2_t v161 = (float32x2_t){v159, v160}; - float32x2_t v168 = (float32x2_t){v166, v167}; - float32x2_t v175 = (float32x2_t){v173, v174}; - float32x2_t v182 = (float32x2_t){v180, v181}; - float32x2_t v189 = (float32x2_t){v187, v188}; - float32x2_t v196 = (float32x2_t){v194, v195}; - float32x2_t v203 = (float32x2_t){v201, v202}; - float32x2_t v210 = (float32x2_t){v208, v209}; - float32x2_t v217 = (float32x2_t){v215, v216}; - float32x2_t v25 = v5[istride * 10]; - float32x2_t v31 = v5[istride * 2]; - float32x2_t v36 = v5[istride * 9]; - float32x2_t v42 = v5[istride * 3]; - float32x2_t v47 = v5[istride * 8]; - float32x2_t v53 = v5[istride * 4]; - float32x2_t v58 = v5[istride * 7]; - float32x2_t v64 = v5[istride * 5]; - float32x2_t v69 = v5[istride * 6]; - float32x2_t v120 = vmul_f32(v218, v118); - float32x2_t v163 = vmul_f32(v218, v161); - float32x2_t v170 = vmul_f32(v218, v168); - float32x2_t v177 = vmul_f32(v218, v175); - float32x2_t v184 = vmul_f32(v218, v182); - float32x2_t v191 = vmul_f32(v218, v189); - float32x2_t v198 = vmul_f32(v218, v196); - float32x2_t v205 = vmul_f32(v218, v203); - float32x2_t v212 = vmul_f32(v218, v210); - float32x2_t v219 = vmul_f32(v218, v217); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v37 = vadd_f32(v31, v36); - float32x2_t v48 = vadd_f32(v42, v47); - float32x2_t v59 = vadd_f32(v53, v58); - float32x2_t v70 = vadd_f32(v64, v69); - float32x2_t v71 = vsub_f32(v20, v25); - float32x2_t v72 = vsub_f32(v31, v36); - float32x2_t v73 = vsub_f32(v42, v47); - float32x2_t v74 = vsub_f32(v53, v58); - float32x2_t v75 = vsub_f32(v64, v69); - float32x2_t v76 = vadd_f32(v26, v37); - float32x2_t v77 = vadd_f32(v48, v70); - float32x2_t v79 = vsub_f32(v72, v73); - float32x2_t v80 = vadd_f32(v71, v75); - float32x2_t v90 = vsub_f32(v37, v59); - float32x2_t v91 = vsub_f32(v26, v59); - float32x2_t v92 = vsub_f32(v37, v26); - float32x2_t v93 = 
vsub_f32(v70, v59); - float32x2_t v94 = vsub_f32(v48, v59); - float32x2_t v95 = vsub_f32(v70, v48); - float32x2_t v96 = vsub_f32(v37, v70); - float32x2_t v97 = vsub_f32(v26, v48); - float32x2_t v99 = vadd_f32(v72, v74); - float32x2_t v100 = vsub_f32(v71, v74); - float32x2_t v101 = vadd_f32(v71, v72); - float32x2_t v102 = vsub_f32(v74, v75); - float32x2_t v103 = vsub_f32(v73, v74); - float32x2_t v104 = vsub_f32(v73, v75); - float32x2_t v105 = vadd_f32(v72, v75); - float32x2_t v106 = vsub_f32(v71, v73); - float32x2_t v78 = vadd_f32(v59, v76); - float32x2_t v88 = vsub_f32(v79, v80); - float32x2_t v98 = vsub_f32(v77, v76); - float32x2_t v107 = vadd_f32(v79, v80); - float32x2_t v126 = vmul_f32(v90, v125); - float32x2_t v130 = vmul_f32(v91, v129); - float32x2_t v134 = vmul_f32(v92, v133); - float32x2_t v138 = vmul_f32(v93, v137); - float32x2_t v142 = vmul_f32(v94, v141); - float32x2_t v146 = vmul_f32(v95, v145); - float32x2_t v150 = vmul_f32(v96, v149); - float32x2_t v154 = vmul_f32(v97, v153); - float32x2_t v164 = vrev64_f32(v99); - float32x2_t v171 = vrev64_f32(v100); - float32x2_t v178 = vrev64_f32(v101); - float32x2_t v185 = vrev64_f32(v102); - float32x2_t v192 = vrev64_f32(v103); - float32x2_t v199 = vrev64_f32(v104); - float32x2_t v206 = vrev64_f32(v105); - float32x2_t v213 = vrev64_f32(v106); - float32x2_t v81 = vadd_f32(v78, v77); - float32x2_t v89 = vsub_f32(v88, v74); - float32x2_t v158 = vmul_f32(v98, v157); - float32x2_t v165 = vmul_f32(v164, v163); - float32x2_t v172 = vmul_f32(v171, v170); - float32x2_t v179 = vmul_f32(v178, v177); - float32x2_t v186 = vmul_f32(v185, v184); - float32x2_t v193 = vmul_f32(v192, v191); - float32x2_t v200 = vmul_f32(v199, v198); - float32x2_t v207 = vmul_f32(v206, v205); - float32x2_t v214 = vmul_f32(v213, v212); - float32x2_t v220 = vrev64_f32(v107); - float32x2_t v223 = vadd_f32(v126, v130); - float32x2_t v224 = vadd_f32(v130, v134); - float32x2_t v225 = vsub_f32(v126, v134); - float32x2_t v226 = vadd_f32(v138, v142); - 
float32x2_t v227 = vadd_f32(v142, v146); - float32x2_t v228 = vsub_f32(v138, v146); - float32x2_t v87 = vadd_f32(v86, v81); - float32x2_t v115 = vmul_f32(v81, v114); - float32x2_t v121 = vrev64_f32(v89); - float32x2_t v221 = vmul_f32(v220, v219); - float32x2_t v229 = vadd_f32(v154, v158); - float32x2_t v230 = vadd_f32(v150, v158); - float32x2_t v231 = vadd_f32(v172, v179); - float32x2_t v232 = vsub_f32(v165, v179); - float32x2_t v233 = vadd_f32(v193, v200); - float32x2_t v234 = vsub_f32(v186, v200); - float32x2_t v122 = vmul_f32(v121, v120); - float32x2_t v222 = vsub_f32(v87, v115); - float32x2_t v235 = vadd_f32(v214, v221); - float32x2_t v236 = vsub_f32(v207, v221); - float32x2_t v237 = vadd_f32(v227, v229); - float32x2_t v255 = vadd_f32(v231, v232); - v6[0] = v87; - float32x2_t v238 = vadd_f32(v237, v222); - float32x2_t v239 = vsub_f32(v222, v224); - float32x2_t v241 = vadd_f32(v222, v228); - float32x2_t v243 = vsub_f32(v222, v225); - float32x2_t v245 = vadd_f32(v222, v223); - float32x2_t v247 = vadd_f32(v122, v233); - float32x2_t v249 = vsub_f32(v235, v231); - float32x2_t v251 = vadd_f32(v122, v236); - float32x2_t v253 = vsub_f32(v236, v232); - float32x2_t v256 = vadd_f32(v255, v233); - float32x2_t v240 = vsub_f32(v239, v229); - float32x2_t v242 = vadd_f32(v241, v230); - float32x2_t v244 = vsub_f32(v243, v230); - float32x2_t v246 = vsub_f32(v245, v226); - float32x2_t v248 = vadd_f32(v247, v235); - float32x2_t v250 = vsub_f32(v249, v122); - float32x2_t v252 = vadd_f32(v251, v234); - float32x2_t v254 = vsub_f32(v253, v122); - float32x2_t v257 = vadd_f32(v256, v234); - float32x2_t v258 = vsub_f32(v257, v122); - float32x2_t v260 = vadd_f32(v238, v248); - float32x2_t v261 = vadd_f32(v240, v250); - float32x2_t v262 = vsub_f32(v242, v252); - float32x2_t v263 = vadd_f32(v244, v254); - float32x2_t v264 = vsub_f32(v244, v254); - float32x2_t v265 = vadd_f32(v242, v252); - float32x2_t v266 = vsub_f32(v240, v250); - float32x2_t v267 = vsub_f32(v238, v248); - float32x2_t v259 
= vadd_f32(v246, v258); - float32x2_t v268 = vsub_f32(v246, v258); - v6[ostride * 9] = v260; - v6[ostride * 8] = v261; - v6[ostride * 7] = v262; - v6[ostride * 6] = v263; - v6[ostride * 5] = v264; - v6[ostride * 4] = v265; - v6[ostride * 3] = v266; - v6[ostride * 2] = v267; - v6[ostride * 10] = v259; - v6[ostride] = v268; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu11(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v138 = 1.1000000000000001e+00F; - float v143 = -3.3166247903554003e-01F; - float v150 = 5.1541501300188641e-01F; - float v155 = 9.4125353283118118e-01F; - float v160 = 1.4143537075597825e+00F; - float v165 = 8.5949297361449750e-01F; - float v170 = 4.2314838273285138e-02F; - float v175 = 3.8639279888589606e-01F; - float v180 = 5.1254589567200015e-01F; - float v185 = 1.0702757469471715e+00F; - float v190 = 5.5486073394528512e-01F; - float v195 = -1.2412944743900585e+00F; - float v202 = -2.0897833842005756e-01F; - float v209 = -3.7415717312460811e-01F; - float v216 = -4.9929922194110327e-02F; - float v223 = -6.5815896284539266e-01F; - float v230 = -6.3306543373877577e-01F; - float v237 = -1.0822460581641109e+00F; - float v244 = -8.1720737907134022e-01F; - float v251 = -4.2408709531871824e-01F; - const float32x2_t *v387 = &v5[v0]; - float32x2_t *v599 = &v6[v2]; - int64_t v26 = v0 * 10; - int64_t v34 = v0 * 2; - int64_t v41 = v0 * 9; - int64_t v49 = v0 * 3; - int64_t v56 = v0 * 8; - int64_t v64 = v0 * 4; - int64_t v71 = v0 * 7; 
- int64_t v79 = v0 * 5; - int64_t v86 = v0 * 6; - float v146 = v4 * v143; - float v198 = v4 * v195; - float v205 = v4 * v202; - float v212 = v4 * v209; - float v219 = v4 * v216; - float v226 = v4 * v223; - float v233 = v4 * v230; - float v240 = v4 * v237; - float v247 = v4 * v244; - float v254 = v4 * v251; - int64_t v312 = v2 * 10; - int64_t v319 = v2 * 9; - int64_t v326 = v2 * 8; - int64_t v333 = v2 * 7; - int64_t v340 = v2 * 6; - int64_t v347 = v2 * 5; - int64_t v354 = v2 * 4; - int64_t v361 = v2 * 3; - int64_t v368 = v2 * 2; - const float32x2_t *v478 = &v5[0]; - svint64_t v479 = svindex_s64(0, v1); - svfloat32_t v482 = svdup_n_f32(v138); - svfloat32_t v484 = svdup_n_f32(v150); - svfloat32_t v485 = svdup_n_f32(v155); - svfloat32_t v486 = svdup_n_f32(v160); - svfloat32_t v487 = svdup_n_f32(v165); - svfloat32_t v488 = svdup_n_f32(v170); - svfloat32_t v489 = svdup_n_f32(v175); - svfloat32_t v490 = svdup_n_f32(v180); - svfloat32_t v491 = svdup_n_f32(v185); - svfloat32_t v492 = svdup_n_f32(v190); - float32x2_t *v509 = &v6[0]; - svfloat32_t v389 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v387), v479)); - const float32x2_t *v396 = &v5[v26]; - const float32x2_t *v405 = &v5[v34]; - const float32x2_t *v414 = &v5[v41]; - const float32x2_t *v423 = &v5[v49]; - const float32x2_t *v432 = &v5[v56]; - const float32x2_t *v441 = &v5[v64]; - const float32x2_t *v450 = &v5[v71]; - const float32x2_t *v459 = &v5[v79]; - const float32x2_t *v468 = &v5[v86]; - svfloat32_t v480 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v478), v479)); - svfloat32_t v483 = svdup_n_f32(v146); - svfloat32_t v493 = svdup_n_f32(v198); - svfloat32_t v494 = svdup_n_f32(v205); - svfloat32_t v495 = svdup_n_f32(v212); - svfloat32_t v496 = svdup_n_f32(v219); - svfloat32_t v497 = svdup_n_f32(v226); - svfloat32_t v498 = svdup_n_f32(v233); - svfloat32_t v499 = svdup_n_f32(v240); - svfloat32_t v500 = svdup_n_f32(v247); - svfloat32_t v501 = 
svdup_n_f32(v254); - float32x2_t *v518 = &v6[v312]; - float32x2_t *v527 = &v6[v319]; - float32x2_t *v536 = &v6[v326]; - float32x2_t *v545 = &v6[v333]; - float32x2_t *v554 = &v6[v340]; - float32x2_t *v563 = &v6[v347]; - float32x2_t *v572 = &v6[v354]; - float32x2_t *v581 = &v6[v361]; - float32x2_t *v590 = &v6[v368]; - svfloat32_t v398 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v396), v479)); - svfloat32_t v407 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v405), v479)); - svfloat32_t v416 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v414), v479)); - svfloat32_t v425 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v423), v479)); - svfloat32_t v434 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v432), v479)); - svfloat32_t v443 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v441), v479)); - svfloat32_t v452 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v450), v479)); - svfloat32_t v461 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v459), v479)); - svfloat32_t v470 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v468), v479)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v389), "w"(v398)); - svfloat32_t v47; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v407), "w"(v416)); - svfloat32_t v62; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v425), "w"(v434)); - svfloat32_t v77; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v443), "w"(v452)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v461), "w"(v470)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v389), "w"(v398)); - svfloat32_t v94; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v407), "w"(v416)); - svfloat32_t v95; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v95) : "w"(v425), "w"(v434)); - svfloat32_t v96; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v443), "w"(v452)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v461), "w"(v470)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v32), "w"(v47)); - svfloat32_t v99; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v62), "w"(v92)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v94), "w"(v95)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v93), "w"(v97)); - svfloat32_t v114; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v47), "w"(v77)); - svfloat32_t v115; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v32), "w"(v77)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v47), "w"(v32)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v92), "w"(v77)); - svfloat32_t v118; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v62), "w"(v77)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v92), "w"(v62)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v47), "w"(v92)); - svfloat32_t v121; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v32), "w"(v62)); - svfloat32_t v123; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v94), "w"(v96)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v93), "w"(v96)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v93), "w"(v94)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v96), "w"(v97)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v95), "w"(v96)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v95), "w"(v97)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v94), "w"(v97)); - svfloat32_t v130; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v93), "w"(v95)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : 
"w"(v77), "w"(v98)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v101), "w"(v102)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v99), "w"(v98)); - svfloat32_t v131; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v101), "w"(v102)); - svfloat32_t v158; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v115), "w"(v485)); - svfloat32_t v163; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v116), "w"(v486)); - svfloat32_t v173; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v118), "w"(v488)); - svfloat32_t v178; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v119), "w"(v489)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); - svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v493, v123, 90); - svfloat32_t zero214; - asm volatile("mov %0.s, #0" : "=w"(zero214)); - svfloat32_t v214 = svcmla_f32_x(pred_full, zero214, v495, v125, 90); - svfloat32_t zero221; - asm volatile("mov %0.s, #0" : "=w"(zero221)); - svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v496, v126, 90); - svfloat32_t zero235; - asm volatile("mov %0.s, #0" : "=w"(zero235)); - svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v498, v128, 90); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); - svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v499, v129, 90); - svfloat32_t v103; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v100), "w"(v99)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v112), "w"(v96)); - svfloat32_t v193; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v122), "w"(v492)); - svfloat32_t zero256; - asm volatile("mov %0.s, #0" : "=w"(zero256)); - svfloat32_t v256 = svcmla_f32_x(pred_full, zero256, v501, v131, 90); - svfloat32_t v258 = svmla_f32_x(pred_full, v158, v114, v484); - svfloat32_t v259 = svmla_f32_x(pred_full, v163, v115, v485); - svfloat32_t v260 = svnmls_f32_x(pred_full, v163, v114, v484); - svfloat32_t v261 = svmla_f32_x(pred_full, v173, v117, 
v487); - svfloat32_t v262 = svmla_f32_x(pred_full, v178, v118, v488); - svfloat32_t v263 = svnmls_f32_x(pred_full, v178, v117, v487); - svfloat32_t v266 = svcmla_f32_x(pred_full, v214, v494, v124, 90); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v200), "w"(v214)); - svfloat32_t v268 = svcmla_f32_x(pred_full, v235, v497, v127, 90); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v221), "w"(v235)); - svfloat32_t v111; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v480), "w"(v103)); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); - svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v483, v113, 90); - svfloat32_t v264 = svmla_f32_x(pred_full, v193, v121, v491); - svfloat32_t v265 = svmla_f32_x(pred_full, v193, v120, v490); - svfloat32_t v270 = svcmla_f32_x(pred_full, v256, v500, v130, 90); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v242), "w"(v256)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v266), "w"(v267)); - svfloat32_t v257 = svmls_f32_x(pred_full, v111, v103, v482); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v262), "w"(v264)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v148), "w"(v268)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v270), "w"(v266)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v148), "w"(v271)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v271), "w"(v267)); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v290), "w"(v268)); - svst1_f64(pred_full, (double *)(v509), svreinterpret_f64_f32(v111)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v272), "w"(v257)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v257), "w"(v259)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v257), "w"(v263)); - svfloat32_t v278; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v257), "w"(v260)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v257), "w"(v258)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v270)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v148)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v286), "w"(v269)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), "w"(v148)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v269)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v274), "w"(v264)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v276), "w"(v265)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v278), "w"(v265)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v261)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v292), "w"(v148)); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v273), "w"(v283)); - svfloat32_t v302; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v273), "w"(v283)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v281), "w"(v293)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v275), "w"(v285)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v277), "w"(v287)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v279), "w"(v289)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v279), "w"(v289)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v277), "w"(v287)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v275), "w"(v285)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v281), "w"(v293)); - svst1_f64(pred_full, (double *)(v527), svreinterpret_f64_f32(v295)); - 
svst1_f64(pred_full, (double *)(v590), svreinterpret_f64_f32(v302)); - svst1_f64(pred_full, (double *)(v518), svreinterpret_f64_f32(v294)); - svst1_f64(pred_full, (double *)(v536), svreinterpret_f64_f32(v296)); - svst1_f64(pred_full, (double *)(v545), svreinterpret_f64_f32(v297)); - svst1_f64(pred_full, (double *)(v554), svreinterpret_f64_f32(v298)); - svst1_f64(pred_full, (double *)(v563), svreinterpret_f64_f32(v299)); - svst1_f64(pred_full, (double *)(v572), svreinterpret_f64_f32(v300)); - svst1_f64(pred_full, (double *)(v581), svreinterpret_f64_f32(v301)); - svst1_f64(pred_full, (double *)(v599), svreinterpret_f64_f32(v303)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu12(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v74 = v5[istride]; - float v106 = 1.0000000000000000e+00F; - float v107 = -1.0000000000000000e+00F; - float v133 = -1.4999999999999998e+00F; - float v134 = 1.4999999999999998e+00F; - float v162 = 8.6602540378443871e-01F; - float32x2_t v165 = (float32x2_t){v4, v4}; - float v170 = -8.6602540378443871e-01F; - float32x2_t v32 = v5[0]; - float32x2_t v108 = (float32x2_t){v106, v107}; - float32x2_t v131 = (float32x2_t){v133, v133}; - float32x2_t v135 = (float32x2_t){v133, v134}; - float32x2_t v164 = (float32x2_t){v162, v170}; - float32x2_t v171 = (float32x2_t){v170, v170}; - float32x2_t v20 = v5[istride * 4]; - float32x2_t v25 = v5[istride * 8]; - float32x2_t v38 = v5[istride * 7]; - float32x2_t v43 = v5[istride * 11]; - float32x2_t v50 = v5[istride * 3]; - float32x2_t v56 = v5[istride * 10]; - float32x2_t v61 = v5[istride * 2]; - float32x2_t v68 = v5[istride * 6]; - float32x2_t v79 = v5[istride * 5]; - float32x2_t v86 = v5[istride * 9]; - 
float32x2_t v110 = vmul_f32(v165, v108); - float32x2_t v137 = vmul_f32(v165, v135); - float32x2_t v166 = vmul_f32(v165, v164); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v44 = vadd_f32(v38, v43); - float32x2_t v45 = vsub_f32(v38, v43); - float32x2_t v62 = vadd_f32(v56, v61); - float32x2_t v63 = vsub_f32(v56, v61); - float32x2_t v80 = vadd_f32(v74, v79); - float32x2_t v81 = vsub_f32(v74, v79); - float32x2_t v33 = vadd_f32(v26, v32); - float32x2_t v51 = vadd_f32(v44, v50); - float32x2_t v69 = vadd_f32(v62, v68); - float32x2_t v87 = vadd_f32(v80, v86); - float32x2_t v115 = vadd_f32(v26, v62); - float32x2_t v116 = vsub_f32(v26, v62); - float32x2_t v117 = vadd_f32(v44, v80); - float32x2_t v118 = vsub_f32(v44, v80); - float32x2_t v142 = vadd_f32(v27, v63); - float32x2_t v143 = vsub_f32(v27, v63); - float32x2_t v144 = vadd_f32(v45, v81); - float32x2_t v145 = vsub_f32(v45, v81); - float32x2_t v88 = vadd_f32(v33, v69); - float32x2_t v89 = vsub_f32(v33, v69); - float32x2_t v90 = vadd_f32(v51, v87); - float32x2_t v91 = vsub_f32(v51, v87); - float32x2_t v119 = vadd_f32(v115, v117); - float32x2_t v120 = vsub_f32(v115, v117); - float32x2_t v132 = vmul_f32(v116, v131); - float32x2_t v138 = vrev64_f32(v118); - float32x2_t v146 = vadd_f32(v142, v144); - float32x2_t v147 = vsub_f32(v142, v144); - float32x2_t v167 = vrev64_f32(v143); - float32x2_t v172 = vmul_f32(v145, v171); - float32x2_t v92 = vadd_f32(v88, v90); - float32x2_t v93 = vsub_f32(v88, v90); - float32x2_t v111 = vrev64_f32(v91); - float32x2_t v124 = vmul_f32(v119, v131); - float32x2_t v128 = vmul_f32(v120, v131); - float32x2_t v139 = vmul_f32(v138, v137); - float32x2_t v153 = vrev64_f32(v146); - float32x2_t v160 = vrev64_f32(v147); - float32x2_t v168 = vmul_f32(v167, v166); - float32x2_t v112 = vmul_f32(v111, v110); - float32x2_t v140 = vadd_f32(v132, v139); - float32x2_t v141 = vsub_f32(v132, v139); - float32x2_t v154 = vmul_f32(v153, v166); - float32x2_t v161 = 
vmul_f32(v160, v166); - float32x2_t v173 = vadd_f32(v168, v172); - float32x2_t v174 = vsub_f32(v168, v172); - float32x2_t v175 = vadd_f32(v92, v124); - v6[0] = v92; - float32x2_t v211 = vadd_f32(v93, v128); - v6[ostride * 6] = v93; - float32x2_t v113 = vadd_f32(v89, v112); - float32x2_t v114 = vsub_f32(v89, v112); - float32x2_t v176 = vadd_f32(v175, v154); - float32x2_t v177 = vsub_f32(v175, v154); - float32x2_t v212 = vadd_f32(v211, v161); - float32x2_t v213 = vsub_f32(v211, v161); - v6[ostride * 4] = v177; - v6[ostride * 8] = v176; - float32x2_t v193 = vadd_f32(v114, v141); - v6[ostride * 9] = v114; - v6[ostride * 10] = v213; - v6[ostride * 2] = v212; - float32x2_t v229 = vadd_f32(v113, v140); - v6[ostride * 3] = v113; - float32x2_t v194 = vadd_f32(v193, v174); - float32x2_t v195 = vsub_f32(v193, v174); - float32x2_t v230 = vadd_f32(v229, v173); - float32x2_t v231 = vsub_f32(v229, v173); - v6[ostride] = v195; - v6[ostride * 5] = v194; - v6[ostride * 7] = v231; - v6[ostride * 11] = v230; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu12(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v136 = -1.0000000000000000e+00F; - float v161 = -1.4999999999999998e+00F; - float v166 = 1.4999999999999998e+00F; - float v202 = -8.6602540378443871e-01F; - const float32x2_t *v392 = &v5[v0]; - float32x2_t *v468 = &v6[v2]; - int64_t v19 = v0 * 4; - int64_t v26 = v0 * 8; - int64_t v43 = v0 * 7; - int64_t v50 = v0 * 11; - int64_t v59 = v0 * 3; - int64_t v67 
= v0 * 10; - int64_t v74 = v0 * 2; - int64_t v83 = v0 * 6; - int64_t v98 = v0 * 5; - int64_t v107 = v0 * 9; - float v139 = v4 * v136; - float v169 = v4 * v166; - float v198 = v4 * v202; - int64_t v219 = v2 * 4; - int64_t v226 = v2 * 8; - int64_t v236 = v2 * 9; - int64_t v250 = v2 * 5; - int64_t v260 = v2 * 6; - int64_t v267 = v2 * 10; - int64_t v274 = v2 * 2; - int64_t v284 = v2 * 3; - int64_t v291 = v2 * 7; - int64_t v298 = v2 * 11; - const float32x2_t *v329 = &v5[0]; - svint64_t v411 = svindex_s64(0, v1); - svfloat32_t v419 = svdup_n_f32(v161); - svfloat32_t v424 = svdup_n_f32(v202); - float32x2_t *v432 = &v6[0]; - const float32x2_t *v310 = &v5[v19]; - const float32x2_t *v319 = &v5[v26]; - svfloat32_t v331 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v329), v411)); - const float32x2_t *v338 = &v5[v43]; - const float32x2_t *v347 = &v5[v50]; - const float32x2_t *v356 = &v5[v59]; - const float32x2_t *v365 = &v5[v67]; - const float32x2_t *v374 = &v5[v74]; - const float32x2_t *v383 = &v5[v83]; - svfloat32_t v394 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v392), v411)); - const float32x2_t *v401 = &v5[v98]; - const float32x2_t *v410 = &v5[v107]; - svfloat32_t v416 = svdup_n_f32(v139); - svfloat32_t v420 = svdup_n_f32(v169); - svfloat32_t v423 = svdup_n_f32(v198); - float32x2_t *v441 = &v6[v219]; - float32x2_t *v450 = &v6[v226]; - float32x2_t *v459 = &v6[v236]; - float32x2_t *v477 = &v6[v250]; - float32x2_t *v486 = &v6[v260]; - float32x2_t *v495 = &v6[v267]; - float32x2_t *v504 = &v6[v274]; - float32x2_t *v513 = &v6[v284]; - float32x2_t *v522 = &v6[v291]; - float32x2_t *v531 = &v6[v298]; - svfloat32_t v312 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v310), v411)); - svfloat32_t v321 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v319), v411)); - svfloat32_t v340 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, 
(const double *)(v338), v411)); - svfloat32_t v349 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v347), v411)); - svfloat32_t v358 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v356), v411)); - svfloat32_t v367 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v365), v411)); - svfloat32_t v376 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v374), v411)); - svfloat32_t v385 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v383), v411)); - svfloat32_t v403 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v401), v411)); - svfloat32_t v412 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v410), v411)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v312), "w"(v321)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v312), "w"(v321)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v340), "w"(v349)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v340), "w"(v349)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v367), "w"(v376)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v367), "w"(v376)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v394), "w"(v403)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v394), "w"(v403)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v331)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v358)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v385)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v412)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v32), "w"(v80)); - svfloat32_t v145; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v145) : "w"(v32), "w"(v80)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v56), "w"(v104)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v56), "w"(v104)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v33), "w"(v81)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v33), "w"(v81)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v57), "w"(v105)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v57), "w"(v105)); - svfloat32_t v114; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v41), "w"(v89)); - svfloat32_t v115; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v41), "w"(v89)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v65), "w"(v113)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v65), "w"(v113)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v144), "w"(v146)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v144), "w"(v146)); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); - svfloat32_t v171 = svcmla_f32_x(pred_full, zero171, v420, v147, 90); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v174), "w"(v176)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v174), "w"(v176)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); - svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v423, v175, 90); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v114), "w"(v116)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v114), "w"(v116)); - svfloat32_t zero141; - asm volatile("mov %0.s, #0" : "=w"(zero141)); - svfloat32_t v141 = svcmla_f32_x(pred_full, zero141, v416, v117, 90); - svfloat32_t v172 = svmla_f32_x(pred_full, v171, v145, v419); - svfloat32_t v173 = svnmls_f32_x(pred_full, v171, v145, v419); - 
svfloat32_t zero186; - asm volatile("mov %0.s, #0" : "=w"(zero186)); - svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v423, v178, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); - svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v423, v179, 90); - svfloat32_t v206 = svmla_f32_x(pred_full, v200, v177, v424); - svfloat32_t v207 = svmls_f32_x(pred_full, v200, v177, v424); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v115), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v115), "w"(v141)); - svfloat32_t v208 = svmla_f32_x(pred_full, v118, v148, v419); - svfloat32_t v256 = svmla_f32_x(pred_full, v119, v149, v419); - svst1_f64(pred_full, (double *)(v432), svreinterpret_f64_f32(v118)); - svst1_f64(pred_full, (double *)(v486), svreinterpret_f64_f32(v119)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v186)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v208), "w"(v186)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v143), "w"(v173)); - svfloat32_t v257; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v256), "w"(v193)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v256), "w"(v193)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v142), "w"(v172)); - svst1_f64(pred_full, (double *)(v459), svreinterpret_f64_f32(v143)); - svst1_f64(pred_full, (double *)(v513), svreinterpret_f64_f32(v142)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v232), "w"(v207)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v232), "w"(v207)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v206)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v280), "w"(v206)); - svst1_f64(pred_full, (double *)(v441), svreinterpret_f64_f32(v210)); - svst1_f64(pred_full, (double *)(v450), 
svreinterpret_f64_f32(v209)); - svst1_f64(pred_full, (double *)(v495), svreinterpret_f64_f32(v258)); - svst1_f64(pred_full, (double *)(v504), svreinterpret_f64_f32(v257)); - svst1_f64(pred_full, (double *)(v468), svreinterpret_f64_f32(v234)); - svst1_f64(pred_full, (double *)(v477), svreinterpret_f64_f32(v233)); - svst1_f64(pred_full, (double *)(v522), svreinterpret_f64_f32(v282)); - svst1_f64(pred_full, (double *)(v531), svreinterpret_f64_f32(v281)); - v5 += v11; - v6 += v12; - } -} -#endif - #ifndef ARMRAL_ARCH_SVE void armral_fft_cf32_cf32_cf32_ac_n_gu13(const armral_cmplx_f32_t *restrict x, armral_cmplx_f32_t *restrict y, @@ -2982,218 +387,125 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu13(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v519), v539)); svfloat32_t v530 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v528), v539)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v431), "w"(v440)); - svfloat32_t v47; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v449), "w"(v458)); - svfloat32_t v62; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v467), "w"(v476)); - svfloat32_t v77; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v485), "w"(v494)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v503), "w"(v512)); - svfloat32_t v107; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v521), "w"(v530)); - svfloat32_t v108; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v431), "w"(v440)); - svfloat32_t v109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v449), "w"(v458)); - svfloat32_t v110; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v467), "w"(v476)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v485), "w"(v494)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v503), "w"(v512)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v521), "w"(v530)); - svfloat32_t v114; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v114) : "w"(v47), "w"(v92)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v32), "w"(v62)); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v109), "w"(v112)); - svfloat32_t v121; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v108), "w"(v110)); - svfloat32_t v123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v47), "w"(v107)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v62), "w"(v77)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v32), "w"(v77)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v92), "w"(v107)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v109), "w"(v113)); - svfloat32_t v132; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v108), "w"(v110)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v109), "w"(v112)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v108), "w"(v111)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v112), "w"(v113)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v110), "w"(v111)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v114), "w"(v107)); - svfloat32_t v117; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v116), "w"(v77)); - svfloat32_t v120; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v119), "w"(v113)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v121), "w"(v111)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v123), "w"(v124)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v125), "w"(v126)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v123), "w"(v124)); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v125), "w"(v126)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v131), "w"(v132)); - svfloat32_t 
v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v133), "w"(v134)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v135), "w"(v136)); - svfloat32_t zero224; - asm volatile("mov %0.s, #0" : "=w"(zero224)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v431, v440); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v449, v458); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v467, v476); + svfloat32_t v77 = svadd_f32_x(svptrue_b32(), v485, v494); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v503, v512); + svfloat32_t v107 = svadd_f32_x(svptrue_b32(), v521, v530); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v431, v440); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v449, v458); + svfloat32_t v110 = svsub_f32_x(svptrue_b32(), v467, v476); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v485, v494); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v503, v512); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v521, v530); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v47, v92); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v32, v62); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v109, v112); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v47, v107); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v62, v77); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v32, v77); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v92, v107); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v109, v113); + svfloat32_t v132 = svsub_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v109, v112); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v108, v111); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v112, v113); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v110, v111); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v114, v107); + svfloat32_t v117 = svadd_f32_x(svptrue_b32(), v116, v77); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v119, v113); + svfloat32_t 
v122 = svsub_f32_x(svptrue_b32(), v121, v111); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v123, v124); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v125, v126); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v123, v124); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v125, v126); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v131, v132); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v133, v134); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v135, v136); + svfloat32_t zero224 = svdup_n_f32(0); svfloat32_t v224 = svcmla_f32_x(pred_full, zero224, v553, v131, 90); - svfloat32_t zero231; - asm volatile("mov %0.s, #0" : "=w"(zero231)); + svfloat32_t zero231 = svdup_n_f32(0); svfloat32_t v231 = svcmla_f32_x(pred_full, zero231, v554, v132, 90); - svfloat32_t zero245; - asm volatile("mov %0.s, #0" : "=w"(zero245)); + svfloat32_t zero245 = svdup_n_f32(0); svfloat32_t v245 = svcmla_f32_x(pred_full, zero245, v556, v133, 90); - svfloat32_t zero252; - asm volatile("mov %0.s, #0" : "=w"(zero252)); + svfloat32_t zero252 = svdup_n_f32(0); svfloat32_t v252 = svcmla_f32_x(pred_full, zero252, v557, v134, 90); - svfloat32_t zero266; - asm volatile("mov %0.s, #0" : "=w"(zero266)); + svfloat32_t zero266 = svdup_n_f32(0); svfloat32_t v266 = svcmla_f32_x(pred_full, zero266, v559, v135, 90); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v115), "w"(v117)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v117), "w"(v115)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v120), "w"(v122)); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v127), "w"(v128)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v129), "w"(v130)); - svfloat32_t zero173; - asm volatile("mov %0.s, #0" : "=w"(zero173)); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v115, v117); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v117, v115); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v120, v122); + 
svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v127, v128); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v129, v130); + svfloat32_t zero173 = svdup_n_f32(0); svfloat32_t v173 = svcmla_f32_x(pred_full, zero173, v544, v120, 90); - svfloat32_t zero180; - asm volatile("mov %0.s, #0" : "=w"(zero180)); + svfloat32_t zero180 = svdup_n_f32(0); svfloat32_t v180 = svcmla_f32_x(pred_full, zero180, v545, v122, 90); - svfloat32_t v192; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v127), "w"(v547)); - svfloat32_t zero238; - asm volatile("mov %0.s, #0" : "=w"(zero238)); + svfloat32_t v192 = svmul_f32_x(svptrue_b32(), v127, v547); + svfloat32_t zero238 = svdup_n_f32(0); svfloat32_t v238 = svcmla_f32_x(pred_full, zero238, v555, v149, 90); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x(pred_full, zero259, v558, v150, 90); - svfloat32_t zero280; - asm volatile("mov %0.s, #0" : "=w"(zero280)); + svfloat32_t zero280 = svdup_n_f32(0); svfloat32_t v280 = svcmla_f32_x(pred_full, zero280, v561, v151, 90); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v540), "w"(v118)); - svfloat32_t zero187; - asm volatile("mov %0.s, #0" : "=w"(zero187)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v540, v118); + svfloat32_t zero187 = svdup_n_f32(0); svfloat32_t v187 = svcmla_f32_x(pred_full, zero187, v546, v146, 90); - svfloat32_t v202; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v147), "w"(v549)); + svfloat32_t v202 = svmul_f32_x(svptrue_b32(), v147, v549); svfloat32_t v282 = svmla_f32_x(pred_full, v192, v128, v548); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v224), "w"(v238)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v231), "w"(v238)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v245), "w"(v259)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v252), "w"(v259)); - 
svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v266), "w"(v280)); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v224, v238); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v245, v259); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v252, v259); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v266, v280); svfloat32_t v299 = svcmla_f32_x(pred_full, v280, v560, v136, 90); svfloat32_t v281 = svmls_f32_x(pred_full, v144, v118, v542); svfloat32_t v283 = svmls_f32_x(pred_full, v282, v145, v543); svfloat32_t v284 = svmla_f32_x(pred_full, v202, v128, v548); svfloat32_t v286 = svnmls_f32_x(pred_full, v192, v147, v549); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v173), "w"(v187)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v180), "w"(v187)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v294), "w"(v298)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v296), "w"(v298)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v295), "w"(v299)); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v173, v187); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v180, v187); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v294, v298); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v295, v299); svst1_f64(pred_full, (double *)(v569), svreinterpret_f64_f32(v144)); svfloat32_t v285 = svmla_f32_x(pred_full, v284, v145, v543); svfloat32_t v287 = svmls_f32_x(pred_full, v286, v145, v543); svfloat32_t v288 = svmla_f32_x(pred_full, v281, v129, v550); svfloat32_t v290 = svmls_f32_x(pred_full, v281, v130, v551); svfloat32_t v292 = svmls_f32_x(pred_full, v281, v129, v550); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v301), "w"(v294)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v299), "w"(v300)); - 
svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v301)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v314), "w"(v301)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v316), "w"(v300)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v300), "w"(v295)); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v301, v294); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v299, v300); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v312, v301); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v314, v301); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v316, v300); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v300, v295); svfloat32_t v289 = svmla_f32_x(pred_full, v288, v130, v551); svfloat32_t v291 = svmls_f32_x(pred_full, v290, v148, v552); svfloat32_t v293 = svmla_f32_x(pred_full, v292, v148, v552); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v308), "w"(v296)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v310), "w"(v297)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v318), "w"(v297)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v283), "w"(v289)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v285), "w"(v291)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v291), "w"(v285)); - svfloat32_t v305; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v287), "w"(v293)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v289), "w"(v283)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v293), "w"(v287)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v302), "w"(v309)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v303), "w"(v311)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v304), "w"(v313)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v323) : "w"(v305), "w"(v315)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v306), "w"(v317)); - svfloat32_t v325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v307), "w"(v319)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v307), "w"(v319)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v306), "w"(v317)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v305), "w"(v315)); - svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v304), "w"(v313)); - svfloat32_t v330; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v303), "w"(v311)); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v302), "w"(v309)); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v296); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v310, v297); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v318, v297); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v285, v291); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v291, v285); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v287, v293); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v289, v283); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v293, v287); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v302, v309); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v304, v313); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v305, v315); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v306, v317); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v307, v319); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v307, v319); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v306, v317); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v305, v315); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v304, v313); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v331 = 
svadd_f32_x(svptrue_b32(), v302, v309); svst1_f64(pred_full, (double *)(v578), svreinterpret_f64_f32(v320)); svst1_f64(pred_full, (double *)(v587), svreinterpret_f64_f32(v321)); svst1_f64(pred_full, (double *)(v596), svreinterpret_f64_f32(v322)); @@ -3508,190 +820,110 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu14(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v521), v531)); svfloat32_t v532 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v530), v531)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v415), "w"(v424)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v415), "w"(v424)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v433), "w"(v442)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v433), "w"(v442)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v451), "w"(v460)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v451), "w"(v460)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v469), "w"(v478)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v469), "w"(v478)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v487), "w"(v496)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v487), "w"(v496)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v505), "w"(v514)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v505), "w"(v514)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v523), "w"(v532)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v523), "w"(v532)); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v48), "w"(v128)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v48), "w"(v128)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v96), 
"w"(v80)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v96), "w"(v80)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v64), "w"(v112)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v64), "w"(v112)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v49), "w"(v129)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v49), "w"(v129)); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v97), "w"(v81)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v97), "w"(v81)); - svfloat32_t v223; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v65), "w"(v113)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v65), "w"(v113)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v130), "w"(v132)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v130), "w"(v132)); - svfloat32_t v140; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v132), "w"(v134)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v134), "w"(v130)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v131), "w"(v133)); - svfloat32_t v144; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v131), "w"(v133)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v133), "w"(v135)); - svfloat32_t v146; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v135), "w"(v131)); - svfloat32_t v225; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v219), "w"(v221)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v219), "w"(v221)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v221), "w"(v223)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v223), "w"(v219)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v220), "w"(v222)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : 
"w"(v220), "w"(v222)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v222), "w"(v224)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v224), "w"(v220)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v136), "w"(v134)); - svfloat32_t v143; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v142), "w"(v135)); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v415, v424); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v415, v424); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v433, v442); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v433, v442); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v451, v460); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v451, v460); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v469, v478); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v469, v478); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v487, v496); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v487, v496); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v505, v514); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v505, v514); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v523, v532); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v523, v532); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v48, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v48, v128); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v96, v80); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v96, v80); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v49, v129); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v49, v129); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v97, v81); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v97, v81); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v65, 
v113); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v130, v132); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v130, v132); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v134, v130); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v131, v133); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v131, v133); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v135, v131); + svfloat32_t v225 = svadd_f32_x(svptrue_b32(), v219, v221); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v219, v221); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v223, v219); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v220, v222); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v220, v222); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v222, v224); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v224, v220); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v136, v134); + svfloat32_t v143 = svadd_f32_x(svptrue_b32(), v142, v135); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, zero185, v548, v144, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, zero192, v549, v145, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v550, v146, 90); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v225), "w"(v223)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v231), "w"(v224)); - svfloat32_t zero274; - asm volatile("mov %0.s, #0" : "=w"(zero274)); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v225, v223); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v231, v224); + svfloat32_t zero274 = svdup_n_f32(0); svfloat32_t v274 
= svcmla_f32_x(pred_full, zero274, v548, v233, 90); - svfloat32_t zero281; - asm volatile("mov %0.s, #0" : "=w"(zero281)); + svfloat32_t zero281 = svdup_n_f32(0); svfloat32_t v281 = svcmla_f32_x(pred_full, zero281, v549, v234, 90); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); + svfloat32_t zero288 = svdup_n_f32(0); svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v550, v235, 90); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v137), "w"(v32)); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v137, v32); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v547, v143, 90); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v226), "w"(v33)); - svfloat32_t zero267; - asm volatile("mov %0.s, #0" : "=w"(zero267)); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v226, v33); + svfloat32_t zero267 = svdup_n_f32(0); svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v547, v232, 90); svfloat32_t v200 = svmla_f32_x(pred_full, v138, v137, v543); - svfloat32_t v207; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v178), "w"(v185)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v178), "w"(v185)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v178), "w"(v192)); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v178, v192); svfloat32_t v289 = svmla_f32_x(pred_full, v227, v226, v543); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v267), "w"(v274)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v267), "w"(v274)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v267), "w"(v281)); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v267, v274); + svfloat32_t v298 = 
svsub_f32_x(svptrue_b32(), v267, v274); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v267, v281); svst1_f64(pred_full, (double *)(v558), svreinterpret_f64_f32(v138)); svst1_f64(pred_full, (double *)(v567), svreinterpret_f64_f32(v227)); svfloat32_t v201 = svmla_f32_x(pred_full, v200, v139, v544); svfloat32_t v203 = svmls_f32_x(pred_full, v200, v139, v544); svfloat32_t v205 = svmls_f32_x(pred_full, v200, v140, v545); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v207), "w"(v192)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v209), "w"(v199)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v211), "w"(v199)); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v207, v192); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v199); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v211, v199); svfloat32_t v290 = svmla_f32_x(pred_full, v289, v228, v544); svfloat32_t v292 = svmls_f32_x(pred_full, v289, v228, v544); svfloat32_t v294 = svmls_f32_x(pred_full, v289, v229, v545); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v296), "w"(v281)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v298), "w"(v288)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v300), "w"(v288)); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v296, v281); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v298, v288); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v300, v288); svfloat32_t v202 = svmla_f32_x(pred_full, v201, v140, v545); svfloat32_t v204 = svmls_f32_x(pred_full, v203, v141, v546); svfloat32_t v206 = svmla_f32_x(pred_full, v205, v141, v546); svfloat32_t v291 = svmla_f32_x(pred_full, v290, v229, v545); svfloat32_t v293 = svmls_f32_x(pred_full, v292, v230, v546); svfloat32_t v295 = svmla_f32_x(pred_full, v294, v230, v546); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v202), "w"(v208)); - svfloat32_t v214; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v214) : "w"(v202), "w"(v208)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v204), "w"(v210)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v204), "w"(v210)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v206), "w"(v212)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v206), "w"(v212)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v291), "w"(v297)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v291), "w"(v297)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v293), "w"(v299)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v293), "w"(v299)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v295), "w"(v301)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v295), "w"(v301)); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v204, v210); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v204, v210); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v206, v212); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v206, v212); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v293, v299); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v293, v299); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v295, v301); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v295, v301); svst1_f64(pred_full, (double *)(v576), svreinterpret_f64_f32(v214)); svst1_f64(pred_full, (double *)(v585), svreinterpret_f64_f32(v303)); svst1_f64(pred_full, (double *)(v594), svreinterpret_f64_f32(v216)); @@ -3710,553 +942,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu14(const armral_cmplx_f32_t *restrict x, } #endif -#ifndef ARMRAL_ARCH_SVE 
-void armral_fft_cf32_cf32_cf32_ac_n_gu15(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - for (int j = 0; j < howmany; j += 1) { - float32x2_t v61 = v5[istride]; - float v119 = -1.2500000000000000e+00F; - float v123 = 5.5901699437494745e-01F; - float v126 = 1.5388417685876268e+00F; - float v127 = -1.5388417685876268e+00F; - float v133 = 5.8778525229247325e-01F; - float v134 = -5.8778525229247325e-01F; - float v140 = 3.6327126400268028e-01F; - float v141 = -3.6327126400268028e-01F; - float v165 = -1.4999999999999998e+00F; - float v169 = 1.8749999999999998e+00F; - float v173 = -8.3852549156242107e-01F; - float v176 = -2.3082626528814396e+00F; - float v177 = 2.3082626528814396e+00F; - float v183 = -8.8167787843870971e-01F; - float v184 = 8.8167787843870971e-01F; - float v190 = -5.4490689600402031e-01F; - float v191 = 5.4490689600402031e-01F; - float v214 = 8.6602540378443871e-01F; - float v215 = -8.6602540378443871e-01F; - float v221 = -1.0825317547305484e+00F; - float v222 = 1.0825317547305484e+00F; - float v228 = 4.8412291827592718e-01F; - float v229 = -4.8412291827592718e-01F; - float32x2_t v231 = (float32x2_t){v4, v4}; - float v236 = -1.3326760640014592e+00F; - float v240 = -5.0903696045512736e-01F; - float v244 = -3.1460214309120460e-01F; - float32x2_t v32 = v5[0]; - float32x2_t v120 = (float32x2_t){v119, v119}; - float32x2_t v124 = (float32x2_t){v123, v123}; - float32x2_t v128 = (float32x2_t){v126, v127}; - float32x2_t v135 = (float32x2_t){v133, v134}; - float32x2_t v142 = (float32x2_t){v140, v141}; - float32x2_t v166 = (float32x2_t){v165, v165}; - float32x2_t v170 = (float32x2_t){v169, v169}; - float32x2_t v174 = (float32x2_t){v173, v173}; - float32x2_t v178 = (float32x2_t){v176, v177}; - float32x2_t v185 = (float32x2_t){v183, v184}; - float32x2_t v192 = 
(float32x2_t){v190, v191}; - float32x2_t v216 = (float32x2_t){v214, v215}; - float32x2_t v223 = (float32x2_t){v221, v222}; - float32x2_t v230 = (float32x2_t){v228, v229}; - float32x2_t v237 = (float32x2_t){v236, v236}; - float32x2_t v241 = (float32x2_t){v240, v240}; - float32x2_t v245 = (float32x2_t){v244, v244}; - float32x2_t v20 = v5[istride * 5]; - float32x2_t v25 = v5[istride * 10]; - float32x2_t v38 = v5[istride * 8]; - float32x2_t v43 = v5[istride * 13]; - float32x2_t v50 = v5[istride * 3]; - float32x2_t v56 = v5[istride * 11]; - float32x2_t v68 = v5[istride * 6]; - float32x2_t v74 = v5[istride * 14]; - float32x2_t v79 = v5[istride * 4]; - float32x2_t v86 = v5[istride * 9]; - float32x2_t v92 = v5[istride * 2]; - float32x2_t v97 = v5[istride * 7]; - float32x2_t v104 = v5[istride * 12]; - float32x2_t v130 = vmul_f32(v231, v128); - float32x2_t v137 = vmul_f32(v231, v135); - float32x2_t v144 = vmul_f32(v231, v142); - float32x2_t v180 = vmul_f32(v231, v178); - float32x2_t v187 = vmul_f32(v231, v185); - float32x2_t v194 = vmul_f32(v231, v192); - float32x2_t v218 = vmul_f32(v231, v216); - float32x2_t v225 = vmul_f32(v231, v223); - float32x2_t v232 = vmul_f32(v231, v230); - float32x2_t v26 = vadd_f32(v20, v25); - float32x2_t v27 = vsub_f32(v20, v25); - float32x2_t v44 = vadd_f32(v38, v43); - float32x2_t v45 = vsub_f32(v38, v43); - float32x2_t v62 = vadd_f32(v56, v61); - float32x2_t v63 = vsub_f32(v56, v61); - float32x2_t v80 = vadd_f32(v74, v79); - float32x2_t v81 = vsub_f32(v74, v79); - float32x2_t v98 = vadd_f32(v92, v97); - float32x2_t v99 = vsub_f32(v92, v97); - float32x2_t v33 = vadd_f32(v26, v32); - float32x2_t v51 = vadd_f32(v44, v50); - float32x2_t v69 = vadd_f32(v62, v68); - float32x2_t v87 = vadd_f32(v80, v86); - float32x2_t v105 = vadd_f32(v98, v104); - float32x2_t v156 = vadd_f32(v44, v98); - float32x2_t v157 = vsub_f32(v44, v98); - float32x2_t v158 = vadd_f32(v80, v62); - float32x2_t v159 = vsub_f32(v80, v62); - float32x2_t v206 = vadd_f32(v45, v99); - 
float32x2_t v207 = vsub_f32(v45, v99); - float32x2_t v208 = vadd_f32(v81, v63); - float32x2_t v209 = vsub_f32(v81, v63); - float32x2_t v106 = vadd_f32(v51, v105); - float32x2_t v107 = vsub_f32(v51, v105); - float32x2_t v108 = vadd_f32(v87, v69); - float32x2_t v109 = vsub_f32(v87, v69); - float32x2_t v160 = vadd_f32(v156, v158); - float32x2_t v161 = vsub_f32(v156, v158); - float32x2_t v162 = vadd_f32(v157, v159); - float32x2_t v181 = vrev64_f32(v157); - float32x2_t v195 = vrev64_f32(v159); - float32x2_t v210 = vadd_f32(v206, v208); - float32x2_t v211 = vsub_f32(v206, v208); - float32x2_t v212 = vadd_f32(v207, v209); - float32x2_t v238 = vmul_f32(v207, v237); - float32x2_t v246 = vmul_f32(v209, v245); - float32x2_t v110 = vadd_f32(v106, v108); - float32x2_t v111 = vsub_f32(v106, v108); - float32x2_t v112 = vadd_f32(v107, v109); - float32x2_t v131 = vrev64_f32(v107); - float32x2_t v145 = vrev64_f32(v109); - float32x2_t v163 = vadd_f32(v160, v26); - float32x2_t v171 = vmul_f32(v160, v170); - float32x2_t v175 = vmul_f32(v161, v174); - float32x2_t v182 = vmul_f32(v181, v180); - float32x2_t v188 = vrev64_f32(v162); - float32x2_t v196 = vmul_f32(v195, v194); - float32x2_t v213 = vadd_f32(v210, v27); - float32x2_t v226 = vrev64_f32(v210); - float32x2_t v233 = vrev64_f32(v211); - float32x2_t v242 = vmul_f32(v212, v241); - float32x2_t v113 = vadd_f32(v110, v33); - float32x2_t v121 = vmul_f32(v110, v120); - float32x2_t v125 = vmul_f32(v111, v124); - float32x2_t v132 = vmul_f32(v131, v130); - float32x2_t v138 = vrev64_f32(v112); - float32x2_t v146 = vmul_f32(v145, v144); - float32x2_t v167 = vmul_f32(v163, v166); - float32x2_t v189 = vmul_f32(v188, v187); - float32x2_t v219 = vrev64_f32(v213); - float32x2_t v227 = vmul_f32(v226, v225); - float32x2_t v234 = vmul_f32(v233, v232); - float32x2_t v250 = vsub_f32(v238, v242); - float32x2_t v251 = vadd_f32(v242, v246); - float32x2_t v139 = vmul_f32(v138, v137); - float32x2_t v147 = vadd_f32(v113, v121); - float32x2_t v197 = 
vadd_f32(v167, v171); - float32x2_t v200 = vsub_f32(v182, v189); - float32x2_t v201 = vadd_f32(v189, v196); - float32x2_t v220 = vmul_f32(v219, v218); - float32x2_t v256 = vadd_f32(v113, v167); - v6[0] = v113; - float32x2_t v148 = vadd_f32(v147, v125); - float32x2_t v149 = vsub_f32(v147, v125); - float32x2_t v150 = vsub_f32(v132, v139); - float32x2_t v151 = vadd_f32(v139, v146); - float32x2_t v198 = vadd_f32(v197, v175); - float32x2_t v199 = vsub_f32(v197, v175); - float32x2_t v247 = vadd_f32(v220, v227); - float32x2_t v257 = vadd_f32(v256, v220); - float32x2_t v258 = vsub_f32(v256, v220); - float32x2_t v152 = vadd_f32(v148, v150); - float32x2_t v153 = vsub_f32(v148, v150); - float32x2_t v154 = vadd_f32(v149, v151); - float32x2_t v155 = vsub_f32(v149, v151); - float32x2_t v202 = vadd_f32(v198, v200); - float32x2_t v203 = vsub_f32(v198, v200); - float32x2_t v204 = vadd_f32(v199, v201); - float32x2_t v205 = vsub_f32(v199, v201); - float32x2_t v248 = vadd_f32(v247, v234); - float32x2_t v249 = vsub_f32(v247, v234); - v6[ostride * 10] = v258; - v6[ostride * 5] = v257; - float32x2_t v252 = vadd_f32(v248, v250); - float32x2_t v253 = vsub_f32(v248, v250); - float32x2_t v254 = vadd_f32(v249, v251); - float32x2_t v255 = vsub_f32(v249, v251); - float32x2_t v274 = vadd_f32(v153, v203); - v6[ostride * 6] = v153; - float32x2_t v292 = vadd_f32(v155, v205); - v6[ostride * 12] = v155; - float32x2_t v310 = vadd_f32(v154, v204); - v6[ostride * 3] = v154; - float32x2_t v328 = vadd_f32(v152, v202); - v6[ostride * 9] = v152; - float32x2_t v275 = vadd_f32(v274, v253); - float32x2_t v276 = vsub_f32(v274, v253); - float32x2_t v293 = vadd_f32(v292, v255); - float32x2_t v294 = vsub_f32(v292, v255); - float32x2_t v311 = vadd_f32(v310, v254); - float32x2_t v312 = vsub_f32(v310, v254); - float32x2_t v329 = vadd_f32(v328, v252); - float32x2_t v330 = vsub_f32(v328, v252); - v6[ostride] = v276; - v6[ostride * 11] = v275; - v6[ostride * 7] = v294; - v6[ostride * 2] = v293; - v6[ostride * 13] = 
v312; - v6[ostride * 8] = v311; - v6[ostride * 4] = v330; - v6[ostride * 14] = v329; - v5 += 1 * idist; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_n_gu15(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, int howmany, - int idist, float dir) { - int64_t v0 = istride; - int64_t v1 = idist; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * v1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v152 = -1.2500000000000000e+00F; - float v157 = 5.5901699437494745e-01F; - float v162 = -1.5388417685876268e+00F; - float v169 = -5.8778525229247325e-01F; - float v176 = -3.6327126400268028e-01F; - float v200 = -1.4999999999999998e+00F; - float v205 = 1.8749999999999998e+00F; - float v210 = -8.3852549156242107e-01F; - float v215 = 2.3082626528814396e+00F; - float v222 = 8.8167787843870971e-01F; - float v229 = 5.4490689600402031e-01F; - float v253 = -8.6602540378443871e-01F; - float v260 = 1.0825317547305484e+00F; - float v267 = -4.8412291827592718e-01F; - float v274 = -1.3326760640014592e+00F; - float v279 = -5.0903696045512736e-01F; - float v284 = -3.1460214309120460e-01F; - const float32x2_t *v487 = &v5[v0]; - float32x2_t *v614 = &v6[v2]; - int64_t v19 = v0 * 5; - int64_t v26 = v0 * 10; - int64_t v43 = v0 * 8; - int64_t v50 = v0 * 13; - int64_t v59 = v0 * 3; - int64_t v67 = v0 * 11; - int64_t v83 = v0 * 6; - int64_t v91 = v0 * 14; - int64_t v98 = v0 * 4; - int64_t v107 = v0 * 9; - int64_t v115 = v0 * 2; - int64_t v122 = v0 * 7; - int64_t v131 = v0 * 12; - float v165 = v4 * v162; - float v172 = v4 * v169; - float v179 = v4 * v176; - float v218 = v4 * v215; - float v225 = v4 * v222; - float v232 = v4 * v229; - float v256 = v4 * v253; - float v263 = v4 * 
v260; - float v270 = v4 * v267; - int64_t v308 = v2 * 10; - int64_t v315 = v2 * 5; - int64_t v325 = v2 * 6; - int64_t v339 = v2 * 11; - int64_t v349 = v2 * 12; - int64_t v356 = v2 * 7; - int64_t v363 = v2 * 2; - int64_t v373 = v2 * 3; - int64_t v380 = v2 * 13; - int64_t v387 = v2 * 8; - int64_t v397 = v2 * 9; - int64_t v404 = v2 * 4; - int64_t v411 = v2 * 14; - const float32x2_t *v442 = &v5[0]; - svint64_t v551 = svindex_s64(0, v1); - svfloat32_t v554 = svdup_n_f32(v152); - svfloat32_t v555 = svdup_n_f32(v157); - svfloat32_t v559 = svdup_n_f32(v200); - svfloat32_t v560 = svdup_n_f32(v205); - svfloat32_t v561 = svdup_n_f32(v210); - svfloat32_t v568 = svdup_n_f32(v274); - svfloat32_t v569 = svdup_n_f32(v279); - svfloat32_t v570 = svdup_n_f32(v284); - float32x2_t *v578 = &v6[0]; - const float32x2_t *v423 = &v5[v19]; - const float32x2_t *v432 = &v5[v26]; - svfloat32_t v444 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v442), v551)); - const float32x2_t *v451 = &v5[v43]; - const float32x2_t *v460 = &v5[v50]; - const float32x2_t *v469 = &v5[v59]; - const float32x2_t *v478 = &v5[v67]; - svfloat32_t v489 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v487), v551)); - const float32x2_t *v496 = &v5[v83]; - const float32x2_t *v505 = &v5[v91]; - const float32x2_t *v514 = &v5[v98]; - const float32x2_t *v523 = &v5[v107]; - const float32x2_t *v532 = &v5[v115]; - const float32x2_t *v541 = &v5[v122]; - const float32x2_t *v550 = &v5[v131]; - svfloat32_t v556 = svdup_n_f32(v165); - svfloat32_t v557 = svdup_n_f32(v172); - svfloat32_t v558 = svdup_n_f32(v179); - svfloat32_t v562 = svdup_n_f32(v218); - svfloat32_t v563 = svdup_n_f32(v225); - svfloat32_t v564 = svdup_n_f32(v232); - svfloat32_t v565 = svdup_n_f32(v256); - svfloat32_t v566 = svdup_n_f32(v263); - svfloat32_t v567 = svdup_n_f32(v270); - float32x2_t *v587 = &v6[v308]; - float32x2_t *v596 = &v6[v315]; - float32x2_t *v605 = &v6[v325]; - float32x2_t *v623 = 
&v6[v339]; - float32x2_t *v632 = &v6[v349]; - float32x2_t *v641 = &v6[v356]; - float32x2_t *v650 = &v6[v363]; - float32x2_t *v659 = &v6[v373]; - float32x2_t *v668 = &v6[v380]; - float32x2_t *v677 = &v6[v387]; - float32x2_t *v686 = &v6[v397]; - float32x2_t *v695 = &v6[v404]; - float32x2_t *v704 = &v6[v411]; - svfloat32_t v425 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v423), v551)); - svfloat32_t v434 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v432), v551)); - svfloat32_t v453 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v451), v551)); - svfloat32_t v462 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v460), v551)); - svfloat32_t v471 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v469), v551)); - svfloat32_t v480 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v478), v551)); - svfloat32_t v498 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v496), v551)); - svfloat32_t v507 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v505), v551)); - svfloat32_t v516 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v514), v551)); - svfloat32_t v525 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v523), v551)); - svfloat32_t v534 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v532), v551)); - svfloat32_t v543 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v541), v551)); - svfloat32_t v552 = svreinterpret_f32_f64( - svld1_gather_s64index_f64(pred_full, (const double *)(v550), v551)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v425), "w"(v434)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v425), "w"(v434)); - svfloat32_t v56; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v453), "w"(v462)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v453), "w"(v462)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v480), "w"(v489)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v480), "w"(v489)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v507), "w"(v516)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v507), "w"(v516)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v534), "w"(v543)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v534), "w"(v543)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v444)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v471)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v498)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v525)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v552)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v56), "w"(v128)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v56), "w"(v128)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v104), "w"(v80)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v104), "w"(v80)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v57), "w"(v129)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v57), "w"(v129)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v105), "w"(v81)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v105), "w"(v81)); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v65), "w"(v137)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v65), "w"(v137)); - svfloat32_t v140; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v113), "w"(v89)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v113), "w"(v89)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v191), "w"(v193)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v191), "w"(v193)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v192), "w"(v194)); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); - svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v562, v192, 90); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v244), "w"(v246)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v244), "w"(v246)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v245), "w"(v247)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v247), "w"(v570)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v138), "w"(v140)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v138), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v139), "w"(v141)); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); - svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v556, v139, 90); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v195), "w"(v32)); - svfloat32_t v208; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v195), "w"(v560)); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); - svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v563, v197, 90); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v248), "w"(v33)); - svfloat32_t zero272; - asm volatile("mov %0.s, #0" : "=w"(zero272)); - svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v567, v249, 90); - svfloat32_t v282; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v250), "w"(v569)); - svfloat32_t v145; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v145) : "w"(v142), "w"(v41)); - svfloat32_t zero174; - asm volatile("mov %0.s, #0" : "=w"(zero174)); - svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v557, v144, 90); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v220), "w"(v227)); - svfloat32_t v239 = svcmla_f32_x(pred_full, v227, v564, v194, 90); - svfloat32_t zero258; - asm volatile("mov %0.s, #0" : "=w"(zero258)); - svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v565, v251, 90); - svfloat32_t v291 = svnmls_f32_x(pred_full, v282, v245, v568); - svfloat32_t v292 = svmla_f32_x(pred_full, v287, v250, v569); - svfloat32_t v182 = svmla_f32_x(pred_full, v145, v142, v554); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v167), "w"(v174)); - svfloat32_t v186 = svcmla_f32_x(pred_full, v174, v558, v141, 90); - svfloat32_t v235 = svmla_f32_x(pred_full, v208, v198, v559); - svfloat32_t v288 = svcmla_f32_x(pred_full, v258, v566, v248, 90); - svfloat32_t v297 = svmla_f32_x(pred_full, v145, v198, v559); - svst1_f64(pred_full, (double *)(v578), svreinterpret_f64_f32(v145)); - svfloat32_t v183 = svmla_f32_x(pred_full, v182, v143, v555); - svfloat32_t v184 = svmls_f32_x(pred_full, v182, v143, v555); - svfloat32_t v236 = svmla_f32_x(pred_full, v235, v196, v561); - svfloat32_t v237 = svmls_f32_x(pred_full, v235, v196, v561); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), "w"(v272)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v288), "w"(v272)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v297), "w"(v258)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v297), "w"(v258)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v183), "w"(v185)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v183), "w"(v185)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v184), "w"(v186)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v190) : "w"(v184), "w"(v186)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v236), "w"(v238)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v236), "w"(v238)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v237), "w"(v239)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v237), "w"(v239)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v289), "w"(v291)); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v289), "w"(v291)); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v290), "w"(v292)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v290), "w"(v292)); - svst1_f64(pred_full, (double *)(v587), svreinterpret_f64_f32(v299)); - svst1_f64(pred_full, (double *)(v596), svreinterpret_f64_f32(v298)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v188), "w"(v241)); - svfloat32_t v345; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v190), "w"(v243)); - svfloat32_t v369; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v189), "w"(v242)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v187), "w"(v240)); - svst1_f64(pred_full, (double *)(v605), svreinterpret_f64_f32(v188)); - svst1_f64(pred_full, (double *)(v632), svreinterpret_f64_f32(v190)); - svst1_f64(pred_full, (double *)(v659), svreinterpret_f64_f32(v189)); - svst1_f64(pred_full, (double *)(v686), svreinterpret_f64_f32(v187)); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v321), "w"(v294)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v321), "w"(v294)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v345), "w"(v296)); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v345), "w"(v296)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v369), "w"(v295)); - svfloat32_t v371; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v371) : "w"(v369), "w"(v295)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v293)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v393), "w"(v293)); - svst1_f64(pred_full, (double *)(v614), svreinterpret_f64_f32(v323)); - svst1_f64(pred_full, (double *)(v623), svreinterpret_f64_f32(v322)); - svst1_f64(pred_full, (double *)(v641), svreinterpret_f64_f32(v347)); - svst1_f64(pred_full, (double *)(v650), svreinterpret_f64_f32(v346)); - svst1_f64(pred_full, (double *)(v668), svreinterpret_f64_f32(v371)); - svst1_f64(pred_full, (double *)(v677), svreinterpret_f64_f32(v370)); - svst1_f64(pred_full, (double *)(v695), svreinterpret_f64_f32(v395)); - svst1_f64(pred_full, (double *)(v704), svreinterpret_f64_f32(v394)); - v5 += v11; - v6 += v12; - } -} -#endif - #ifndef ARMRAL_ARCH_SVE void armral_fft_cf32_cf32_cf32_ac_n_gu16(const armral_cmplx_f32_t *restrict x, armral_cmplx_f32_t *restrict y, @@ -4566,192 +1251,109 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu16(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v565), v575)); svfloat32_t v576 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v574), v575)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v441), "w"(v450)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v441), "w"(v450)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v459), "w"(v468)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v459), "w"(v468)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v477), "w"(v486)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v477), "w"(v486)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v495), "w"(v504)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v495), "w"(v504)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v96) : "w"(v513), "w"(v522)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v513), "w"(v522)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v531), "w"(v540)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v531), "w"(v540)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v549), "w"(v558)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v549), "w"(v558)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v567), "w"(v576)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v567), "w"(v576)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v32), "w"(v48)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v32), "w"(v48)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v64), "w"(v80)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v64), "w"(v80)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v96), "w"(v112)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v96), "w"(v112)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v128), "w"(v144)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v128), "w"(v144)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v65), "w"(v81)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v65), "w"(v81)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v97), "w"(v145)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v97), "w"(v145)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v113), "w"(v129)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v113), "w"(v129)); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v441, v450); + 
svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v441, v450); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v459, v468); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v459, v468); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v477, v486); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v477, v486); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v495, v504); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v495, v504); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v513, v522); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v513, v522); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v531, v540); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v531, v540); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v549, v558); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v549, v558); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v567, v576); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v567, v576); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v128, v144); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v128, v144); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v113, v129); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v586, v49, 90); - svfloat32_t v154; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v146), "w"(v148)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v155) : "w"(v146), "w"(v148)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v150), "w"(v152)); - svfloat32_t v157; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v150), "w"(v152)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v153)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v153)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v164), "w"(v166)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v165), "w"(v167)); - svfloat32_t zero203; - asm volatile("mov %0.s, #0" : "=w"(zero203)); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v164, v166); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t zero203 = svdup_n_f32(0); svfloat32_t v203 = svcmla_f32_x(pred_full, zero203, v586, v149, 90); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, zero234, v587, v162, 90); - svfloat32_t zero260; - asm volatile("mov %0.s, #0" : "=w"(zero260)); + svfloat32_t zero260 = svdup_n_f32(0); svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v591, v166, 90); - svfloat32_t v270; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v165), "w"(v593)); - svfloat32_t v275; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v167), "w"(v594)); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v154), "w"(v156)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t zero191; - asm volatile("mov %0.s, #0" : 
"=w"(zero191)); + svfloat32_t v270 = svmul_f32_x(svptrue_b32(), v165, v593); + svfloat32_t v275 = svmul_f32_x(svptrue_b32(), v167, v594); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v154, v156); + svfloat32_t zero191 = svdup_n_f32(0); svfloat32_t v191 = svcmla_f32_x(pred_full, zero191, v586, v157, 90); - svfloat32_t zero210; - asm volatile("mov %0.s, #0" : "=w"(zero210)); + svfloat32_t zero210 = svdup_n_f32(0); svfloat32_t v210 = svcmla_f32_x(pred_full, zero210, v587, v160, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); + svfloat32_t zero246 = svdup_n_f32(0); svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v589, v168, 90); - svfloat32_t v265; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v169), "w"(v592)); + svfloat32_t v265 = svmul_f32_x(svptrue_b32(), v169, v592); svfloat32_t v286 = svmla_f32_x(pred_full, v33, v163, v588); svfloat32_t v287 = svmls_f32_x(pred_full, v33, v163, v588); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v227), "w"(v234)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v227), "w"(v234)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v155), "w"(v191)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v155), "w"(v191)); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v227, v234); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v227, v234); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v155, v191); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v155, v191); svfloat32_t v278 = svmla_f32_x(pred_full, v147, v161, v588); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v203), "w"(v210)); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v203, v210); svfloat32_t v280 = svmls_f32_x(pred_full, v147, v161, v588); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v210), "w"(v203)); + svfloat32_t v281 = 
svsub_f32_x(svptrue_b32(), v210, v203); svfloat32_t v290 = svcmla_f32_x(pred_full, v246, v590, v164, 90); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v246), "w"(v260)); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v246, v260); svfloat32_t v292 = svnmls_f32_x(pred_full, v265, v165, v593); svfloat32_t v293 = svnmls_f32_x(pred_full, v265, v167, v594); svfloat32_t v294 = svnmls_f32_x(pred_full, v270, v169, v592); svfloat32_t v295 = svnmls_f32_x(pred_full, v275, v169, v592); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v287), "w"(v289)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v287), "w"(v289)); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v287, v289); svst1_f64(pred_full, (double *)(v602), svreinterpret_f64_f32(v158)); svst1_f64(pred_full, (double *)(v674), svreinterpret_f64_f32(v159)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v278), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v280), "w"(v281)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v280), "w"(v281)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v278), "w"(v279)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v286), "w"(v292)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v286), "w"(v292)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v286), "w"(v294)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v286), "w"(v294)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v287), "w"(v295)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v287), "w"(v295)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v290), "w"(v288)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v290), "w"(v288)); - 
svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v291), "w"(v293)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v291), "w"(v293)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v291), "w"(v289)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v291), "w"(v289)); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v286, v294); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v286, v294); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v287, v295); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v287, v295); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v290, v288); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v290, v288); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v291, v289); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v291, v289); svst1_f64(pred_full, (double *)(v638), svreinterpret_f64_f32(v277)); svst1_f64(pred_full, (double *)(v710), svreinterpret_f64_f32(v276)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v296), "w"(v306)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v297), "w"(v307)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v298), "w"(v307)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v299), "w"(v306)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v300), "w"(v308)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v301), "w"(v309)); - 
svfloat32_t v318; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v302), "w"(v311)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v310)); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v296, v306); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v297, v307); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v298, v307); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v299, v306); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v300, v308); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v301, v309); + svfloat32_t v318 = svsub_f32_x(svptrue_b32(), v302, v311); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v310); svst1_f64(pred_full, (double *)(v620), svreinterpret_f64_f32(v285)); svst1_f64(pred_full, (double *)(v656), svreinterpret_f64_f32(v284)); svst1_f64(pred_full, (double *)(v692), svreinterpret_f64_f32(v283)); @@ -5352,167 +1954,91 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu17(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v767), v787)); svfloat32_t v778 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v776), v787)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v643), "w"(v652)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v643), "w"(v652)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v661), "w"(v670)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v661), "w"(v670)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v679), "w"(v688)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v679), "w"(v688)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v697), "w"(v706)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v697), "w"(v706)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v715), "w"(v724)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : 
"w"(v715), "w"(v724)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v733), "w"(v742)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v733), "w"(v742)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v751), "w"(v760)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v751), "w"(v760)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v769), "w"(v778)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v769), "w"(v778)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v32), "w"(v96)); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v48), "w"(v112)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v64), "w"(v128)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v80), "w"(v144)); - svfloat32_t v152; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v32), "w"(v96)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v48), "w"(v112)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v64), "w"(v128)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v80), "w"(v144)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v33), "w"(v65)); - svfloat32_t v167; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v49), "w"(v81)); - svfloat32_t v168; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v33), "w"(v65)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v145), "w"(v113)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v97), "w"(v129)); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v113), "w"(v145)); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v97), "w"(v129)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v49), "w"(v81)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v186) : "w"(v33), "w"(v97)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v81), "w"(v145)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v146), "w"(v148)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v147), "w"(v149)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v146), "w"(v148)); - svfloat32_t v157; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v147), "w"(v149)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v153), "w"(v155)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v154)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v154), "w"(v155)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v152), "w"(v153)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v166), "w"(v167)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v170), "w"(v171)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v166), "w"(v167)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v170), "w"(v171)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v168), "w"(v169)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v172), "w"(v173)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v168), "w"(v169)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v172), "w"(v173)); - svfloat32_t v223; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v154), "w"(v792)); - svfloat32_t zero390; - asm volatile("mov %0.s, #0" : "=w"(zero390)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v643, v652); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v643, v652); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v661, v670); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v661, v670); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v679, v688); + 
svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v679, v688); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v697, v706); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v697, v706); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v715, v724); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v715, v724); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v733, v742); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v733, v742); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v751, v760); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v751, v760); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v769, v778); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v769, v778); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v32, v96); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v48, v112); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v64, v128); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v80, v144); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v32, v96); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v48, v112); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v64, v128); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v80, v144); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v33, v65); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v33, v65); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v145, v113); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v97, v129); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v113, v145); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v97, v129); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v33, v97); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v81, v145); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t 
v160 = svadd_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v154); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v154, v155); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v152, v153); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v223 = svmul_f32_x(svptrue_b32(), v154, v792); + svfloat32_t zero390 = svdup_n_f32(0); svfloat32_t v390 = svcmla_f32_x(pred_full, zero390, v819, v187, 90); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v150), "w"(v151)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v150), "w"(v151)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v161), "w"(v160)); - svfloat32_t v165; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v156), "w"(v157)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v174), "w"(v175)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v177), "w"(v178)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v180), "w"(v181)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v183), "w"(v184)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v181), "w"(v175)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v174), "w"(v180)); - svfloat32_t v233; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v156), "w"(v794)); - svfloat32_t v238; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v157), "w"(v795)); - svfloat32_t v268; - asm("fmul %0.s, %1.s, %2.s" : 
"=w"(v268) : "w"(v163), "w"(v801)); - svfloat32_t v273; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v164), "w"(v802)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v33)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v191), "w"(v81)); - svfloat32_t v203; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v788), "w"(v158)); - svfloat32_t v263; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v162), "w"(v800)); - svfloat32_t zero299; - asm volatile("mov %0.s, #0" : "=w"(zero299)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v150, v151); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v150, v151); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v161, v160); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v156, v157); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v174, v175); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v177, v178); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v180, v181); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v183, v184); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v181, v175); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v233 = svmul_f32_x(svptrue_b32(), v156, v794); + svfloat32_t v238 = svmul_f32_x(svptrue_b32(), v157, v795); + svfloat32_t v268 = svmul_f32_x(svptrue_b32(), v163, v801); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v164, v802); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v33); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v191, v81); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v788, v158); + svfloat32_t v263 = svmul_f32_x(svptrue_b32(), v162, v800); + svfloat32_t zero299 = svdup_n_f32(0); svfloat32_t v299 = svcmla_f32_x(pred_full, zero299, v806, v176, 90); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v809, v179, 90); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : 
"=w"(zero341)); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v812, v182, 90); - svfloat32_t zero362; - asm volatile("mov %0.s, #0" : "=w"(zero362)); + svfloat32_t zero362 = svdup_n_f32(0); svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v815, v185, 90); svfloat32_t v428 = svmla_f32_x(pred_full, v268, v155, v793); svfloat32_t v429 = svnmls_f32_x(pred_full, v223, v163, v801); svfloat32_t v430 = svmla_f32_x(pred_full, v273, v153, v791); svfloat32_t v431 = svnmls_f32_x(pred_full, v273, v152, v790); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v189), "w"(v187)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v97)); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v189, v187); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v97); svfloat32_t v426 = svmla_f32_x(pred_full, v263, v160, v798); svfloat32_t v427 = svnmls_f32_x(pred_full, v263, v161, v799); svfloat32_t v432 = svnmls_f32_x(pred_full, v238, v165, v803); @@ -5527,160 +2053,91 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu17(const armral_cmplx_f32_t *restrict x, svfloat32_t v459 = svcmla_f32_x(pred_full, v362, v813, v183, 90); svfloat32_t v460 = svcmla_f32_x(pred_full, v362, v814, v184, 90); svst1_f64(pred_full, (double *)(v832), svreinterpret_f64_f32(v203)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v193), "w"(v145)); - svfloat32_t zero411; - asm volatile("mov %0.s, #0" : "=w"(zero411)); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v193, v145); + svfloat32_t zero411 = svdup_n_f32(0); svfloat32_t v411 = svcmla_f32_x(pred_full, zero411, v822, v190, 90); svfloat32_t v435 = svmla_f32_x(pred_full, v434, v159, v797); svfloat32_t v436 = svmls_f32_x(pred_full, v434, v159, v797); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v426), "w"(v428)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v427), "w"(v429)); - svfloat32_t v441; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v441) : "w"(v426), "w"(v430)); - svfloat32_t v443; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v427), "w"(v431)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v453), "w"(v455)); - svfloat32_t v465; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v453), "w"(v455)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v454), "w"(v456)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v454), "w"(v456)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v457), "w"(v459)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v459), "w"(v457)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v458), "w"(v460)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v460), "w"(v458)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v190), "w"(v194)); - svfloat32_t zero418; - asm volatile("mov %0.s, #0" : "=w"(zero418)); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v426, v430); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v427, v431); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v453, v455); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v453, v455); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v459, v457); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v458, v460); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v460, v458); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v190, v194); + svfloat32_t zero418 = svdup_n_f32(0); svfloat32_t v418 = svcmla_f32_x(pred_full, zero418, v823, v194, 90); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v432), "w"(v435)); - 
svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v433), "w"(v436)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v435), "w"(v432)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v436), "w"(v433)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v466), "w"(v470)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v465), "w"(v471)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v464), "w"(v468)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v471), "w"(v465)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v464), "w"(v468)); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v469), "w"(v467)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v470), "w"(v466)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v467), "w"(v469)); - svfloat32_t v445; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v437), "w"(v438)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v439), "w"(v440)); - svfloat32_t v447; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v441), "w"(v442)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v443), "w"(v444)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v438), "w"(v437)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v440), "w"(v439)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v442), "w"(v441)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v444), "w"(v443)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v411), "w"(v418)); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v432, v435); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v433, v436); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v435, v432); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v436, 
v433); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v466, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v465, v471); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v464, v468); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v471, v465); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v464, v468); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v469, v467); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v470, v466); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v467, v469); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v437, v438); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v439, v440); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v441, v442); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v443, v444); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v438, v437); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v440, v439); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v442, v441); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v444, v443); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v411, v418); svfloat32_t v461 = svcmla_f32_x(pred_full, v418, v824, v195, 90); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v472), "w"(v472)); - svfloat32_t v499; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v472)); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v472, v472); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v498, v472); svfloat32_t v462 = svcmla_f32_x(pred_full, v461, v816, v186, 90); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v390), "w"(v474)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v461), "w"(v461)); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v474)); - svfloat32_t v539; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v452), "w"(v499)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v452), "w"(v499)); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v390, v474); + svfloat32_t v478 = 
svadd_f32_x(svptrue_b32(), v461, v461); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v495, v474); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v452, v499); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v452, v499); svfloat32_t v463 = svcmla_f32_x(pred_full, v462, v817, v33, 90); svfloat32_t v473 = svcmla_f32_x(pred_full, v462, v818, v97, 90); svfloat32_t v476 = svcmla_f32_x(pred_full, v475, v820, v81, 90); svfloat32_t v477 = svcmla_f32_x(pred_full, v475, v821, v145, 90); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v478)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v472), "w"(v478)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v485), "w"(v478)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v496), "w"(v478)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v478); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v472, v478); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v485, v478); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v478); svst1_f64(pred_full, (double *)(v877), svreinterpret_f64_f32(v539)); svst1_f64(pred_full, (double *)(v886), svreinterpret_f64_f32(v547)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v481), "w"(v473)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v483), "w"(v476)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v480)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v463)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v492), "w"(v477)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v447), "w"(v486)); - svfloat32_t v531; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v447), "w"(v486)); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v451), "w"(v497)); - svfloat32_t v627; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v451), 
"w"(v497)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v490), "w"(v472)); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v493), "w"(v479)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v445), "w"(v482)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v445), "w"(v482)); - svfloat32_t v571; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v448), "w"(v488)); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v448), "w"(v488)); - svfloat32_t v587; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v446), "w"(v484)); - svfloat32_t v595; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v446), "w"(v484)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v473); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v483, v476); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v487, v480); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v463); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v492, v477); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v447, v486); + svfloat32_t v531 = svsub_f32_x(svptrue_b32(), v447, v486); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v451, v497); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v451, v497); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v490, v472); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v493, v479); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v445, v482); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v445, v482); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v448, v488); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v448, v488); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v446, v484); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v446, v484); svst1_f64(pred_full, (double *)(v859), svreinterpret_f64_f32(v523)); svst1_f64(pred_full, (double *)(v868), svreinterpret_f64_f32(v531)); svst1_f64(pred_full, (double *)(v967), svreinterpret_f64_f32(v619)); svst1_f64(pred_full, (double *)(v976), 
svreinterpret_f64_f32(v627)); - svfloat32_t v555; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v449), "w"(v491)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v449), "w"(v491)); - svfloat32_t v603; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v450), "w"(v494)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v450), "w"(v494)); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v449, v491); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v449, v491); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v450, v494); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v450, v494); svst1_f64(pred_full, (double *)(v841), svreinterpret_f64_f32(v507)); svst1_f64(pred_full, (double *)(v850), svreinterpret_f64_f32(v515)); svst1_f64(pred_full, (double *)(v913), svreinterpret_f64_f32(v571)); @@ -6057,209 +2514,117 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu18(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v659), v669)); svfloat32_t v670 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v668), v669)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v517), "w"(v526)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v517), "w"(v526)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v535), "w"(v544)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v535), "w"(v544)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v553), "w"(v562)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v553), "w"(v562)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v571), "w"(v580)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v571), "w"(v580)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v589), "w"(v598)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v589), "w"(v598)); - svfloat32_t v112; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v607), "w"(v616)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v607), "w"(v616)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v625), "w"(v634)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v625), "w"(v634)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v643), "w"(v652)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v643), "w"(v652)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v661), "w"(v670)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v661), "w"(v670)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v48), "w"(v160)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v48), "w"(v160)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v144), "w"(v64)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v144), "w"(v64)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v80), "w"(v128)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v80), "w"(v128)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v96), "w"(v112)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v96), "w"(v112)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v49), "w"(v161)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v49), "w"(v161)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v145), "w"(v65)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v145), "w"(v65)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v81), "w"(v129)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v81), "w"(v129)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v97), "w"(v113)); - 
svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v97), "w"(v113)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v162), "w"(v164)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v163), "w"(v165)); - svfloat32_t v176; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v162), "w"(v164)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v164), "w"(v168)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v168), "w"(v162)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v163), "w"(v165)); - svfloat32_t v180; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v165), "w"(v169)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v169), "w"(v163)); - svfloat32_t zero210; - asm volatile("mov %0.s, #0" : "=w"(zero210)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v517, v526); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v517, v526); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v535, v544); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v535, v544); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v553, v562); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v553, v562); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v571, v580); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v571, v580); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v589, v598); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v589, v598); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v607, v616); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v607, v616); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v625, v634); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v625, v634); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v643, v652); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v643, v652); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v661, v670); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v661, v670); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), 
v48, v160); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v48, v160); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v144, v64); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v144, v64); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v49, v161); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v49, v161); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v145, v65); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v145, v65); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v97, v113); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v97, v113); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v164, v168); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v168, v162); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v165, v169); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v169, v163); + svfloat32_t zero210 = svdup_n_f32(0); svfloat32_t v210 = svcmla_f32_x(pred_full, zero210, v686, v167, 90); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v272), "w"(v274)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v273), "w"(v275)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v272), "w"(v274)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v274), "w"(v278)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v278), "w"(v272)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v289) : "w"(v273), "w"(v275)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v275), "w"(v279)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v279), "w"(v273)); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v274, v278); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v278, v272); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v275, v279); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v279, v273); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v686, v277, 90); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v170), "w"(v168)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v174), "w"(v169)); - svfloat32_t zero232; - asm volatile("mov %0.s, #0" : "=w"(zero232)); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v170, v168); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v174, v169); + svfloat32_t zero232 = svdup_n_f32(0); svfloat32_t v232 = svcmla_f32_x(pred_full, zero232, v690, v179, 90); - svfloat32_t zero239; - asm volatile("mov %0.s, #0" : "=w"(zero239)); + svfloat32_t zero239 = svdup_n_f32(0); svfloat32_t v239 = svcmla_f32_x(pred_full, zero239, v691, v180, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); + svfloat32_t zero246 = svdup_n_f32(0); svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v692, v181, 90); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v278)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v279)); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); + svfloat32_t v281 = 
svadd_f32_x(svptrue_b32(), v280, v278); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v279); + svfloat32_t zero342 = svdup_n_f32(0); svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v690, v289, 90); - svfloat32_t zero349; - asm volatile("mov %0.s, #0" : "=w"(zero349)); + svfloat32_t zero349 = svdup_n_f32(0); svfloat32_t v349 = svcmla_f32_x(pred_full, zero349, v691, v290, 90); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v692, v291, 90); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v171), "w"(v166)); - svfloat32_t v191; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v171), "w"(v683)); - svfloat32_t zero198; - asm volatile("mov %0.s, #0" : "=w"(zero198)); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v171, v166); + svfloat32_t v191 = svmul_f32_x(svptrue_b32(), v171, v683); + svfloat32_t zero198 = svdup_n_f32(0); svfloat32_t v198 = svcmla_f32_x(pred_full, zero198, v686, v175, 90); - svfloat32_t v260; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v210), "w"(v232)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v210), "w"(v239)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v210), "w"(v232)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v281), "w"(v276)); - svfloat32_t v301; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v281), "w"(v683)); - svfloat32_t zero308; - asm volatile("mov %0.s, #0" : "=w"(zero308)); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v210, v232); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v210, v239); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v210, v232); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v281, v276); + svfloat32_t v301 = svmul_f32_x(svptrue_b32(), v281, v683); + svfloat32_t zero308 = svdup_n_f32(0); svfloat32_t v308 = svcmla_f32_x(pred_full, zero308, v686, v285, 90); - svfloat32_t v370; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v320), "w"(v342)); - svfloat32_t v372; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v320), "w"(v349)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v320), "w"(v342)); - svfloat32_t v173; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v172), "w"(v32)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v191), "w"(v191)); - svfloat32_t v261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v260), "w"(v239)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v262), "w"(v246)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v264), "w"(v246)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v33)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v301), "w"(v301)); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v349)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v372), "w"(v356)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v374), "w"(v356)); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v320, v342); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v320, v349); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v320, v342); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v172, v32); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v191, v191); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v260, v239); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v262, v246); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v264, v246); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v33); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v301, v301); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v349); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v372, v356); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v374, v356); svfloat32_t v248 = svmla_f32_x(pred_full, v247, v171, v683); svfloat32_t v252 = 
svmla_f32_x(pred_full, v173, v166, v685); svfloat32_t v358 = svmla_f32_x(pred_full, v357, v281, v683); svfloat32_t v362 = svmla_f32_x(pred_full, v283, v276, v685); svst1_f64(pred_full, (double *)(v700), svreinterpret_f64_f32(v173)); svst1_f64(pred_full, (double *)(v709), svreinterpret_f64_f32(v283)); - svfloat32_t v249; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v173), "w"(v248)); - svfloat32_t v253; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v252), "w"(v247)); - svfloat32_t v359; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v283), "w"(v358)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v362), "w"(v357)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v249), "w"(v198)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v249), "w"(v198)); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v173, v248); + svfloat32_t v253 = svadd_f32_x(svptrue_b32(), v252, v247); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v283, v358); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v362, v357); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v249, v198); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v249, v198); svfloat32_t v254 = svmla_f32_x(pred_full, v253, v176, v687); svfloat32_t v256 = svmls_f32_x(pred_full, v253, v177, v688); svfloat32_t v258 = svmls_f32_x(pred_full, v253, v176, v687); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v359), "w"(v308)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v359), "w"(v308)); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v359, v308); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v359, v308); svfloat32_t v364 = svmla_f32_x(pred_full, v363, v286, v687); svfloat32_t v366 = svmls_f32_x(pred_full, v363, v287, v688); svfloat32_t v368 = svmls_f32_x(pred_full, v363, v286, v687); @@ -6273,30 +2638,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu18(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double 
*)(v763), svreinterpret_f64_f32(v361)); svst1_f64(pred_full, (double *)(v808), svreinterpret_f64_f32(v250)); svst1_f64(pred_full, (double *)(v817), svreinterpret_f64_f32(v360)); - svfloat32_t v266; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v255), "w"(v261)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v255), "w"(v261)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v257), "w"(v263)); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v257), "w"(v263)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v259), "w"(v265)); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v259), "w"(v265)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v365), "w"(v371)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v365), "w"(v371)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v367), "w"(v373)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v367), "w"(v373)); - svfloat32_t v380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v369), "w"(v375)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v369), "w"(v375)); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v365, v371); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v365, v371); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v367, v373); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v367, v373); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v369, v375); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v369, v375); svst1_f64(pred_full, (double 
*)(v718), svreinterpret_f64_f32(v267)); svst1_f64(pred_full, (double *)(v727), svreinterpret_f64_f32(v377)); svst1_f64(pred_full, (double *)(v736), svreinterpret_f64_f32(v268)); @@ -6947,403 +3300,227 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu19(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v853), v873)); svfloat32_t v864 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v862), v873)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v711), "w"(v720)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v711), "w"(v720)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v729), "w"(v738)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v738), "w"(v729)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v747), "w"(v756)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v747), "w"(v756)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v765), "w"(v774)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v774), "w"(v765)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v783), "w"(v792)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v783), "w"(v792)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v801), "w"(v810)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v810), "w"(v801)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v819), "w"(v828)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v819), "w"(v828)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v837), "w"(v846)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v846), "w"(v837)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v855), "w"(v864)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v161) : "w"(v855), "w"(v864)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v32), "w"(v128)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v48), "w"(v144)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v64), "w"(v160)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v80), "w"(v128)); - svfloat32_t v166; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v96), "w"(v144)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v112), "w"(v160)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v32), "w"(v80)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v48), "w"(v96)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v64), "w"(v112)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v33), "w"(v129)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v49), "w"(v145)); - svfloat32_t v204; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v65), "w"(v161)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v81), "w"(v129)); - svfloat32_t v206; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v97), "w"(v145)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v113), "w"(v161)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v33), "w"(v81)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v49), "w"(v97)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v65), "w"(v113)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v168), "w"(v128)); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v170), "w"(v144)); - svfloat32_t v173; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v172), "w"(v160)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v162), "w"(v164)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v175) : "w"(v165), "w"(v167)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v162), "w"(v165)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v164), "w"(v167)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v129)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v210), "w"(v145)); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v212), "w"(v161)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v202), "w"(v204)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v205), "w"(v207)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v202), "w"(v205)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v204), "w"(v207)); - svfloat32_t zero389; - asm volatile("mov %0.s, #0" : "=w"(zero389)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v711, v720); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v711, v720); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v729, v738); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v738, v729); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v747, v756); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v747, v756); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v765, v774); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v774, v765); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v783, v792); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v783, v792); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v801, v810); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v810, v801); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v819, v828); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v819, v828); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v837, v846); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v846, v837); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v855, v864); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v855, v864); + 
svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v48, v144); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v64, v160); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v112, v160); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v32, v80); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v48, v96); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v49, v145); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v113, v161); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v33, v81); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v49, v97); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v128); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v170, v144); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v172, v160); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v162, v165); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v164, v167); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v129); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v210, v145); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v212, v161); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v202, v204); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v205, v207); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v202, v205); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v204, v207); + svfloat32_t zero389 = svdup_n_f32(0); svfloat32_t v389 = 
svcmla_f32_x(pred_full, zero389, v902, v205, 90); - svfloat32_t zero410; - asm volatile("mov %0.s, #0" : "=w"(zero410)); + svfloat32_t zero410 = svdup_n_f32(0); svfloat32_t v410 = svcmla_f32_x(pred_full, zero410, v905, v207, 90); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v169), "w"(v171)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v175), "w"(v166)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v174), "w"(v163)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v175), "w"(v166)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v174), "w"(v163)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v162), "w"(v193)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v192), "w"(v167)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v169), "w"(v173)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v171), "w"(v173)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v209), "w"(v211)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v215), "w"(v206)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v214), "w"(v203)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v215), "w"(v206)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v214), "w"(v203)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v202), "w"(v225)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v224), "w"(v207)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v209), "w"(v213)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v211), "w"(v213)); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v176), "w"(v173)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v187), "w"(v186)); - 
svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v189)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v166)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v196), "w"(v163)); - svfloat32_t v201; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v199), "w"(v200)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v216), "w"(v213)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v219), "w"(v218)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v222), "w"(v221)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v226), "w"(v206)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v228), "w"(v203)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v231), "w"(v232)); - svfloat32_t v253; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v187), "w"(v878)); - svfloat32_t v268; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v190), "w"(v881)); - svfloat32_t zero347; - asm volatile("mov %0.s, #0" : "=w"(zero347)); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v169, v171); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v175, v166); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v174, v163); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v175, v166); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v174, v163); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v162, v193); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v192, v167); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v169, v173); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v171, v173); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v209, v211); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v215, v206); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v214, v203); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v215, v206); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v214, v203); + svfloat32_t v226 = 
svsub_f32_x(svptrue_b32(), v202, v225); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v224, v207); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v209, v213); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v211, v213); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v176, v173); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v187, v186); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v190, v189); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v194, v166); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v196, v163); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v216, v213); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v219, v218); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v222, v221); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v226, v206); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v228, v203); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v231, v232); + svfloat32_t v253 = svmul_f32_x(svptrue_b32(), v187, v878); + svfloat32_t v268 = svmul_f32_x(svptrue_b32(), v190, v881); + svfloat32_t zero347 = svdup_n_f32(0); svfloat32_t v347 = svcmla_f32_x(pred_full, zero347, v896, v218, 90); - svfloat32_t zero368; - asm volatile("mov %0.s, #0" : "=w"(zero368)); + svfloat32_t zero368 = svdup_n_f32(0); svfloat32_t v368 = svcmla_f32_x(pred_full, zero368, v899, v221, 90); - svfloat32_t zero452; - asm volatile("mov %0.s, #0" : "=w"(zero452)); + svfloat32_t zero452 = svdup_n_f32(0); svfloat32_t v452 = svcmla_f32_x(pred_full, zero452, v911, v231, 90); - svfloat32_t zero459; - asm volatile("mov %0.s, #0" : "=w"(zero459)); + svfloat32_t zero459 = svdup_n_f32(0); svfloat32_t v459 = svcmla_f32_x(pred_full, zero459, v912, v232, 90); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v874), "w"(v177)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v195), "w"(v197)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v227), "w"(v229)); - 
svfloat32_t v258; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v188), "w"(v879)); - svfloat32_t v273; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v191), "w"(v882)); - svfloat32_t v333; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v201), "w"(v894)); - svfloat32_t zero340; - asm volatile("mov %0.s, #0" : "=w"(zero340)); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v874, v177); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v227, v229); + svfloat32_t v258 = svmul_f32_x(svptrue_b32(), v188, v879); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v191, v882); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v201, v894); + svfloat32_t zero340 = svdup_n_f32(0); svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v895, v217, 90); - svfloat32_t zero466; - asm volatile("mov %0.s, #0" : "=w"(zero466)); + svfloat32_t zero466 = svdup_n_f32(0); svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v913, v233, 90); svfloat32_t v467 = svmla_f32_x(pred_full, v253, v186, v877); svfloat32_t v468 = svmla_f32_x(pred_full, v268, v189, v880); svfloat32_t v498 = svcmla_f32_x(pred_full, v347, v897, v219, 90); svfloat32_t v499 = svcmla_f32_x(pred_full, v368, v900, v222, 90); - svfloat32_t v318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v198), "w"(v891)); - svfloat32_t zero445; - asm volatile("mov %0.s, #0" : "=w"(zero445)); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v198, v891); + svfloat32_t zero445 = svdup_n_f32(0); svfloat32_t v445 = svcmla_f32_x(pred_full, zero445, v910, v230, 90); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v467), "w"(v468)); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v467, v468); svfloat32_t v471 = svmla_f32_x(pred_full, v258, v186, v877); svfloat32_t v472 = svmla_f32_x(pred_full, v273, v189, v880); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v467), "w"(v468)); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v467, v468); 
svfloat32_t v491 = svnmls_f32_x(pred_full, v333, v199, v892); svfloat32_t v492 = svnmls_f32_x(pred_full, v333, v200, v893); svfloat32_t v493 = svmla_f32_x(pred_full, v185, v177, v876); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v498), "w"(v499)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v498, v499); svfloat32_t v502 = svcmla_f32_x(pred_full, v347, v898, v220, 90); svfloat32_t v503 = svcmla_f32_x(pred_full, v368, v901, v223, 90); - svfloat32_t v520; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v498), "w"(v499)); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v452), "w"(v466)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v459), "w"(v466)); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v498, v499); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v452, v466); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v459, v466); svst1_f64(pred_full, (double *)(v921), svreinterpret_f64_f32(v185)); svfloat32_t v469 = svmla_f32_x(pred_full, v318, v197, v890); svfloat32_t v473 = svmla_f32_x(pred_full, v318, v195, v889); svfloat32_t v474 = svnmls_f32_x(pred_full, v470, v165, v883); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v471), "w"(v472)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v471), "w"(v472)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v471, v472); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v471, v472); svfloat32_t v486 = svmla_f32_x(pred_full, v470, v164, v888); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v493), "w"(v491)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v493), "w"(v491)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v493), "w"(v492)); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v493, v491); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v493, v491); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v493, v492); svfloat32_t v500 = 
svcmla_f32_x(pred_full, v445, v909, v229, 90); svfloat32_t v504 = svcmla_f32_x(pred_full, v445, v908, v227, 90); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v389), "w"(v501)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v502), "w"(v503)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v502), "w"(v503)); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v389, v501); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v502, v503); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v502, v503); svfloat32_t v517 = svcmla_f32_x(pred_full, v501, v907, v204, 90); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v340), "w"(v522)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v340), "w"(v522)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v340), "w"(v523)); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v340, v522); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v340, v522); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v340, v523); svfloat32_t v476 = svnmls_f32_x(pred_full, v473, v167, v886); svfloat32_t v477 = svmla_f32_x(pred_full, v469, v192, v884); svfloat32_t v479 = svmla_f32_x(pred_full, v475, v193, v887); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v481), "w"(v469)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v474), "w"(v475)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v473)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v492)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v410), "w"(v504)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v469); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v474, v475); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v473); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v495, v492); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v410, 
v504); svfloat32_t v508 = svcmla_f32_x(pred_full, v500, v903, v224, 90); svfloat32_t v510 = svcmla_f32_x(pred_full, v506, v906, v225, 90); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v500)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v505), "w"(v506)); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v504)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v525), "w"(v523)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v477), "w"(v474)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v479), "w"(v476)); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v500); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v505, v506); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v504); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v525, v523); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v477, v474); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v479, v476); svfloat32_t v484 = svmla_f32_x(pred_full, v483, v162, v885); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v486), "w"(v476)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v508), "w"(v505)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v510), "w"(v507)); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v486, v476); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v505); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v510, v507); svfloat32_t v515 = svcmla_f32_x(pred_full, v514, v904, v202, 90); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v517), "w"(v507)); - svfloat32_t v532; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v532) : "w"(v490), "w"(v482)); - svfloat32_t v536; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v497), "w"(v490)); - svfloat32_t v539; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v482), "w"(v497)); - svfloat32_t v544; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v544) : "w"(v521), "w"(v513)); - svfloat32_t v548; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v521), "w"(v527)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v513), "w"(v527)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v484), "w"(v473)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v469)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v515), "w"(v504)); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v518), "w"(v500)); - svfloat32_t v533; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v533) : "w"(v532), "w"(v497)); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v478), "w"(v494)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v480), "w"(v496)); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v544), "w"(v527)); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v509), "w"(v524)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v511), "w"(v526)); - svfloat32_t v575; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v539), "w"(v551)); - svfloat32_t v583; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v539), "w"(v551)); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v536), "w"(v548)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v536), "w"(v548)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v485), "w"(v478)); - svfloat32_t v530; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v488), "w"(v480)); - svfloat32_t v534; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v534) : "w"(v494), "w"(v485)); - svfloat32_t v535; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v496), "w"(v488)); - svfloat32_t v540; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v516), "w"(v509)); - svfloat32_t v542; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v542) : "w"(v519), "w"(v511)); - svfloat32_t 
v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v524), "w"(v516)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v526), "w"(v519)); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v538), "w"(v550)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v538), "w"(v550)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v533), "w"(v545)); - svfloat32_t v631; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v533), "w"(v545)); - svfloat32_t v671; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v537), "w"(v549)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v537), "w"(v549)); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v517, v507); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v490, v482); + svfloat32_t v536 = svsub_f32_x(svptrue_b32(), v497, v490); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v482, v497); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v521, v513); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v521, v527); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v513, v527); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v484, v473); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v487, v469); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v515, v504); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v518, v500); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v532, v497); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v478, v494); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v480, v496); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v544, v527); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v509, v524); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v511, v526); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v539, v551); + svfloat32_t v583 = svadd_f32_x(svptrue_b32(), v539, v551); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v536, v548); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v536, v548); + svfloat32_t v528 = 
svsub_f32_x(svptrue_b32(), v485, v478); + svfloat32_t v530 = svsub_f32_x(svptrue_b32(), v488, v480); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v494, v485); + svfloat32_t v535 = svsub_f32_x(svptrue_b32(), v496, v488); + svfloat32_t v540 = svsub_f32_x(svptrue_b32(), v516, v509); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v519, v511); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v524, v516); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v526, v519); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v538, v550); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v538, v550); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v533, v545); + svfloat32_t v631 = svsub_f32_x(svptrue_b32(), v533, v545); + svfloat32_t v671 = svsub_f32_x(svptrue_b32(), v537, v549); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v537, v549); svst1_f64(pred_full, (double *)(v948), svreinterpret_f64_f32(v575)); svst1_f64(pred_full, (double *)(v957), svreinterpret_f64_f32(v583)); svst1_f64(pred_full, (double *)(v966), svreinterpret_f64_f32(v591)); svst1_f64(pred_full, (double *)(v975), svreinterpret_f64_f32(v599)); - svfloat32_t v529; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v528), "w"(v494)); - svfloat32_t v531; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v530), "w"(v496)); - svfloat32_t v541; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v541) : "w"(v540), "w"(v524)); - svfloat32_t v543; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v543) : "w"(v542), "w"(v526)); - svfloat32_t v639; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v535), "w"(v547)); - svfloat32_t v647; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v647) : "w"(v535), "w"(v547)); - svfloat32_t v655; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v655) : "w"(v534), "w"(v546)); - svfloat32_t v663; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v534), "w"(v546)); + svfloat32_t v529 = svadd_f32_x(svptrue_b32(), v528, v494); + svfloat32_t v531 = svadd_f32_x(svptrue_b32(), v530, v496); + svfloat32_t v541 = svadd_f32_x(svptrue_b32(), v540, v524); + 
svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v542, v526); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v647 = svsub_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v655 = svadd_f32_x(svptrue_b32(), v534, v546); + svfloat32_t v663 = svsub_f32_x(svptrue_b32(), v534, v546); svst1_f64(pred_full, (double *)(v984), svreinterpret_f64_f32(v607)); svst1_f64(pred_full, (double *)(v993), svreinterpret_f64_f32(v615)); svst1_f64(pred_full, (double *)(v1002), svreinterpret_f64_f32(v623)); svst1_f64(pred_full, (double *)(v1011), svreinterpret_f64_f32(v631)); svst1_f64(pred_full, (double *)(v1056), svreinterpret_f64_f32(v671)); svst1_f64(pred_full, (double *)(v1065), svreinterpret_f64_f32(v679)); - svfloat32_t v559; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v529), "w"(v541)); - svfloat32_t v567; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v529), "w"(v541)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v531), "w"(v543)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v531), "w"(v543)); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v529, v541); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v529, v541); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v531, v543); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v531, v543); svst1_f64(pred_full, (double *)(v1020), svreinterpret_f64_f32(v639)); svst1_f64(pred_full, (double *)(v1029), svreinterpret_f64_f32(v647)); svst1_f64(pred_full, (double *)(v1038), svreinterpret_f64_f32(v655)); @@ -7743,239 +3920,136 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu20(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v710), v729)); svfloat32_t v730 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v728), v729)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v559), "w"(v568)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v559), "w"(v568)); - svfloat32_t 
v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v577), "w"(v586)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v577), "w"(v586)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v595), "w"(v604)); - svfloat32_t v67; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v595), "w"(v604)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v613), "w"(v622)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v613), "w"(v622)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v631), "w"(v640)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v631), "w"(v640)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v649), "w"(v658)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v649), "w"(v658)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v667), "w"(v676)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v667), "w"(v676)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v685), "w"(v694)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v685), "w"(v694)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v703), "w"(v712)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v703), "w"(v712)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v721), "w"(v730)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v721), "w"(v730)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v66), "w"(v82)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v66), "w"(v82)); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v100), "w"(v116)); - svfloat32_t 
v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v100), "w"(v116)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v134), "w"(v150)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v134), "w"(v150)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v168), "w"(v184)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v168), "w"(v184)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v67), "w"(v169)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v67), "w"(v169)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v135), "w"(v101)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v135), "w"(v101)); - svfloat32_t v347; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v83), "w"(v185)); - svfloat32_t v348; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v83), "w"(v185)); - svfloat32_t v349; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v151), "w"(v117)); - svfloat32_t v350; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v151), "w"(v117)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v84), "w"(v186)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v84), "w"(v186)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v152), "w"(v118)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v152), "w"(v118)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v85), "w"(v187)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v85), "w"(v187)); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v153), "w"(v119)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v153), "w"(v119)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v294), "w"(v296)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v294), 
"w"(v296)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v295), "w"(v297)); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v559, v568); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v559, v568); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v577, v586); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v577, v586); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v595, v604); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v595, v604); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v613, v622); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v613, v622); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v631, v640); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v631, v640); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v649, v658); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v649, v658); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v667, v676); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v667, v676); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v685, v694); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v685, v694); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v703, v712); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v703, v712); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v721, v730); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v721, v730); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v294 
= svadd_f32_x(svptrue_b32(), v67, v169); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v67, v169); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v135, v101); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v135, v101); + svfloat32_t v347 = svadd_f32_x(svptrue_b32(), v83, v185); + svfloat32_t v348 = svsub_f32_x(svptrue_b32(), v83, v185); + svfloat32_t v349 = svadd_f32_x(svptrue_b32(), v151, v117); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v151, v117); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v84, v186); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v84, v186); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v152, v118); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v152, v118); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v85, v187); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v85, v187); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v153, v119); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v153, v119); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v295, v297); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v746, v295, 90); - svfloat32_t v351; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v347), "w"(v349)); - svfloat32_t v352; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v347), "w"(v349)); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v348), "w"(v350)); - svfloat32_t v390; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v350), "w"(v754)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v188), "w"(v190)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v188), "w"(v190)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v189), "w"(v191)); - svfloat32_t zero217; - asm volatile("mov %0.s, #0" : "=w"(zero217)); + svfloat32_t v351 = svadd_f32_x(svptrue_b32(), v347, v349); + 
svfloat32_t v352 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v390 = svmul_f32_x(svptrue_b32(), v350, v754); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v189, v191); + svfloat32_t zero217 = svdup_n_f32(0); svfloat32_t v217 = svcmla_f32_x(pred_full, zero217, v746, v189, 90); - svfloat32_t v245; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v241), "w"(v243)); - svfloat32_t v246; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v241), "w"(v243)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v242), "w"(v244)); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v241, v243); + svfloat32_t v246 = svsub_f32_x(svptrue_b32(), v241, v243); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v242, v244); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v746, v242, 90); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v298), "w"(v33)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v298, v33); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v747, v300, 90); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v351), "w"(v49)); - svfloat32_t zero375; - asm volatile("mov %0.s, #0" : "=w"(zero375)); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v351, v49); + svfloat32_t zero375 = svdup_n_f32(0); svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v751, v352, 90); - svfloat32_t v385; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v353), "w"(v753)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v192), "w"(v50)); - svfloat32_t zero224; - asm volatile("mov %0.s, #0" : "=w"(zero224)); 
+ svfloat32_t v385 = svmul_f32_x(svptrue_b32(), v353, v753); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v192, v50); + svfloat32_t zero224 = svdup_n_f32(0); svfloat32_t v224 = svcmla_f32_x(pred_full, zero224, v747, v194, 90); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v245), "w"(v51)); - svfloat32_t zero277; - asm volatile("mov %0.s, #0" : "=w"(zero277)); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v245, v51); + svfloat32_t zero277 = svdup_n_f32(0); svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v747, v247, 90); svfloat32_t v338 = svmla_f32_x(pred_full, v301, v298, v744); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v323), "w"(v330)); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v323, v330); svfloat32_t v342 = svcmla_f32_x(pred_full, v330, v748, v297, 90); - svfloat32_t zero361; - asm volatile("mov %0.s, #0" : "=w"(zero361)); + svfloat32_t zero361 = svdup_n_f32(0); svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v749, v354, 90); svfloat32_t v394 = svnmls_f32_x(pred_full, v385, v348, v752); svfloat32_t v395 = svmla_f32_x(pred_full, v390, v353, v753); svfloat32_t v232 = svmla_f32_x(pred_full, v195, v192, v744); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v217), "w"(v224)); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v217, v224); svfloat32_t v236 = svcmla_f32_x(pred_full, v224, v748, v191, 90); svfloat32_t v285 = svmla_f32_x(pred_full, v248, v245, v744); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v270), "w"(v277)); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v270, v277); svfloat32_t v289 = svcmla_f32_x(pred_full, v277, v748, v244, 90); svfloat32_t v339 = svmla_f32_x(pred_full, v338, v299, v745); svfloat32_t v340 = svmls_f32_x(pred_full, v338, v299, v745); svfloat32_t v391 = svcmla_f32_x(pred_full, v361, v750, v351, 90); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v301), "w"(v361)); - svfloat32_t v401; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v401) : "w"(v301), "w"(v361)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v301, v361); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v301, v361); svst1_f64(pred_full, (double *)(v762), svreinterpret_f64_f32(v195)); svst1_f64(pred_full, (double *)(v780), svreinterpret_f64_f32(v248)); svfloat32_t v233 = svmla_f32_x(pred_full, v232, v193, v745); svfloat32_t v234 = svmls_f32_x(pred_full, v232, v193, v745); svfloat32_t v286 = svmla_f32_x(pred_full, v285, v246, v745); svfloat32_t v287 = svmls_f32_x(pred_full, v285, v246, v745); - svfloat32_t v343; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v339), "w"(v341)); - svfloat32_t v344; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v339), "w"(v341)); - svfloat32_t v345; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v340), "w"(v342)); - svfloat32_t v346; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v340), "w"(v342)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v375)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v391), "w"(v375)); + svfloat32_t v343 = svadd_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v344 = svsub_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v346 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v391, v375); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v391, v375); svst1_f64(pred_full, (double *)(v771), svreinterpret_f64_f32(v401)); svst1_f64(pred_full, (double *)(v789), svreinterpret_f64_f32(v400)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v233), "w"(v235)); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v233), "w"(v235)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v234), "w"(v236)); - svfloat32_t v240; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v234), "w"(v236)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v290) : "w"(v286), "w"(v288)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v286), "w"(v288)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v287), "w"(v289)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v287), "w"(v289)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v392), "w"(v394)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v392), "w"(v394)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v393), "w"(v395)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v393), "w"(v395)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v344), "w"(v397)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v344), "w"(v397)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v346), "w"(v399)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v346), "w"(v399)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v345), "w"(v398)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v345), "w"(v398)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v343), "w"(v396)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v343), "w"(v396)); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v392, v394); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v392, v394); + svfloat32_t v398 = 
svadd_f32_x(svptrue_b32(), v393, v395); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v393, v395); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v344, v397); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v344, v397); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v346, v399); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v346, v399); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v345, v398); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v345, v398); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v343, v396); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v343, v396); svst1_f64(pred_full, (double *)(v798), svreinterpret_f64_f32(v238)); svst1_f64(pred_full, (double *)(v816), svreinterpret_f64_f32(v291)); svst1_f64(pred_full, (double *)(v834), svreinterpret_f64_f32(v240)); @@ -8528,215 +4602,119 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu21(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v801), v811)); svfloat32_t v812 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v810), v811)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v631), "w"(v640)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v631), "w"(v640)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v659), "w"(v668)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v659), "w"(v668)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v686), "w"(v695)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v686), "w"(v695)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v713), "w"(v722)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v713), "w"(v722)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v740), "w"(v749)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v740), "w"(v749)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v152) : "w"(v767), "w"(v776)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v767), "w"(v776)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v794), "w"(v803)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v794), "w"(v803)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v650)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v677)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v704)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v731)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v758)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v785)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v176), "w"(v812)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v56), "w"(v176)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v56), "w"(v176)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v128), "w"(v104)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v128), "w"(v104)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v80), "w"(v152)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v80), "w"(v152)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v57), "w"(v177)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v57), "w"(v177)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v129), "w"(v105)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v129), "w"(v105)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v81), "w"(v153)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v81), "w"(v153)); - svfloat32_t v186; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v186) : "w"(v65), "w"(v185)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v65), "w"(v185)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v137), "w"(v113)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v137), "w"(v113)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v89), "w"(v161)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v89), "w"(v161)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v275), "w"(v277)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v275), "w"(v277)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v277), "w"(v279)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v279), "w"(v275)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v276), "w"(v278)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v276), "w"(v278)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v278), "w"(v280)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v280), "w"(v276)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v364), "w"(v366)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v364), "w"(v366)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v366), "w"(v368)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v368), "w"(v364)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v365), "w"(v367)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v365), "w"(v367)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v367), "w"(v369)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v369), "w"(v365)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v186), "w"(v188)); - svfloat32_t 
v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v186), "w"(v188)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v188), "w"(v190)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v190), "w"(v186)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v187), "w"(v189)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v187), "w"(v189)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v189), "w"(v191)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v191), "w"(v187)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v281), "w"(v279)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v280)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v631, v640); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v631, v640); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v659, v668); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v659, v668); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v686, v695); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v686, v695); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v713, v722); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v713, v722); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v740, v749); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v740, v749); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v767, v776); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v767, v776); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v794, v803); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v794, v803); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v650); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v677); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v704); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v731); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v758); + 
svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v785); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v176, v812); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v56, v176); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v56, v176); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v128, v104); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v128, v104); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v80, v152); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v80, v152); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v57, v177); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v57, v177); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v129, v105); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v129, v105); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v81, v153); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v81, v153); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v65, v185); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v65, v185); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v137, v113); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v137, v113); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v279, v275); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v278, v280); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v280, v276); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v368, v364); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v365, v367); 
+ svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v190, v186); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v189, v191); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v191, v187); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v281, v279); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v280); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v828, v289, 90); - svfloat32_t zero337; - asm volatile("mov %0.s, #0" : "=w"(zero337)); + svfloat32_t zero337 = svdup_n_f32(0); svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v829, v290, 90); - svfloat32_t zero344; - asm volatile("mov %0.s, #0" : "=w"(zero344)); + svfloat32_t zero344 = svdup_n_f32(0); svfloat32_t v344 = svcmla_f32_x(pred_full, zero344, v830, v291, 90); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v368)); - svfloat32_t v377; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v376), "w"(v369)); - svfloat32_t zero401; - asm volatile("mov %0.s, #0" : "=w"(zero401)); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v368); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v376, v369); + svfloat32_t zero401 = svdup_n_f32(0); svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v833, v373, 90); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); + svfloat32_t zero408 = svdup_n_f32(0); svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v834, v374, 90); - svfloat32_t zero415; - asm volatile("mov %0.s, #0" : "=w"(zero415)); + 
svfloat32_t zero415 = svdup_n_f32(0); svfloat32_t v415 = svcmla_f32_x(pred_full, zero415, v835, v375, 90); - svfloat32_t v425; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v378), "w"(v837)); - svfloat32_t v430; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v379), "w"(v838)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v190)); - svfloat32_t v199; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v198), "w"(v191)); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t v425 = svmul_f32_x(svptrue_b32(), v378, v837); + svfloat32_t v430 = svmul_f32_x(svptrue_b32(), v379, v838); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v190); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v198, v191); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v819, v200, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, zero248, v820, v201, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v821, v202, 90); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v32)); - svfloat32_t v301; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v282), "w"(v823)); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v32); + svfloat32_t v301 = svmul_f32_x(svptrue_b32(), v282, v823); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v827, v288, 90); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v371), "w"(v33)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v193), "w"(v41)); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t v372 = 
svadd_f32_x(svptrue_b32(), v371, v33); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v193, v41); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, zero234, v818, v199, 90); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v323), "w"(v330)); - svfloat32_t v354; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v323), "w"(v330)); - svfloat32_t v356; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v323), "w"(v337)); - svfloat32_t zero387; - asm volatile("mov %0.s, #0" : "=w"(zero387)); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v323, v330); + svfloat32_t v354 = svsub_f32_x(svptrue_b32(), v323, v330); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v323, v337); + svfloat32_t zero387 = svdup_n_f32(0); svfloat32_t v387 = svcmla_f32_x(pred_full, zero387, v831, v372, 90); svfloat32_t v443 = svmla_f32_x(pred_full, v425, v377, v836); svfloat32_t v445 = svnmls_f32_x(pred_full, v425, v377, v836); svfloat32_t v447 = svnmls_f32_x(pred_full, v430, v377, v836); svfloat32_t v256 = svmla_f32_x(pred_full, v194, v193, v814); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v234), "w"(v241)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v234), "w"(v241)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v234), "w"(v248)); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v234, v248); svfloat32_t v345 = svmla_f32_x(pred_full, v301, v283, v822); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v352), "w"(v337)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v354), "w"(v344)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v356), "w"(v344)); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v352, v337); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v354, v344); + svfloat32_t v357 = 
svadd_f32_x(svptrue_b32(), v356, v344); svfloat32_t v436 = svcmla_f32_x(pred_full, v387, v832, v371, 90); svfloat32_t v444 = svmla_f32_x(pred_full, v443, v379, v838); svfloat32_t v446 = svmls_f32_x(pred_full, v445, v380, v839); @@ -8746,117 +4724,70 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v257 = svmla_f32_x(pred_full, v256, v195, v815); svfloat32_t v259 = svmls_f32_x(pred_full, v256, v195, v815); svfloat32_t v261 = svmls_f32_x(pred_full, v256, v196, v816); - svfloat32_t v264; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v263), "w"(v248)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v265), "w"(v255)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v267), "w"(v255)); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v263, v248); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v265, v255); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v267, v255); svfloat32_t v346 = svmla_f32_x(pred_full, v345, v284, v824); svfloat32_t v348 = svmls_f32_x(pred_full, v345, v284, v824); svfloat32_t v350 = svmls_f32_x(pred_full, v345, v285, v825); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v436), "w"(v401)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v436), "w"(v401)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v436), "w"(v408)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v455), "w"(v387)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v455), "w"(v387)); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v436, v401); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v436, v401); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v436, v408); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v455, v387); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v455, v387); svfloat32_t v258 = svmla_f32_x(pred_full, v257, v196, v816); svfloat32_t v260 = svmls_f32_x(pred_full, v259, 
v197, v817); svfloat32_t v262 = svmla_f32_x(pred_full, v261, v197, v817); svfloat32_t v347 = svmla_f32_x(pred_full, v346, v285, v825); svfloat32_t v349 = svmls_f32_x(pred_full, v348, v286, v826); svfloat32_t v351 = svmla_f32_x(pred_full, v350, v286, v826); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v437), "w"(v408)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v439), "w"(v415)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v441), "w"(v415)); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v437, v408); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v439, v415); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v441, v415); svst1_f64(pred_full, (double *)(v856), svreinterpret_f64_f32(v457)); svst1_f64(pred_full, (double *)(v865), svreinterpret_f64_f32(v456)); - svfloat32_t v269; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v258), "w"(v264)); - svfloat32_t v270; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v258), "w"(v264)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v260), "w"(v266)); - svfloat32_t v272; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v260), "w"(v266)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v262), "w"(v268)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v262), "w"(v268)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v347), "w"(v353)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v347), "w"(v353)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v349), "w"(v355)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v349), "w"(v355)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v351), "w"(v357)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v351), "w"(v357)); - svfloat32_t v449; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v438), "w"(v444)); - svfloat32_t 
v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v438), "w"(v444)); - svfloat32_t v451; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v440), "w"(v446)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v440), "w"(v446)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v442), "w"(v448)); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v442), "w"(v448)); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v270), "w"(v359)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v272), "w"(v361)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v273), "w"(v362)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v274), "w"(v363)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v271), "w"(v360)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v269), "w"(v358)); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v260, v266); + svfloat32_t v272 = svsub_f32_x(svptrue_b32(), v260, v266); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v262, v268); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v262, v268); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v351, v357); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v351, v357); + svfloat32_t v449 = svadd_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v440, v446); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v440, v446); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v442, v448); + 
svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v442, v448); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v270, v359); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v272, v361); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v273, v362); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v274, v363); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v271, v360); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v269, v358); svst1_f64(pred_full, (double *)(v874), svreinterpret_f64_f32(v270)); svst1_f64(pred_full, (double *)(v901), svreinterpret_f64_f32(v272)); svst1_f64(pred_full, (double *)(v928), svreinterpret_f64_f32(v273)); svst1_f64(pred_full, (double *)(v955), svreinterpret_f64_f32(v274)); svst1_f64(pred_full, (double *)(v982), svreinterpret_f64_f32(v271)); svst1_f64(pred_full, (double *)(v1009), svreinterpret_f64_f32(v269)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v479), "w"(v450)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v479), "w"(v450)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v452)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v503), "w"(v452)); - svfloat32_t v528; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v527), "w"(v453)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v527), "w"(v453)); - svfloat32_t v552; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v551), "w"(v454)); - svfloat32_t v553; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v551), "w"(v454)); - svfloat32_t v576; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v575), "w"(v451)); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v575), "w"(v451)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v599), "w"(v449)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v599), "w"(v449)); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v479, v450); + svfloat32_t v481 = 
svsub_f32_x(svptrue_b32(), v479, v450); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v503, v452); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v503, v452); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v527, v453); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v527, v453); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v551, v454); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v551, v454); + svfloat32_t v576 = svadd_f32_x(svptrue_b32(), v575, v451); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v575, v451); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v599, v449); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v599, v449); svst1_f64(pred_full, (double *)(v883), svreinterpret_f64_f32(v481)); svst1_f64(pred_full, (double *)(v892), svreinterpret_f64_f32(v480)); svst1_f64(pred_full, (double *)(v910), svreinterpret_f64_f32(v505)); @@ -9441,240 +5372,128 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu22(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v953), v963)); svfloat32_t v964 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v962), v963)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v775), "w"(v784)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v775), "w"(v784)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v793), "w"(v802)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v793), "w"(v802)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v811), "w"(v820)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v811), "w"(v820)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v829), "w"(v838)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v829), "w"(v838)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v847), "w"(v856)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v847), 
"w"(v856)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v865), "w"(v874)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v865), "w"(v874)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v883), "w"(v892)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v883), "w"(v892)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v901), "w"(v910)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v901), "w"(v910)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v919), "w"(v928)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v919), "w"(v928)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v937), "w"(v946)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v937), "w"(v946)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v955), "w"(v964)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v955), "w"(v964)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v48), "w"(v192)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v64), "w"(v176)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v80), "w"(v160)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v96), "w"(v144)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v112), "w"(v128)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v48), "w"(v192)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v64), "w"(v176)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v80), "w"(v160)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v96), "w"(v144)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v112), "w"(v128)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v403) : "w"(v49), "w"(v193)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v65), "w"(v177)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v81), "w"(v161)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v97), "w"(v145)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v113), "w"(v129)); - svfloat32_t v408; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v49), "w"(v193)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v65), "w"(v177)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v81), "w"(v161)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v97), "w"(v145)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v113), "w"(v129)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v194), "w"(v195)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v196), "w"(v198)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v200), "w"(v201)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v199), "w"(v203)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v195), "w"(v197)); - svfloat32_t v214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v194), "w"(v197)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v195), "w"(v194)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v198), "w"(v197)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v196), "w"(v197)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v198), "w"(v196)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v195), "w"(v198)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v194), "w"(v196)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v200), "w"(v202)); - svfloat32_t v223; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v223) : "w"(v199), "w"(v202)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v199), "w"(v200)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v202), "w"(v203)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v201), "w"(v202)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v201), "w"(v203)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v200), "w"(v203)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v199), "w"(v201)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v403), "w"(v404)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v405), "w"(v407)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v409), "w"(v410)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v408), "w"(v412)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v404), "w"(v406)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v403), "w"(v406)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v404), "w"(v403)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v407), "w"(v406)); - svfloat32_t v426; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v405), "w"(v406)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v407), "w"(v405)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v404), "w"(v407)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v403), "w"(v405)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v409), "w"(v411)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v408), "w"(v411)); - svfloat32_t v433; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v408), "w"(v409)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v411), "w"(v412)); - 
svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v410), "w"(v411)); - svfloat32_t v436; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v410), "w"(v412)); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v409), "w"(v412)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v408), "w"(v410)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v197), "w"(v204)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v207), "w"(v208)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v205), "w"(v204)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v207), "w"(v208)); - svfloat32_t v257; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v214), "w"(v990)); - svfloat32_t v262; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v215), "w"(v991)); - svfloat32_t v272; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v217), "w"(v993)); - svfloat32_t v277; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v218), "w"(v994)); - svfloat32_t zero299; - asm volatile("mov %0.s, #0" : "=w"(zero299)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v775, v784); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v775, v784); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v793, v802); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v793, v802); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v811, v820); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v811, v820); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v829, v838); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v829, v838); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v847, v856); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v847, v856); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v865, v874); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v865, v874); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v883, v892); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v883, v892); + svfloat32_t v144 = 
svadd_f32_x(svptrue_b32(), v901, v910); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v901, v910); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v919, v928); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v919, v928); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v937, v946); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v937, v946); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v955, v964); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v955, v964); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v48, v192); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v64, v176); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v80, v160); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v112, v128); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v48, v192); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v64, v176); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v80, v160); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v112, v128); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v49, v193); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v65, v177); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v81, v161); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v49, v193); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v65, v177); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v81, v161); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v194, v195); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v200, v201); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v199, v203); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v214 = 
svsub_f32_x(svptrue_b32(), v194, v197); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v195, v194); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v198, v197); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v196, v197); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v198, v196); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v195, v198); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v194, v196); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v200, v202); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v199, v202); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v202, v203); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v201, v202); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v201, v203); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v200, v203); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v403, v404); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v409, v410); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v408, v412); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v403, v406); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v404, v403); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v407, v406); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v405, v406); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v407, v405); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v404, v407); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v403, v405); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v408, v411); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v408, v409); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v411, v412); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v410, v411); + svfloat32_t v436 = svsub_f32_x(svptrue_b32(), v410, v412); + 
svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v409, v412); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v408, v410); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v197, v204); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v207, v208); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v205, v204); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v207, v208); + svfloat32_t v257 = svmul_f32_x(svptrue_b32(), v214, v990); + svfloat32_t v262 = svmul_f32_x(svptrue_b32(), v215, v991); + svfloat32_t v272 = svmul_f32_x(svptrue_b32(), v217, v993); + svfloat32_t v277 = svmul_f32_x(svptrue_b32(), v218, v994); + svfloat32_t zero299 = svdup_n_f32(0); svfloat32_t v299 = svcmla_f32_x(pred_full, zero299, v998, v222, 90); - svfloat32_t zero313; - asm volatile("mov %0.s, #0" : "=w"(zero313)); + svfloat32_t zero313 = svdup_n_f32(0); svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v1000, v224, 90); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v1001, v225, 90); - svfloat32_t zero334; - asm volatile("mov %0.s, #0" : "=w"(zero334)); + svfloat32_t zero334 = svdup_n_f32(0); svfloat32_t v334 = svcmla_f32_x(pred_full, zero334, v1003, v227, 90); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v1004, v228, 90); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v406), "w"(v413)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v416), "w"(v417)); - svfloat32_t v430; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v414), "w"(v413)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v416), "w"(v417)); - svfloat32_t v466; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v423), "w"(v990)); - svfloat32_t v471; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v424), "w"(v991)); - svfloat32_t v481; - 
asm("fmul %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v426), "w"(v993)); - svfloat32_t v486; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v427), "w"(v994)); - svfloat32_t zero508; - asm volatile("mov %0.s, #0" : "=w"(zero508)); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v406, v413); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v416, v417); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v414, v413); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v416, v417); + svfloat32_t v466 = svmul_f32_x(svptrue_b32(), v423, v990); + svfloat32_t v471 = svmul_f32_x(svptrue_b32(), v424, v991); + svfloat32_t v481 = svmul_f32_x(svptrue_b32(), v426, v993); + svfloat32_t v486 = svmul_f32_x(svptrue_b32(), v427, v994); + svfloat32_t zero508 = svdup_n_f32(0); svfloat32_t v508 = svcmla_f32_x(pred_full, zero508, v998, v431, 90); - svfloat32_t zero522; - asm volatile("mov %0.s, #0" : "=w"(zero522)); + svfloat32_t zero522 = svdup_n_f32(0); svfloat32_t v522 = svcmla_f32_x(pred_full, zero522, v1000, v433, 90); - svfloat32_t zero529; - asm volatile("mov %0.s, #0" : "=w"(zero529)); + svfloat32_t zero529 = svdup_n_f32(0); svfloat32_t v529 = svcmla_f32_x(pred_full, zero529, v1001, v434, 90); - svfloat32_t zero543; - asm volatile("mov %0.s, #0" : "=w"(zero543)); + svfloat32_t zero543 = svdup_n_f32(0); svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1003, v436, 90); - svfloat32_t zero550; - asm volatile("mov %0.s, #0" : "=w"(zero550)); + svfloat32_t zero550 = svdup_n_f32(0); svfloat32_t v550 = svcmla_f32_x(pred_full, zero550, v1004, v437, 90); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v206), "w"(v205)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v211), "w"(v202)); - svfloat32_t v292; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v221), "w"(v997)); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v206, v205); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v211, 
v202); + svfloat32_t v292 = svmul_f32_x(svptrue_b32(), v221, v997); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v1006, v230, 90); svfloat32_t v357 = svmla_f32_x(pred_full, v257, v213, v989); svfloat32_t v358 = svmla_f32_x(pred_full, v262, v214, v990); @@ -9683,19 +5502,13 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v361 = svmla_f32_x(pred_full, v277, v217, v993); svfloat32_t v362 = svnmls_f32_x(pred_full, v277, v216, v992); svfloat32_t v365 = svcmla_f32_x(pred_full, v313, v999, v223, 90); - svfloat32_t v366; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v299), "w"(v313)); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v299, v313); svfloat32_t v367 = svcmla_f32_x(pred_full, v334, v1002, v226, 90); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v320), "w"(v334)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v415), "w"(v414)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v411)); - svfloat32_t v501; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v430), "w"(v997)); - svfloat32_t zero564; - asm volatile("mov %0.s, #0" : "=w"(zero564)); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v320, v334); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v415, v414); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v420, v411); + svfloat32_t v501 = svmul_f32_x(svptrue_b32(), v430, v997); + svfloat32_t zero564 = svdup_n_f32(0); svfloat32_t v564 = svcmla_f32_x(pred_full, zero564, v1006, v439, 90); svfloat32_t v566 = svmla_f32_x(pred_full, v466, v422, v989); svfloat32_t v567 = svmla_f32_x(pred_full, v471, v423, v990); @@ -9704,163 +5517,91 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v570 = svmla_f32_x(pred_full, v486, v426, v993); svfloat32_t v571 = svnmls_f32_x(pred_full, v486, v425, v992); svfloat32_t v574 = svcmla_f32_x(pred_full, v522, v999, v432, 90); 
- svfloat32_t v575; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v508), "w"(v522)); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v508, v522); svfloat32_t v576 = svcmla_f32_x(pred_full, v543, v1002, v435, 90); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v529), "w"(v543)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v32), "w"(v209)); - svfloat32_t zero247; - asm volatile("mov %0.s, #0" : "=w"(zero247)); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v529, v543); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v32, v209); + svfloat32_t zero247 = svdup_n_f32(0); svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v988, v212, 90); svfloat32_t v363 = svmla_f32_x(pred_full, v292, v220, v996); svfloat32_t v364 = svmla_f32_x(pred_full, v292, v219, v995); svfloat32_t v369 = svcmla_f32_x(pred_full, v355, v1005, v229, 90); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v341), "w"(v355)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v365), "w"(v366)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v33), "w"(v418)); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v365, v366); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v33, v418); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v988, v421, 90); svfloat32_t v572 = svmla_f32_x(pred_full, v501, v429, v996); svfloat32_t v573 = svmla_f32_x(pred_full, v501, v428, v995); svfloat32_t v578 = svcmla_f32_x(pred_full, v564, v1005, v438, 90); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v550), "w"(v564)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v574), "w"(v575)); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v550, v564); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v574, v575); 
svfloat32_t v356 = svmls_f32_x(pred_full, v210, v209, v987); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v361), "w"(v363)); - svfloat32_t v381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v247), "w"(v367)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v369), "w"(v365)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v247), "w"(v370)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v370), "w"(v366)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v367)); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v247, v367); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v247, v370); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v370, v366); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v367); svfloat32_t v565 = svmls_f32_x(pred_full, v419, v418, v987); - svfloat32_t v580; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v570), "w"(v572)); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v456), "w"(v576)); - svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v578), "w"(v574)); - svfloat32_t v594; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v456), "w"(v579)); - svfloat32_t v596; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v579), "w"(v575)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v598), "w"(v576)); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v570, v572); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v456, v576); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v578, v574); + svfloat32_t v594 = svadd_f32_x(svptrue_b32(), v456, v579); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v579, v575); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v576); svst1_f64(pred_full, (double *)(v1014), svreinterpret_f64_f32(v210)); 
svst1_f64(pred_full, (double *)(v1023), svreinterpret_f64_f32(v419)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v371), "w"(v356)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v356), "w"(v358)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v356), "w"(v362)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v356), "w"(v359)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v356), "w"(v357)); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v381), "w"(v369)); - svfloat32_t v384; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v383), "w"(v247)); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v385), "w"(v368)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v387), "w"(v247)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v390), "w"(v368)); - svfloat32_t v581; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v581) : "w"(v580), "w"(v565)); - svfloat32_t v582; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v582) : "w"(v565), "w"(v567)); - svfloat32_t v584; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v584) : "w"(v565), "w"(v571)); - svfloat32_t v586; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v565), "w"(v568)); - svfloat32_t v588; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v565), "w"(v566)); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v590), "w"(v578)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v592), "w"(v456)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v594), "w"(v577)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v596), "w"(v456)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v599), "w"(v577)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v373), "w"(v363)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v375), 
"w"(v364)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v377), "w"(v364)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v379), "w"(v360)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v247)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v372), "w"(v382)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v372), "w"(v382)); - svfloat32_t v583; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v582), "w"(v572)); - svfloat32_t v585; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v584), "w"(v573)); - svfloat32_t v587; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v586), "w"(v573)); - svfloat32_t v589; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v589) : "w"(v588), "w"(v569)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v600), "w"(v456)); - svfloat32_t v603; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v581), "w"(v591)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v581), "w"(v591)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v380), "w"(v392)); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v374), "w"(v384)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v376), "w"(v386)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v378), "w"(v388)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v378), "w"(v388)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v376), "w"(v386)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v374), "w"(v384)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v380), "w"(v392)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v589), "w"(v601)); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v583), "w"(v593)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v605) : "w"(v585), "w"(v595)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v587), "w"(v597)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v587), "w"(v597)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v585), "w"(v595)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v583), "w"(v593)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v589), "w"(v601)); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v371, v356); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v356, v362); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v356, v359); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v356, v357); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v381, v369); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v383, v247); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v385, v368); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v387, v247); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v390, v368); + svfloat32_t v581 = svadd_f32_x(svptrue_b32(), v580, v565); + svfloat32_t v582 = svsub_f32_x(svptrue_b32(), v565, v567); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v565, v571); + svfloat32_t v586 = svsub_f32_x(svptrue_b32(), v565, v568); + svfloat32_t v588 = svadd_f32_x(svptrue_b32(), v565, v566); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v578); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v456); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v577); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v596, v456); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v599, v577); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v373, v363); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v375, v364); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v377, v364); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v379, v360); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, 
v247); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v372, v382); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v372, v382); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v582, v572); + svfloat32_t v585 = svadd_f32_x(svptrue_b32(), v584, v573); + svfloat32_t v587 = svsub_f32_x(svptrue_b32(), v586, v573); + svfloat32_t v589 = svsub_f32_x(svptrue_b32(), v588, v569); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v600, v456); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v380, v392); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v376, v386); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v378, v388); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v378, v388); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v376, v386); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v380, v392); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v589, v601); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v585, v595); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v587, v597); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v587, v597); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v585, v595); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v589, v601); svst1_f64(pred_full, (double *)(v1050), svreinterpret_f64_f32(v401)); svst1_f64(pred_full, (double *)(v1059), svreinterpret_f64_f32(v610)); svst1_f64(pred_full, (double *)(v1176), svreinterpret_f64_f32(v394)); @@ -10321,271 +6062,152 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu24(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v827), v837)); svfloat32_t v838 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, 
(const double *)(v836), v837)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v630), "w"(v639)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v630), "w"(v639)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v658), "w"(v667)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v658), "w"(v667)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v685), "w"(v694)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v685), "w"(v694)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v712), "w"(v721)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v712), "w"(v721)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v739), "w"(v748)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v739), "w"(v748)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v766), "w"(v775)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v766), "w"(v775)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v793), "w"(v802)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v793), "w"(v802)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v820), "w"(v829)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v820), "w"(v829)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v649)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v676)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v703)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v730)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v757)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v784)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v185) : "w"(v176), "w"(v811)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v200), "w"(v838)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v32), "w"(v128)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v32), "w"(v128)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v80), "w"(v176)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v80), "w"(v176)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v56), "w"(v152)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v56), "w"(v152)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v104), "w"(v200)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v104), "w"(v200)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v33), "w"(v129)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v33), "w"(v129)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v81), "w"(v177)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v81), "w"(v177)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v57), "w"(v153)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v57), "w"(v153)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v105), "w"(v201)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v105), "w"(v201)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v41), "w"(v137)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v41), "w"(v137)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v89), "w"(v185)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v89), "w"(v185)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v65), "w"(v161)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v215) : "w"(v65), "w"(v161)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v113), "w"(v209)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v113), "w"(v209)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v282), "w"(v284)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v282), "w"(v284)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v286), "w"(v288)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v286), "w"(v288)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v287), "w"(v289)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v287), "w"(v289)); - svfloat32_t zero331; - asm volatile("mov %0.s, #0" : "=w"(zero331)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v630, v639); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v630, v639); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v658, v667); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v658, v667); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v685, v694); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v685, v694); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v712, v721); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v712, v721); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v739, v748); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v739, v748); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v766, v775); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v766, v775); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v793, v802); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v793, v802); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v820, v829); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v820, v829); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v649); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v676); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v703); + svfloat32_t v113 = 
svadd_f32_x(svptrue_b32(), v104, v730); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v757); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v784); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v176, v811); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v200, v838); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v80, v176); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v80, v176); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v56, v152); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v56, v152); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v104, v200); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v104, v200); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v81, v177); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v81, v177); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v57, v153); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v57, v153); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v105, v201); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v105, v201); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v41, v137); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v41, v137); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v89, v185); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v89, v185); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v113, v209); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v113, v209); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v296 = 
svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t zero331 = svdup_n_f32(0); svfloat32_t v331 = svcmla_f32_x(pred_full, zero331, v852, v285, 90); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v354), "w"(v356)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v354), "w"(v356)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v358), "w"(v360)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v358), "w"(v360)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v359), "w"(v361)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v359), "w"(v361)); - svfloat32_t zero402; - asm volatile("mov %0.s, #0" : "=w"(zero402)); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t zero402 = svdup_n_f32(0); svfloat32_t v402 = svcmla_f32_x(pred_full, zero402, v859, v355, 90); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v210), "w"(v212)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v210), "w"(v212)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v214), "w"(v216)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v214), "w"(v216)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v215), "w"(v217)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v215), "w"(v217)); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v210, 
v212); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v215, v217); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v215, v217); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x(pred_full, zero259, v844, v213, 90); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v290), "w"(v292)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v290), "w"(v292)); - svfloat32_t zero319; - asm volatile("mov %0.s, #0" : "=w"(zero319)); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v290, v292); + svfloat32_t zero319 = svdup_n_f32(0); svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v852, v293, 90); - svfloat32_t zero338; - asm volatile("mov %0.s, #0" : "=w"(zero338)); + svfloat32_t zero338 = svdup_n_f32(0); svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v853, v296, 90); - svfloat32_t v343; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v297), "w"(v854)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v362), "w"(v364)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v362), "w"(v364)); - svfloat32_t zero390; - asm volatile("mov %0.s, #0" : "=w"(zero390)); + svfloat32_t v343 = svmul_f32_x(svptrue_b32(), v297, v854); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v362, v364); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v362, v364); + svfloat32_t zero390 = svdup_n_f32(0); svfloat32_t v390 = svcmla_f32_x(pred_full, zero390, v859, v363, 90); - svfloat32_t v412; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v368), "w"(v861)); - svfloat32_t zero419; - asm volatile("mov %0.s, #0" : "=w"(zero419)); + svfloat32_t v412 = svmul_f32_x(svptrue_b32(), v368, v861); + svfloat32_t zero419 = svdup_n_f32(0); svfloat32_t v419 = svcmla_f32_x(pred_full, zero419, v862, v369, 90); - svfloat32_t v222; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v218), "w"(v220)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v218), "w"(v220)); - svfloat32_t zero247; - asm volatile("mov %0.s, #0" : "=w"(zero247)); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v218, v220); + svfloat32_t zero247 = svdup_n_f32(0); svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v844, v221, 90); - svfloat32_t zero266; - asm volatile("mov %0.s, #0" : "=w"(zero266)); + svfloat32_t zero266 = svdup_n_f32(0); svfloat32_t v266 = svcmla_f32_x(pred_full, zero266, v845, v224, 90); svfloat32_t v344 = svmla_f32_x(pred_full, v319, v291, v851); svfloat32_t v345 = svnmls_f32_x(pred_full, v319, v291, v851); svfloat32_t v346 = svmla_f32_x(pred_full, v343, v283, v851); svfloat32_t v347 = svnmls_f32_x(pred_full, v343, v283, v851); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v331), "w"(v338)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v331), "w"(v338)); - svfloat32_t zero376; - asm volatile("mov %0.s, #0" : "=w"(zero376)); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v331, v338); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v331, v338); + svfloat32_t zero376 = svdup_n_f32(0); svfloat32_t v376 = svcmla_f32_x(pred_full, zero376, v859, v366, 90); - svfloat32_t zero383; - asm volatile("mov %0.s, #0" : "=w"(zero383)); + svfloat32_t zero383 = svdup_n_f32(0); svfloat32_t v383 = svcmla_f32_x(pred_full, zero383, v859, v367, 90); svfloat32_t v420 = svmla_f32_x(pred_full, v390, v365, v860); svfloat32_t v421 = svmls_f32_x(pred_full, v390, v365, v860); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v402), "w"(v419)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v402), "w"(v419)); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v402, v419); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v402, v419); svfloat32_t v424 = 
svmla_f32_x(pred_full, v412, v357, v860); svfloat32_t v425 = svnmls_f32_x(pred_full, v412, v357, v860); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v219), "w"(v247)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v219), "w"(v247)); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v219, v247); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v219, v247); svfloat32_t v274 = svmla_f32_x(pred_full, v211, v225, v846); svfloat32_t v275 = svmls_f32_x(pred_full, v211, v225, v846); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v259), "w"(v266)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v259), "w"(v266)); - svfloat32_t v350; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v346), "w"(v348)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v346), "w"(v348)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v347), "w"(v349)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v347), "w"(v349)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v422), "w"(v424)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v422), "w"(v424)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v423), "w"(v425)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v423), "w"(v425)); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v259, v266); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v259, v266); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v422, v424); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v422, v424); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v429 = 
svsub_f32_x(svptrue_b32(), v423, v425); svfloat32_t v430 = svmla_f32_x(pred_full, v222, v294, v851); svfloat32_t v526 = svmla_f32_x(pred_full, v223, v295, v851); svst1_f64(pred_full, (double *)(v870), svreinterpret_f64_f32(v222)); svst1_f64(pred_full, (double *)(v978), svreinterpret_f64_f32(v223)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v274), "w"(v276)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v274), "w"(v276)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v275), "w"(v277)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v275), "w"(v277)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v376)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v430), "w"(v376)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v273), "w"(v345)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v526), "w"(v383)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v526), "w"(v383)); - svfloat32_t v574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v272), "w"(v344)); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v376); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v430, v376); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v273, v345); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v526, v383); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v526, v383); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v272, v344); svst1_f64(pred_full, (double *)(v924), svreinterpret_f64_f32(v273)); svst1_f64(pred_full, (double *)(v1032), svreinterpret_f64_f32(v272)); - svfloat32_t v454; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v454) : 
"w"(v279), "w"(v351)); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v421)); - svfloat32_t v480; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v478), "w"(v421)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v280), "w"(v352)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v281), "w"(v353)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v574), "w"(v420)); - svfloat32_t v576; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v574), "w"(v420)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v278), "w"(v350)); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v279, v351); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v421); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v478, v421); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v280, v352); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v281, v353); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v574, v420); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v574, v420); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v278, v350); svst1_f64(pred_full, (double *)(v879), svreinterpret_f64_f32(v432)); svst1_f64(pred_full, (double *)(v888), svreinterpret_f64_f32(v431)); svst1_f64(pred_full, (double *)(v897), svreinterpret_f64_f32(v279)); @@ -10594,22 +6216,14 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu24(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v996), svreinterpret_f64_f32(v527)); svst1_f64(pred_full, (double *)(v1005), svreinterpret_f64_f32(v281)); svst1_f64(pred_full, (double *)(v1059), svreinterpret_f64_f32(v278)); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v454), "w"(v427)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v454), "w"(v427)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v428)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v502), "w"(v428)); 
- svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v550), "w"(v429)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v550), "w"(v429)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v598), "w"(v426)); - svfloat32_t v600; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v598), "w"(v426)); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v454, v427); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v454, v427); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v428); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v502, v428); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v550, v429); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v550, v429); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v426); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v598, v426); svst1_f64(pred_full, (double *)(v933), svreinterpret_f64_f32(v480)); svst1_f64(pred_full, (double *)(v942), svreinterpret_f64_f32(v479)); svst1_f64(pred_full, (double *)(v1041), svreinterpret_f64_f32(v576)); @@ -10638,7 +6252,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t *v6 = (float32x2_t *)y; for (int j = 0; j < howmany; j += 1) { float32x2_t v159 = v5[istride]; - float v758 = 0.0000000000000000e+00F; float v851 = 9.6858316112863108e-01F; float v854 = -2.4868988716485479e-01F; float v855 = 2.4868988716485479e-01F; @@ -10671,7 +6284,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1376 = (float32x2_t){v4, v4}; float v1397 = 2.0000000000000000e+00F; float32x2_t v20 = v5[0]; - float v761 = dir * v758; float32x2_t v852 = (float32x2_t){v851, v851}; float32x2_t v856 = (float32x2_t){v854, v855}; float32x2_t v991 = (float32x2_t){v990, v990}; @@ -10717,7 +6329,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v586 = v5[istride * 14]; float32x2_t v591 = v5[istride * 19]; float32x2_t v596 = v5[istride * 24]; 
- float32x2_t v759 = (float32x2_t){v758, v761}; float32x2_t v858 = vmul_f32(v1376, v856); float32x2_t v997 = vmul_f32(v1376, v995); float32x2_t v1136 = vmul_f32(v1376, v1134); @@ -10728,86 +6339,26 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1299 = vmul_f32(v1376, v1297); float32x2_t v1317 = vmul_f32(v1376, v1315); float32x2_t v1377 = vmul_f32(v1376, v1375); - float32x2_t v50 = vrev64_f32(v25); - float32x2_t v62 = vrev64_f32(v30); - float32x2_t v74 = vrev64_f32(v40); - float32x2_t v92 = vrev64_f32(v35); - float32x2_t v189 = vrev64_f32(v164); - float32x2_t v201 = vrev64_f32(v169); - float32x2_t v213 = vrev64_f32(v179); - float32x2_t v231 = vrev64_f32(v174); - float32x2_t v328 = vrev64_f32(v303); - float32x2_t v340 = vrev64_f32(v308); - float32x2_t v352 = vrev64_f32(v318); - float32x2_t v370 = vrev64_f32(v313); - float32x2_t v467 = vrev64_f32(v442); - float32x2_t v479 = vrev64_f32(v447); - float32x2_t v491 = vrev64_f32(v457); - float32x2_t v509 = vrev64_f32(v452); - float32x2_t v606 = vrev64_f32(v581); - float32x2_t v618 = vrev64_f32(v586); - float32x2_t v630 = vrev64_f32(v596); - float32x2_t v648 = vrev64_f32(v591); - float32x2_t v51 = vmul_f32(v50, v759); - float32x2_t v63 = vmul_f32(v62, v759); - float32x2_t v75 = vmul_f32(v74, v759); - float32x2_t v93 = vmul_f32(v92, v759); - float32x2_t v190 = vmul_f32(v189, v759); - float32x2_t v202 = vmul_f32(v201, v759); - float32x2_t v214 = vmul_f32(v213, v759); - float32x2_t v232 = vmul_f32(v231, v759); - float32x2_t v329 = vmul_f32(v328, v759); - float32x2_t v341 = vmul_f32(v340, v759); - float32x2_t v353 = vmul_f32(v352, v759); - float32x2_t v371 = vmul_f32(v370, v759); - float32x2_t v468 = vmul_f32(v467, v759); - float32x2_t v480 = vmul_f32(v479, v759); - float32x2_t v492 = vmul_f32(v491, v759); - float32x2_t v510 = vmul_f32(v509, v759); - float32x2_t v607 = vmul_f32(v606, v759); - float32x2_t v619 = vmul_f32(v618, v759); - float32x2_t v631 = vmul_f32(v630, v759); - 
float32x2_t v649 = vmul_f32(v648, v759); - float32x2_t v52 = vadd_f32(v51, v25); - float32x2_t v64 = vadd_f32(v63, v30); - float32x2_t v76 = vadd_f32(v75, v40); - float32x2_t v94 = vadd_f32(v93, v35); - float32x2_t v191 = vadd_f32(v190, v164); - float32x2_t v203 = vadd_f32(v202, v169); - float32x2_t v215 = vadd_f32(v214, v179); - float32x2_t v233 = vadd_f32(v232, v174); - float32x2_t v330 = vadd_f32(v329, v303); - float32x2_t v342 = vadd_f32(v341, v308); - float32x2_t v354 = vadd_f32(v353, v318); - float32x2_t v372 = vadd_f32(v371, v313); - float32x2_t v469 = vadd_f32(v468, v442); - float32x2_t v481 = vadd_f32(v480, v447); - float32x2_t v493 = vadd_f32(v492, v457); - float32x2_t v511 = vadd_f32(v510, v452); - float32x2_t v608 = vadd_f32(v607, v581); - float32x2_t v620 = vadd_f32(v619, v586); - float32x2_t v632 = vadd_f32(v631, v596); - float32x2_t v650 = vadd_f32(v649, v591); - float32x2_t v77 = vsub_f32(v52, v76); - float32x2_t v81 = vmul_f32(v52, v1398); - float32x2_t v95 = vsub_f32(v64, v94); - float32x2_t v99 = vmul_f32(v64, v1398); - float32x2_t v216 = vsub_f32(v191, v215); - float32x2_t v220 = vmul_f32(v191, v1398); - float32x2_t v234 = vsub_f32(v203, v233); - float32x2_t v238 = vmul_f32(v203, v1398); - float32x2_t v355 = vsub_f32(v330, v354); - float32x2_t v359 = vmul_f32(v330, v1398); - float32x2_t v373 = vsub_f32(v342, v372); - float32x2_t v377 = vmul_f32(v342, v1398); - float32x2_t v494 = vsub_f32(v469, v493); - float32x2_t v498 = vmul_f32(v469, v1398); - float32x2_t v512 = vsub_f32(v481, v511); - float32x2_t v516 = vmul_f32(v481, v1398); - float32x2_t v633 = vsub_f32(v608, v632); - float32x2_t v637 = vmul_f32(v608, v1398); - float32x2_t v651 = vsub_f32(v620, v650); - float32x2_t v655 = vmul_f32(v620, v1398); + float32x2_t v77 = vsub_f32(v25, v40); + float32x2_t v81 = vmul_f32(v25, v1398); + float32x2_t v95 = vsub_f32(v30, v35); + float32x2_t v99 = vmul_f32(v30, v1398); + float32x2_t v216 = vsub_f32(v164, v179); + float32x2_t v220 = vmul_f32(v164, v1398); 
+ float32x2_t v234 = vsub_f32(v169, v174); + float32x2_t v238 = vmul_f32(v169, v1398); + float32x2_t v355 = vsub_f32(v303, v318); + float32x2_t v359 = vmul_f32(v303, v1398); + float32x2_t v373 = vsub_f32(v308, v313); + float32x2_t v377 = vmul_f32(v308, v1398); + float32x2_t v494 = vsub_f32(v442, v457); + float32x2_t v498 = vmul_f32(v442, v1398); + float32x2_t v512 = vsub_f32(v447, v452); + float32x2_t v516 = vmul_f32(v447, v1398); + float32x2_t v633 = vsub_f32(v581, v596); + float32x2_t v637 = vmul_f32(v581, v1398); + float32x2_t v651 = vsub_f32(v586, v591); + float32x2_t v655 = vmul_f32(v586, v1398); float32x2_t v82 = vsub_f32(v81, v77); float32x2_t v100 = vsub_f32(v99, v95); float32x2_t v111 = vmul_f32(v95, v1351); @@ -10888,10 +6439,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v663 = vsub_f32(v576, v662); float32x2_t v691 = vmul_f32(v690, v1377); float32x2_t v699 = vmul_f32(v698, v1377); - float32x2_t v720 = vrev64_f32(v267); - float32x2_t v732 = vrev64_f32(v406); - float32x2_t v744 = vrev64_f32(v684); - float32x2_t v762 = vrev64_f32(v545); + float32x2_t v747 = vsub_f32(v267, v684); + float32x2_t v751 = vmul_f32(v267, v1398); + float32x2_t v765 = vsub_f32(v406, v545); + float32x2_t v769 = vmul_f32(v406, v1398); float32x2_t v117 = vsub_f32(v107, v116); float32x2_t v121 = vmul_f32(v107, v1398); float32x2_t v256 = vsub_f32(v246, v255); @@ -10902,10 +6453,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v538 = vmul_f32(v524, v1398); float32x2_t v673 = vsub_f32(v663, v672); float32x2_t v677 = vmul_f32(v663, v1398); - float32x2_t v721 = vmul_f32(v720, v759); - float32x2_t v733 = vmul_f32(v732, v759); - float32x2_t v745 = vmul_f32(v744, v759); - float32x2_t v763 = vmul_f32(v762, v759); + float32x2_t v752 = vsub_f32(v751, v747); + float32x2_t v770 = vsub_f32(v769, v765); + float32x2_t v781 = vmul_f32(v765, v1351); + float32x2_t v796 = vmul_f32(v747, v1351); float32x2_t 
v122 = vsub_f32(v121, v117); float32x2_t v144 = vsub_f32(v117, v143); float32x2_t v148 = vmul_f32(v117, v1398); @@ -10921,10 +6472,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v678 = vsub_f32(v677, v673); float32x2_t v700 = vsub_f32(v673, v699); float32x2_t v704 = vmul_f32(v673, v1398); - float32x2_t v722 = vadd_f32(v721, v267); - float32x2_t v734 = vadd_f32(v733, v406); - float32x2_t v746 = vadd_f32(v745, v684); - float32x2_t v764 = vadd_f32(v763, v545); + float32x2_t v771 = vadd_f32(v752, v770); + float32x2_t v772 = vsub_f32(v752, v770); + float32x2_t v782 = vadd_f32(v747, v781); + float32x2_t v797 = vsub_f32(v796, v765); float32x2_t v136 = vsub_f32(v122, v135); float32x2_t v149 = vsub_f32(v148, v144); float32x2_t v153 = vmul_f32(v122, v1398); @@ -10940,10 +6491,11 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v692 = vsub_f32(v678, v691); float32x2_t v705 = vsub_f32(v704, v700); float32x2_t v709 = vmul_f32(v678, v1398); - float32x2_t v747 = vsub_f32(v722, v746); - float32x2_t v751 = vmul_f32(v722, v1398); - float32x2_t v765 = vsub_f32(v734, v764); - float32x2_t v769 = vmul_f32(v734, v1398); + float32x2_t v776 = vmul_f32(v771, v1331); + float32x2_t v786 = vmul_f32(v772, v1341); + float32x2_t v798 = vadd_f32(v128, v771); + float32x2_t v809 = vrev64_f32(v782); + float32x2_t v822 = vrev64_f32(v797); float32x2_t v998 = vrev64_f32(v283); float32x2_t v1010 = vrev64_f32(v422); float32x2_t v1022 = vrev64_f32(v700); @@ -10953,10 +6505,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v432 = vsub_f32(v431, v414); float32x2_t v571 = vsub_f32(v570, v553); float32x2_t v710 = vsub_f32(v709, v692); - float32x2_t v752 = vsub_f32(v751, v747); - float32x2_t v770 = vsub_f32(v769, v765); - float32x2_t v781 = vmul_f32(v765, v1351); - float32x2_t v796 = vmul_f32(v747, v1351); + float32x2_t v777 = vsub_f32(v128, v776); + v6[0] = v798; + 
float32x2_t v810 = vmul_f32(v809, v1377); + float32x2_t v823 = vmul_f32(v822, v1377); float32x2_t v859 = vrev64_f32(v275); float32x2_t v871 = vrev64_f32(v414); float32x2_t v883 = vrev64_f32(v692); @@ -10969,10 +6521,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1149 = vrev64_f32(v427); float32x2_t v1161 = vrev64_f32(v705); float32x2_t v1179 = vrev64_f32(v566); - float32x2_t v771 = vadd_f32(v752, v770); - float32x2_t v772 = vsub_f32(v752, v770); - float32x2_t v782 = vadd_f32(v747, v781); - float32x2_t v797 = vsub_f32(v796, v765); + float32x2_t v787 = vsub_f32(v777, v786); + float32x2_t v791 = vmul_f32(v777, v1398); float32x2_t v860 = vmul_f32(v859, v858); float32x2_t v872 = vmul_f32(v871, v997); float32x2_t v884 = vmul_f32(v883, v1275); @@ -10989,11 +6539,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1288 = vrev64_f32(v432); float32x2_t v1300 = vrev64_f32(v710); float32x2_t v1318 = vrev64_f32(v571); - float32x2_t v776 = vmul_f32(v771, v1331); - float32x2_t v786 = vmul_f32(v772, v1341); - float32x2_t v798 = vadd_f32(v128, v771); - float32x2_t v809 = vrev64_f32(v782); - float32x2_t v822 = vrev64_f32(v797); + float32x2_t v792 = vsub_f32(v791, v787); + float32x2_t v824 = vsub_f32(v787, v823); + float32x2_t v833 = vmul_f32(v787, v1398); float32x2_t v861 = vfma_f32(v860, v275, v852); float32x2_t v873 = vfma_f32(v872, v414, v991); float32x2_t v885 = vfma_f32(v884, v692, v1269); @@ -11010,10 +6558,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1289 = vmul_f32(v1288, v1287); float32x2_t v1301 = vmul_f32(v1300, v1299); float32x2_t v1319 = vmul_f32(v1318, v1317); - float32x2_t v777 = vsub_f32(v128, v776); - v6[0] = v798; - float32x2_t v810 = vmul_f32(v809, v1377); - float32x2_t v823 = vmul_f32(v822, v1377); + float32x2_t v811 = vsub_f32(v792, v810); + v6[ostride * 10] = v824; + float32x2_t v834 = vsub_f32(v833, v824); + 
float32x2_t v843 = vmul_f32(v792, v1398); float32x2_t v886 = vsub_f32(v861, v885); float32x2_t v890 = vmul_f32(v861, v1398); float32x2_t v904 = vsub_f32(v873, v903); @@ -11030,8 +6578,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1290 = vfma_f32(v1289, v432, v1281); float32x2_t v1302 = vfma_f32(v1301, v710, v1293); float32x2_t v1320 = vfma_f32(v1319, v571, v1311); - float32x2_t v787 = vsub_f32(v777, v786); - float32x2_t v791 = vmul_f32(v777, v1398); + v6[ostride * 5] = v811; + v6[ostride * 15] = v834; + float32x2_t v844 = vsub_f32(v843, v811); float32x2_t v891 = vsub_f32(v890, v886); float32x2_t v909 = vsub_f32(v908, v904); float32x2_t v920 = vmul_f32(v904, v1351); @@ -11048,9 +6597,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1307 = vmul_f32(v1278, v1398); float32x2_t v1321 = vsub_f32(v1290, v1320); float32x2_t v1325 = vmul_f32(v1290, v1398); - float32x2_t v792 = vsub_f32(v791, v787); - float32x2_t v824 = vsub_f32(v787, v823); - float32x2_t v833 = vmul_f32(v787, v1398); + v6[ostride * 20] = v844; float32x2_t v910 = vadd_f32(v891, v909); float32x2_t v911 = vsub_f32(v891, v909); float32x2_t v921 = vadd_f32(v886, v920); @@ -11068,10 +6615,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1326 = vsub_f32(v1325, v1321); float32x2_t v1337 = vmul_f32(v1321, v1351); float32x2_t v1352 = vmul_f32(v1303, v1351); - float32x2_t v811 = vsub_f32(v792, v810); - v6[ostride * 10] = v824; - float32x2_t v834 = vsub_f32(v833, v824); - float32x2_t v843 = vmul_f32(v792, v1398); float32x2_t v915 = vmul_f32(v910, v1331); float32x2_t v925 = vmul_f32(v911, v1341); float32x2_t v937 = vadd_f32(v136, v910); @@ -11090,9 +6633,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1328 = vsub_f32(v1308, v1326); float32x2_t v1338 = vadd_f32(v1303, v1337); float32x2_t v1353 = vsub_f32(v1352, v1321); - 
v6[ostride * 5] = v811; - v6[ostride * 15] = v834; - float32x2_t v844 = vsub_f32(v843, v811); float32x2_t v916 = vsub_f32(v136, v915); v6[ostride] = v937; float32x2_t v949 = vmul_f32(v948, v1377); @@ -11108,7 +6648,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1354 = vadd_f32(v154, v1327); float32x2_t v1365 = vrev64_f32(v1338); float32x2_t v1378 = vrev64_f32(v1353); - v6[ostride * 20] = v844; float32x2_t v926 = vsub_f32(v916, v925); float32x2_t v930 = vmul_f32(v916, v1398); float32x2_t v1070 = vsub_f32(v1069, v1065); @@ -11206,7 +6745,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, float v1545 = 2.5000000000000000e-01F; float v1557 = 5.5901699437494745e-01F; float v1569 = 6.1803398874989490e-01F; - float v1597 = 0.0000000000000000e+00F; float v1598 = -9.5105651629515353e-01F; float v1626 = 2.0000000000000000e+00F; const float32x2_t *v1709 = &v5[v0]; @@ -11234,7 +6772,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, int64_t v681 = v0 * 14; int64_t v688 = v0 * 19; int64_t v695 = v0 * 24; - float v883 = v4 * v1597; int64_t v943 = v2 * 5; int64_t v958 = v2 * 10; int64_t v971 = v2 * 15; @@ -11270,6 +6807,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, int64_t v1632 = v2 * 24; const float32x2_t *v1645 = &v5[0]; svint64_t v1938 = svindex_s64(0, v1); + svfloat32_t v1967 = svdup_n_f32(0); float32x2_t *v1981 = &v6[0]; svfloat32_t v2024 = svdup_n_f32(v991); svfloat32_t v2088 = svdup_n_f32(v1153); @@ -11310,7 +6848,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v1919 = &v5[v681]; const float32x2_t *v1928 = &v5[v688]; const float32x2_t *v1937 = &v5[v695]; - svfloat32_t v1967 = svdup_n_f32(v883); float32x2_t *v1991 = &v6[v943]; float32x2_t *v2001 = &v6[v958]; float32x2_t *v2011 = &v6[v971]; @@ -11410,26 +6947,16 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const 
armral_cmplx_f32_t *restrict x, svfloat32_t v726 = svcmla_f32_x(pred_full, v1921, v1967, v1921, 90); svfloat32_t v739 = svcmla_f32_x(pred_full, v1939, v1967, v1939, 90); svfloat32_t v759 = svcmla_f32_x(pred_full, v1930, v1967, v1930, 90); - svfloat32_t v92; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v65), "w"(v91)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v78), "w"(v111)); - svfloat32_t v254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v227), "w"(v253)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v240), "w"(v273)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v389), "w"(v415)); - svfloat32_t v436; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v402), "w"(v435)); - svfloat32_t v578; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v551), "w"(v577)); - svfloat32_t v598; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v564), "w"(v597)); - svfloat32_t v740; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v740) : "w"(v713), "w"(v739)); - svfloat32_t v760; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v726), "w"(v759)); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v65, v91); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v78, v111); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v227, v253); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v240, v273); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v389, v415); + svfloat32_t v436 = svsub_f32_x(svptrue_b32(), v402, v435); + svfloat32_t v578 = svsub_f32_x(svptrue_b32(), v551, v577); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v564, v597); + svfloat32_t v740 = svsub_f32_x(svptrue_b32(), v713, v739); + svfloat32_t v760 = svsub_f32_x(svptrue_b32(), v726, v759); svfloat32_t v98 = svnmls_f32_x(pred_full, v92, v65, v2270); svfloat32_t v118 = svnmls_f32_x(pred_full, v112, v78, v2270); svfloat32_t v260 = svnmls_f32_x(pred_full, v254, v227, v2270); @@ -11440,75 +6967,50 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t 
*restrict x, svfloat32_t v604 = svnmls_f32_x(pred_full, v598, v564, v2270); svfloat32_t v746 = svnmls_f32_x(pred_full, v740, v713, v2270); svfloat32_t v766 = svnmls_f32_x(pred_full, v760, v726, v2270); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v98), "w"(v118)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v98), "w"(v118)); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v98, v118); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v98, v118); svfloat32_t v132 = svmla_f32_x(pred_full, v92, v112, v2230); svfloat32_t v150 = svnmls_f32_x(pred_full, v112, v92, v2230); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v260), "w"(v280)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v260), "w"(v280)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v260, v280); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v260, v280); svfloat32_t v294 = svmla_f32_x(pred_full, v254, v274, v2230); svfloat32_t v312 = svnmls_f32_x(pred_full, v274, v254, v2230); - svfloat32_t v443; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v422), "w"(v442)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v422), "w"(v442)); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v422, v442); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v422, v442); svfloat32_t v456 = svmla_f32_x(pred_full, v416, v436, v2230); svfloat32_t v474 = svnmls_f32_x(pred_full, v436, v416, v2230); - svfloat32_t v605; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v584), "w"(v604)); - svfloat32_t v606; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v584), "w"(v604)); + svfloat32_t v605 = svadd_f32_x(svptrue_b32(), v584, v604); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v584, v604); svfloat32_t v618 = svmla_f32_x(pred_full, v578, v598, v2230); svfloat32_t v636 = svnmls_f32_x(pred_full, v598, v578, v2230); - svfloat32_t v767; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v746), "w"(v766)); - svfloat32_t v768; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v746), "w"(v766)); + svfloat32_t v767 = svadd_f32_x(svptrue_b32(), v746, v766); + svfloat32_t v768 = svsub_f32_x(svptrue_b32(), v746, v766); svfloat32_t v780 = svmla_f32_x(pred_full, v740, v760, v2230); svfloat32_t v798 = svnmls_f32_x(pred_full, v760, v740, v2230); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v1647), "w"(v119)); - svfloat32_t zero158; - asm volatile("mov %0.s, #0" : "=w"(zero158)); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v1647, v119); + svfloat32_t zero158 = svdup_n_f32(0); svfloat32_t v158 = svcmla_f32_x(pred_full, zero158, v2250, v132, 90); - svfloat32_t zero166; - asm volatile("mov %0.s, #0" : "=w"(zero166)); + svfloat32_t zero166 = svdup_n_f32(0); svfloat32_t v166 = svcmla_f32_x(pred_full, zero166, v2250, v150, 90); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v1711), "w"(v281)); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v1711, v281); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v2250, v294, 90); - svfloat32_t zero328; - asm volatile("mov %0.s, #0" : "=w"(zero328)); + svfloat32_t zero328 = svdup_n_f32(0); svfloat32_t v328 = svcmla_f32_x(pred_full, zero328, v2250, v312, 90); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v1775), "w"(v443)); - svfloat32_t zero482; - asm volatile("mov %0.s, #0" : "=w"(zero482)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v1775, v443); + svfloat32_t zero482 = svdup_n_f32(0); svfloat32_t v482 = svcmla_f32_x(pred_full, zero482, v2250, v456, 90); - svfloat32_t zero490; - asm volatile("mov %0.s, #0" : "=w"(zero490)); + svfloat32_t zero490 = svdup_n_f32(0); svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v2250, v474, 90); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v1839), "w"(v605)); - svfloat32_t zero644; - asm volatile("mov %0.s, #0" : 
"=w"(zero644)); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v1839, v605); + svfloat32_t zero644 = svdup_n_f32(0); svfloat32_t v644 = svcmla_f32_x(pred_full, zero644, v2250, v618, 90); - svfloat32_t zero652; - asm volatile("mov %0.s, #0" : "=w"(zero652)); + svfloat32_t zero652 = svdup_n_f32(0); svfloat32_t v652 = svcmla_f32_x(pred_full, zero652, v2250, v636, 90); - svfloat32_t v799; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v1903), "w"(v767)); - svfloat32_t zero806; - asm volatile("mov %0.s, #0" : "=w"(zero806)); + svfloat32_t v799 = svadd_f32_x(svptrue_b32(), v1903, v767); + svfloat32_t zero806 = svdup_n_f32(0); svfloat32_t v806 = svcmla_f32_x(pred_full, zero806, v2250, v780, 90); - svfloat32_t zero814; - asm volatile("mov %0.s, #0" : "=w"(zero814)); + svfloat32_t zero814 = svdup_n_f32(0); svfloat32_t v814 = svcmla_f32_x(pred_full, zero814, v2250, v798, 90); svfloat32_t v126 = svmls_f32_x(pred_full, v1647, v119, v2226); svfloat32_t v288 = svmls_f32_x(pred_full, v1711, v281, v2226); @@ -11525,119 +7027,80 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v866 = svcmla_f32_x(pred_full, v799, v1967, v799, 90); svfloat32_t v886 = svcmla_f32_x(pred_full, v637, v1967, v637, 90); svfloat32_t v144 = svnmls_f32_x(pred_full, v138, v126, v2270); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v138), "w"(v166)); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v138, v166); svfloat32_t v306 = svnmls_f32_x(pred_full, v300, v288, v2270); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v300), "w"(v328)); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v300, v328); svfloat32_t v468 = svnmls_f32_x(pred_full, v462, v450, v2270); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v462), "w"(v490)); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v462, v490); svfloat32_t v630 = svnmls_f32_x(pred_full, v624, v612, v2270); - svfloat32_t v653; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v653) : "w"(v624), "w"(v652)); + svfloat32_t v653 = svsub_f32_x(svptrue_b32(), v624, v652); svfloat32_t v792 = svnmls_f32_x(pred_full, v786, v774, v2270); - svfloat32_t v815; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v815) : "w"(v786), "w"(v814)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v840), "w"(v866)); - svfloat32_t v887; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v887) : "w"(v853), "w"(v886)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v144), "w"(v158)); + svfloat32_t v815 = svsub_f32_x(svptrue_b32(), v786, v814); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v840, v866); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v853, v886); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v144, v158); svfloat32_t v173 = svnmls_f32_x(pred_full, v167, v138, v2270); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v306), "w"(v320)); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v306, v320); svfloat32_t v335 = svnmls_f32_x(pred_full, v329, v300, v2270); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v468), "w"(v482)); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v468, v482); svfloat32_t v497 = svnmls_f32_x(pred_full, v491, v462, v2270); - svfloat32_t v645; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v645) : "w"(v630), "w"(v644)); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v630, v644); svfloat32_t v659 = svnmls_f32_x(pred_full, v653, v624, v2270); - svfloat32_t v807; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v807) : "w"(v792), "w"(v806)); + svfloat32_t v807 = svsub_f32_x(svptrue_b32(), v792, v806); svfloat32_t v821 = svnmls_f32_x(pred_full, v815, v786, v2270); svfloat32_t v873 = svnmls_f32_x(pred_full, v867, v840, v2270); svfloat32_t v893 = svnmls_f32_x(pred_full, v887, v853, v2270); - svfloat32_t v1156; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1156) : "w"(v329), "w"(v2088)); - svfloat32_t v1169; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1169) : "w"(v491), "w"(v2216)); - svfloat32_t v1182; - 
asm("fmul %0.s, %1.s, %2.s" : "=w"(v1182) : "w"(v815), "w"(v2218)); - svfloat32_t v1202; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1202) : "w"(v653), "w"(v2154)); + svfloat32_t v1156 = svmul_f32_x(svptrue_b32(), v329, v2088); + svfloat32_t v1169 = svmul_f32_x(svptrue_b32(), v491, v2216); + svfloat32_t v1182 = svmul_f32_x(svptrue_b32(), v815, v2218); + svfloat32_t v1202 = svmul_f32_x(svptrue_b32(), v653, v2154); svfloat32_t v179 = svnmls_f32_x(pred_full, v159, v144, v2270); svfloat32_t v341 = svnmls_f32_x(pred_full, v321, v306, v2270); svfloat32_t v503 = svnmls_f32_x(pred_full, v483, v468, v2270); svfloat32_t v665 = svnmls_f32_x(pred_full, v645, v630, v2270); svfloat32_t v827 = svnmls_f32_x(pred_full, v807, v792, v2270); - svfloat32_t v894; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v873), "w"(v893)); - svfloat32_t v895; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v873), "w"(v893)); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v873, v893); + svfloat32_t v895 = svsub_f32_x(svptrue_b32(), v873, v893); svfloat32_t v907 = svmla_f32_x(pred_full, v867, v887, v2230); svfloat32_t v925 = svnmls_f32_x(pred_full, v887, v867, v2230); - svfloat32_t v994; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v994) : "w"(v321), "w"(v2024)); - svfloat32_t v1007; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1007) : "w"(v483), "w"(v2088)); - svfloat32_t v1020; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1020) : "w"(v807), "w"(v2216)); - svfloat32_t v1040; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1040) : "w"(v645), "w"(v2152)); + svfloat32_t v994 = svmul_f32_x(svptrue_b32(), v321, v2024); + svfloat32_t v1007 = svmul_f32_x(svptrue_b32(), v483, v2088); + svfloat32_t v1020 = svmul_f32_x(svptrue_b32(), v807, v2216); + svfloat32_t v1040 = svmul_f32_x(svptrue_b32(), v645, v2152); svfloat32_t v1164 = svcmla_f32_x(pred_full, v1156, v2089, v329, 90); svfloat32_t v1177 = svcmla_f32_x(pred_full, v1169, v2217, v491, 90); svfloat32_t v1190 = svcmla_f32_x(pred_full, v1182, v2219, v815, 90); svfloat32_t v1210 = 
svcmla_f32_x(pred_full, v1202, v2155, v653, 90); - svfloat32_t v1318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1318) : "w"(v335), "w"(v2152)); - svfloat32_t v1331; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1331) : "w"(v497), "w"(v2154)); - svfloat32_t v1344; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1344) : "w"(v821), "w"(v2223)); - svfloat32_t v1364; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1364) : "w"(v659), "w"(v2220)); - svfloat32_t v926; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v926) : "w"(v151), "w"(v894)); - svfloat32_t zero940; - asm volatile("mov %0.s, #0" : "=w"(zero940)); + svfloat32_t v1318 = svmul_f32_x(svptrue_b32(), v335, v2152); + svfloat32_t v1331 = svmul_f32_x(svptrue_b32(), v497, v2154); + svfloat32_t v1344 = svmul_f32_x(svptrue_b32(), v821, v2223); + svfloat32_t v1364 = svmul_f32_x(svptrue_b32(), v659, v2220); + svfloat32_t v926 = svadd_f32_x(svptrue_b32(), v151, v894); + svfloat32_t zero940 = svdup_n_f32(0); svfloat32_t v940 = svcmla_f32_x(pred_full, zero940, v2250, v907, 90); - svfloat32_t zero955; - asm volatile("mov %0.s, #0" : "=w"(zero955)); + svfloat32_t zero955 = svdup_n_f32(0); svfloat32_t v955 = svcmla_f32_x(pred_full, zero955, v2250, v925, 90); svfloat32_t v1002 = svcmla_f32_x(pred_full, v994, v2025, v321, 90); svfloat32_t v1015 = svcmla_f32_x(pred_full, v1007, v2089, v483, 90); svfloat32_t v1028 = svcmla_f32_x(pred_full, v1020, v2217, v807, 90); svfloat32_t v1048 = svcmla_f32_x(pred_full, v1040, v2153, v645, 90); - svfloat32_t v1191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1191) : "w"(v1164), "w"(v1190)); - svfloat32_t v1211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1211) : "w"(v1177), "w"(v1210)); + svfloat32_t v1191 = svsub_f32_x(svptrue_b32(), v1164, v1190); + svfloat32_t v1211 = svsub_f32_x(svptrue_b32(), v1177, v1210); svfloat32_t v1326 = svcmla_f32_x(pred_full, v1318, v2153, v335, 90); svfloat32_t v1339 = svcmla_f32_x(pred_full, v1331, v2155, v497, 90); svfloat32_t v1352 = svcmla_f32_x(pred_full, v1344, v2224, v821, 90); svfloat32_t v1372 = 
svcmla_f32_x(pred_full, v1364, v2160, v659, 90); - svfloat32_t v1480; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1480) : "w"(v341), "w"(v2216)); - svfloat32_t v1493; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1493) : "w"(v503), "w"(v2218)); - svfloat32_t v1506; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1506) : "w"(v827), "w"(v2220)); - svfloat32_t v1526; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1526) : "w"(v665), "w"(v2223)); + svfloat32_t v1480 = svmul_f32_x(svptrue_b32(), v341, v2216); + svfloat32_t v1493 = svmul_f32_x(svptrue_b32(), v503, v2218); + svfloat32_t v1506 = svmul_f32_x(svptrue_b32(), v827, v2220); + svfloat32_t v1526 = svmul_f32_x(svptrue_b32(), v665, v2223); svfloat32_t v901 = svmls_f32_x(pred_full, v151, v894, v2226); - svfloat32_t v1029; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1029) : "w"(v1002), "w"(v1028)); - svfloat32_t v1049; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1049) : "w"(v1015), "w"(v1048)); + svfloat32_t v1029 = svsub_f32_x(svptrue_b32(), v1002, v1028); + svfloat32_t v1049 = svsub_f32_x(svptrue_b32(), v1015, v1048); svfloat32_t v1197 = svnmls_f32_x(pred_full, v1191, v1164, v2270); svfloat32_t v1217 = svnmls_f32_x(pred_full, v1211, v1177, v2270); - svfloat32_t v1353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1353) : "w"(v1326), "w"(v1352)); - svfloat32_t v1373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1373) : "w"(v1339), "w"(v1372)); + svfloat32_t v1353 = svsub_f32_x(svptrue_b32(), v1326, v1352); + svfloat32_t v1373 = svsub_f32_x(svptrue_b32(), v1339, v1372); svfloat32_t v1488 = svcmla_f32_x(pred_full, v1480, v2217, v341, 90); svfloat32_t v1501 = svcmla_f32_x(pred_full, v1493, v2219, v503, 90); svfloat32_t v1514 = svcmla_f32_x(pred_full, v1506, v2221, v827, 90); @@ -11646,67 +7109,46 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v913 = svmls_f32_x(pred_full, v901, v895, v2228); svfloat32_t v1035 = svnmls_f32_x(pred_full, v1029, v1002, v2270); svfloat32_t v1055 = svnmls_f32_x(pred_full, v1049, v1015, v2270); - 
svfloat32_t v1218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1218) : "w"(v1197), "w"(v1217)); - svfloat32_t v1219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1219) : "w"(v1197), "w"(v1217)); + svfloat32_t v1218 = svadd_f32_x(svptrue_b32(), v1197, v1217); + svfloat32_t v1219 = svsub_f32_x(svptrue_b32(), v1197, v1217); svfloat32_t v1231 = svmla_f32_x(pred_full, v1191, v1211, v2230); svfloat32_t v1249 = svnmls_f32_x(pred_full, v1211, v1191, v2230); svfloat32_t v1359 = svnmls_f32_x(pred_full, v1353, v1326, v2270); svfloat32_t v1379 = svnmls_f32_x(pred_full, v1373, v1339, v2270); - svfloat32_t v1515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1515) : "w"(v1488), "w"(v1514)); - svfloat32_t v1535; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1535) : "w"(v1501), "w"(v1534)); + svfloat32_t v1515 = svsub_f32_x(svptrue_b32(), v1488, v1514); + svfloat32_t v1535 = svsub_f32_x(svptrue_b32(), v1501, v1534); svfloat32_t v919 = svnmls_f32_x(pred_full, v913, v901, v2270); - svfloat32_t v956; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v956) : "w"(v913), "w"(v955)); - svfloat32_t v1056; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1056) : "w"(v1035), "w"(v1055)); - svfloat32_t v1057; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1057) : "w"(v1035), "w"(v1055)); + svfloat32_t v956 = svsub_f32_x(svptrue_b32(), v913, v955); + svfloat32_t v1056 = svadd_f32_x(svptrue_b32(), v1035, v1055); + svfloat32_t v1057 = svsub_f32_x(svptrue_b32(), v1035, v1055); svfloat32_t v1069 = svmla_f32_x(pred_full, v1029, v1049, v2230); svfloat32_t v1087 = svnmls_f32_x(pred_full, v1049, v1029, v2230); - svfloat32_t v1250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1250) : "w"(v167), "w"(v1218)); - svfloat32_t zero1264; - asm volatile("mov %0.s, #0" : "=w"(zero1264)); + svfloat32_t v1250 = svadd_f32_x(svptrue_b32(), v167, v1218); + svfloat32_t zero1264 = svdup_n_f32(0); svfloat32_t v1264 = svcmla_f32_x(pred_full, zero1264, v2250, v1231, 90); - svfloat32_t zero1279; - asm volatile("mov %0.s, #0" : "=w"(zero1279)); + svfloat32_t zero1279 = svdup_n_f32(0); 
svfloat32_t v1279 = svcmla_f32_x(pred_full, zero1279, v2250, v1249, 90); - svfloat32_t v1380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1380) : "w"(v1359), "w"(v1379)); - svfloat32_t v1381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1381) : "w"(v1359), "w"(v1379)); + svfloat32_t v1380 = svadd_f32_x(svptrue_b32(), v1359, v1379); + svfloat32_t v1381 = svsub_f32_x(svptrue_b32(), v1359, v1379); svfloat32_t v1393 = svmla_f32_x(pred_full, v1353, v1373, v2230); svfloat32_t v1411 = svnmls_f32_x(pred_full, v1373, v1353, v2230); svfloat32_t v1521 = svnmls_f32_x(pred_full, v1515, v1488, v2270); svfloat32_t v1541 = svnmls_f32_x(pred_full, v1535, v1501, v2270); - svfloat32_t v941; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v941) : "w"(v919), "w"(v940)); + svfloat32_t v941 = svsub_f32_x(svptrue_b32(), v919, v940); svfloat32_t v969 = svnmls_f32_x(pred_full, v956, v913, v2270); - svfloat32_t v1088; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1088) : "w"(v159), "w"(v1056)); - svfloat32_t zero1102; - asm volatile("mov %0.s, #0" : "=w"(zero1102)); + svfloat32_t v1088 = svadd_f32_x(svptrue_b32(), v159, v1056); + svfloat32_t zero1102 = svdup_n_f32(0); svfloat32_t v1102 = svcmla_f32_x(pred_full, zero1102, v2250, v1069, 90); - svfloat32_t zero1117; - asm volatile("mov %0.s, #0" : "=w"(zero1117)); + svfloat32_t zero1117 = svdup_n_f32(0); svfloat32_t v1117 = svcmla_f32_x(pred_full, zero1117, v2250, v1087, 90); svfloat32_t v1225 = svmls_f32_x(pred_full, v167, v1218, v2226); - svfloat32_t v1412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1412) : "w"(v173), "w"(v1380)); - svfloat32_t zero1426; - asm volatile("mov %0.s, #0" : "=w"(zero1426)); + svfloat32_t v1412 = svadd_f32_x(svptrue_b32(), v173, v1380); + svfloat32_t zero1426 = svdup_n_f32(0); svfloat32_t v1426 = svcmla_f32_x(pred_full, zero1426, v2250, v1393, 90); - svfloat32_t zero1441; - asm volatile("mov %0.s, #0" : "=w"(zero1441)); + svfloat32_t zero1441 = svdup_n_f32(0); svfloat32_t v1441 = svcmla_f32_x(pred_full, zero1441, v2250, v1411, 90); - svfloat32_t 
v1542; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1542) : "w"(v1521), "w"(v1541)); - svfloat32_t v1543; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1543) : "w"(v1521), "w"(v1541)); + svfloat32_t v1542 = svadd_f32_x(svptrue_b32(), v1521, v1541); + svfloat32_t v1543 = svsub_f32_x(svptrue_b32(), v1521, v1541); svfloat32_t v1555 = svmla_f32_x(pred_full, v1515, v1535, v2230); svfloat32_t v1573 = svnmls_f32_x(pred_full, v1535, v1515, v2230); svst1_f64(pred_full, (double *)(v2001), svreinterpret_f64_f32(v956)); @@ -11715,13 +7157,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1063 = svmls_f32_x(pred_full, v159, v1056, v2226); svfloat32_t v1237 = svmls_f32_x(pred_full, v1225, v1219, v2228); svfloat32_t v1387 = svmls_f32_x(pred_full, v173, v1380, v2226); - svfloat32_t v1574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1574) : "w"(v179), "w"(v1542)); - svfloat32_t zero1588; - asm volatile("mov %0.s, #0" : "=w"(zero1588)); + svfloat32_t v1574 = svadd_f32_x(svptrue_b32(), v179, v1542); + svfloat32_t zero1588 = svdup_n_f32(0); svfloat32_t v1588 = svcmla_f32_x(pred_full, zero1588, v2250, v1555, 90); - svfloat32_t zero1603; - asm volatile("mov %0.s, #0" : "=w"(zero1603)); + svfloat32_t zero1603 = svdup_n_f32(0); svfloat32_t v1603 = svcmla_f32_x(pred_full, zero1603, v2250, v1573, 90); svst1_f64(pred_full, (double *)(v1991), svreinterpret_f64_f32(v941)); svst1_f64(pred_full, (double *)(v2011), svreinterpret_f64_f32(v969)); @@ -11729,41 +7168,33 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu25(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v2173), svreinterpret_f64_f32(v1412)); svfloat32_t v1075 = svmls_f32_x(pred_full, v1063, v1057, v2228); svfloat32_t v1243 = svnmls_f32_x(pred_full, v1237, v1225, v2270); - svfloat32_t v1280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1280) : "w"(v1237), "w"(v1279)); + svfloat32_t v1280 = svsub_f32_x(svptrue_b32(), v1237, v1279); svfloat32_t v1399 = svmls_f32_x(pred_full, v1387, v1381, v2228); 
svfloat32_t v1549 = svmls_f32_x(pred_full, v179, v1542, v2226); svst1_f64(pred_full, (double *)(v2021), svreinterpret_f64_f32(v982)); svst1_f64(pred_full, (double *)(v2237), svreinterpret_f64_f32(v1574)); svfloat32_t v1081 = svnmls_f32_x(pred_full, v1075, v1063, v2270); - svfloat32_t v1118; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1118) : "w"(v1075), "w"(v1117)); - svfloat32_t v1265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1265) : "w"(v1243), "w"(v1264)); + svfloat32_t v1118 = svsub_f32_x(svptrue_b32(), v1075, v1117); + svfloat32_t v1265 = svsub_f32_x(svptrue_b32(), v1243, v1264); svfloat32_t v1293 = svnmls_f32_x(pred_full, v1280, v1237, v2270); svfloat32_t v1405 = svnmls_f32_x(pred_full, v1399, v1387, v2270); - svfloat32_t v1442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1442) : "w"(v1399), "w"(v1441)); + svfloat32_t v1442 = svsub_f32_x(svptrue_b32(), v1399, v1441); svfloat32_t v1561 = svmls_f32_x(pred_full, v1549, v1543, v2228); svst1_f64(pred_full, (double *)(v2129), svreinterpret_f64_f32(v1280)); - svfloat32_t v1103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1103) : "w"(v1081), "w"(v1102)); + svfloat32_t v1103 = svsub_f32_x(svptrue_b32(), v1081, v1102); svfloat32_t v1131 = svnmls_f32_x(pred_full, v1118, v1075, v2270); svfloat32_t v1306 = svnmls_f32_x(pred_full, v1265, v1243, v2270); - svfloat32_t v1427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1427) : "w"(v1405), "w"(v1426)); + svfloat32_t v1427 = svsub_f32_x(svptrue_b32(), v1405, v1426); svfloat32_t v1455 = svnmls_f32_x(pred_full, v1442, v1399, v2270); svfloat32_t v1567 = svnmls_f32_x(pred_full, v1561, v1549, v2270); - svfloat32_t v1604; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1604) : "w"(v1561), "w"(v1603)); + svfloat32_t v1604 = svsub_f32_x(svptrue_b32(), v1561, v1603); svst1_f64(pred_full, (double *)(v2065), svreinterpret_f64_f32(v1118)); svst1_f64(pred_full, (double *)(v2119), svreinterpret_f64_f32(v1265)); svst1_f64(pred_full, (double *)(v2139), svreinterpret_f64_f32(v1293)); svst1_f64(pred_full, (double *)(v2193), 
svreinterpret_f64_f32(v1442)); svfloat32_t v1144 = svnmls_f32_x(pred_full, v1103, v1081, v2270); svfloat32_t v1468 = svnmls_f32_x(pred_full, v1427, v1405, v2270); - svfloat32_t v1589; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1589) : "w"(v1567), "w"(v1588)); + svfloat32_t v1589 = svsub_f32_x(svptrue_b32(), v1567, v1588); svfloat32_t v1617 = svnmls_f32_x(pred_full, v1604, v1561, v2270); svst1_f64(pred_full, (double *)(v2055), svreinterpret_f64_f32(v1103)); svst1_f64(pred_full, (double *)(v2075), svreinterpret_f64_f32(v1131)); @@ -12237,7 +7668,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu32(const armral_cmplx_f32_t *restrict x, float v1064 = 9.8078528040323043e-01F; float v1071 = -5.5557023301960218e-01F; float v1076 = -8.3146961230254524e-01F; - float v1087 = 1.0000000000000000e+00F; const float32x2_t *v1301 = &v5[v0]; float32x2_t *v1502 = &v6[v2]; int64_t v26 = v0 * 16; @@ -12305,7 +7735,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu32(const armral_cmplx_f32_t *restrict x, int64_t v1052 = v2 * 30; float v1067 = v4 * v1064; float v1079 = v4 * v1076; - float v1090 = v4 * v1087; int64_t v1098 = v2 * 7; int64_t v1105 = v2 * 15; int64_t v1112 = v2 * 23; @@ -12325,6 +7754,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1698 = svdup_n_f32(v1004); svfloat32_t v1737 = svdup_n_f32(v1059); svfloat32_t v1739 = svdup_n_f32(v1071); + svfloat32_t v1741 = svdup_n_f32(v4); svfloat32_t v1134 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1132), v1445)); const float32x2_t *v1141 = &v5[v26]; @@ -12394,7 +7824,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu32(const armral_cmplx_f32_t *restrict x, float32x2_t *v1734 = &v6[v1052]; svfloat32_t v1738 = svdup_n_f32(v1067); svfloat32_t v1740 = svdup_n_f32(v1079); - svfloat32_t v1741 = svdup_n_f32(v1090); float32x2_t *v1748 = &v6[v1098]; float32x2_t *v1757 = &v6[v1105]; float32x2_t *v1766 = &v6[v1112]; @@ -12459,293 +7888,162 @@ void 
armral_fft_cf32_cf32_cf32_ac_n_gu32(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1435), v1445)); svfloat32_t v1446 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1444), v1445)); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1134), "w"(v1143)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1134), "w"(v1143)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1152), "w"(v1161)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1152), "w"(v1161)); - svfloat32_t v75; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v1171), "w"(v1180)); - svfloat32_t v76; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v76) : "w"(v1171), "w"(v1180)); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v1189), "w"(v1198)); - svfloat32_t v92; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v1189), "w"(v1198)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v1213), "w"(v1222)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v1213), "w"(v1222)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v1231), "w"(v1240)); - svfloat32_t v176; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1231), "w"(v1240)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v1250), "w"(v1259)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v1250), "w"(v1259)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v1268), "w"(v1277)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v1268), "w"(v1277)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v1303), "w"(v1312)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v1303), "w"(v1312)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v1321), "w"(v1330)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v392) : "w"(v1321), "w"(v1330)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v1340), "w"(v1349)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v1340), "w"(v1349)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v1358), "w"(v1367)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v1358), "w"(v1367)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v1382), "w"(v1391)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v1382), "w"(v1391)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v1400), "w"(v1409)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v1400), "w"(v1409)); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v1419), "w"(v1428)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v1419), "w"(v1428)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v1437), "w"(v1446)); - svfloat32_t v562; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v1437), "w"(v1446)); - svfloat32_t zero56; - asm volatile("mov %0.s, #0" : "=w"(zero56)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1134, v1143); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1134, v1143); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1152, v1161); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1152, v1161); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v1171, v1180); + svfloat32_t v76 = svsub_f32_x(svptrue_b32(), v1171, v1180); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v1189, v1198); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v1189, v1198); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v1213, v1222); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v1213, v1222); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v1231, v1240); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v1231, v1240); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v1250, 
v1259); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v1250, v1259); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v1268, v1277); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v1268, v1277); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v1303, v1312); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v1303, v1312); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v1321, v1330); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v1321, v1330); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v1340, v1349); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v1340, v1349); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v1358, v1367); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v1358, v1367); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v1382, v1391); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v1382, v1391); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v1400, v1409); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v1400, v1409); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v1419, v1428); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v1419, v1428); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v1437, v1446); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v1437, v1446); + svfloat32_t zero56 = svdup_n_f32(0); svfloat32_t v56 = svcmla_f32_x(pred_full, zero56, v1617, v49, 90); - svfloat32_t v57; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v32), "w"(v48)); - svfloat32_t v58; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v58) : "w"(v32), "w"(v48)); - svfloat32_t v93; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v75), "w"(v91)); - svfloat32_t v94; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v75), "w"(v91)); - svfloat32_t v110; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v76), "w"(v1614)); - svfloat32_t v122; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v92), "w"(v1616)); - svfloat32_t zero183; - asm volatile("mov %0.s, #0" : "=w"(zero183)); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), 
v32, v48); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v110 = svmul_f32_x(svptrue_b32(), v76, v1614); + svfloat32_t v122 = svmul_f32_x(svptrue_b32(), v92, v1616); + svfloat32_t zero183 = svdup_n_f32(0); svfloat32_t v183 = svcmla_f32_x(pred_full, zero183, v1617, v176, 90); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v159), "w"(v175)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v159), "w"(v175)); - svfloat32_t zero226; - asm volatile("mov %0.s, #0" : "=w"(zero226)); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v159, v175); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v159, v175); + svfloat32_t zero226 = svdup_n_f32(0); svfloat32_t v226 = svcmla_f32_x(pred_full, zero226, v1617, v219, 90); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v202), "w"(v218)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v202), "w"(v218)); - svfloat32_t zero399; - asm volatile("mov %0.s, #0" : "=w"(zero399)); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v202, v218); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v202, v218); + svfloat32_t zero399 = svdup_n_f32(0); svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v1617, v392, 90); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v375), "w"(v391)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v375), "w"(v391)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v418), "w"(v434)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v418), "w"(v434)); - svfloat32_t v453; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v419), "w"(v1614)); - svfloat32_t v465; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v435), "w"(v1616)); - svfloat32_t zero526; - asm volatile("mov %0.s, #0" : "=w"(zero526)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v375, v391); + svfloat32_t v401 = 
svsub_f32_x(svptrue_b32(), v375, v391); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v418, v434); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v418, v434); + svfloat32_t v453 = svmul_f32_x(svptrue_b32(), v419, v1614); + svfloat32_t v465 = svmul_f32_x(svptrue_b32(), v435, v1616); + svfloat32_t zero526 = svdup_n_f32(0); svfloat32_t v526 = svcmla_f32_x(pred_full, zero526, v1617, v519, 90); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v502), "w"(v518)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v502), "w"(v518)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v545), "w"(v561)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v545), "w"(v561)); - svfloat32_t v580; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v546), "w"(v1614)); - svfloat32_t v592; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v562), "w"(v1616)); - svfloat32_t v59; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v59) : "w"(v33), "w"(v56)); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v33), "w"(v56)); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v502, v518); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v502, v518); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v545, v561); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v545, v561); + svfloat32_t v580 = svmul_f32_x(svptrue_b32(), v546, v1614); + svfloat32_t v592 = svmul_f32_x(svptrue_b32(), v562, v1616); + svfloat32_t v59 = svsub_f32_x(svptrue_b32(), v33, v56); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v33, v56); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, zero101, v1617, v94, 90); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v57), "w"(v93)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v57), "w"(v93)); - svfloat32_t v186; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v186) : 
"w"(v160), "w"(v183)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v160), "w"(v183)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v203), "w"(v226)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v203), "w"(v226)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v184), "w"(v227)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v184), "w"(v227)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v185), "w"(v1614)); - svfloat32_t v299; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v228), "w"(v1616)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v376), "w"(v399)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v376), "w"(v399)); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v57, v93); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v57, v93); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v160, v183); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v160, v183); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v203, v226); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v203, v226); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v184, v227); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v184, v227); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v185, v1614); + svfloat32_t v299 = svmul_f32_x(svptrue_b32(), v228, v1616); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v376, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v376, v399); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v1617, v437, 90); - svfloat32_t v445; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v400), "w"(v436)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v400), "w"(v436)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v503), "w"(v526)); - 
svfloat32_t v530; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v503), "w"(v526)); - svfloat32_t zero571; - asm volatile("mov %0.s, #0" : "=w"(zero571)); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v400, v436); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v400, v436); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v503, v526); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v503, v526); + svfloat32_t zero571 = svdup_n_f32(0); svfloat32_t v571 = svcmla_f32_x(pred_full, zero571, v1617, v564, 90); - svfloat32_t v572; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v572) : "w"(v527), "w"(v563)); - svfloat32_t v573; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v573) : "w"(v527), "w"(v563)); - svfloat32_t v104; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v58), "w"(v101)); - svfloat32_t v105; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v58), "w"(v101)); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v527, v563); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), v527, v563); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v58, v101); + svfloat32_t v105 = svadd_f32_x(svptrue_b32(), v58, v101); svfloat32_t v130 = svcmla_f32_x(pred_full, v110, v1741, v110, 90); svfloat32_t v131 = svcmla_f32_x(pred_full, v122, v1617, v122, 90); - svfloat32_t zero239; - asm volatile("mov %0.s, #0" : "=w"(zero239)); + svfloat32_t zero239 = svdup_n_f32(0); svfloat32_t v239 = svcmla_f32_x(pred_full, zero239, v1617, v232, 90); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v102), "w"(v231)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v102), "w"(v231)); - svfloat32_t v248; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v186), "w"(v1532)); - svfloat32_t v260; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v229), "w"(v1696)); - svfloat32_t v326; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v187), "w"(v1696)); - svfloat32_t v338; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v230), "w"(v1698)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v447) : "w"(v401), "w"(v444)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v401), "w"(v444)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v102, v231); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v102, v231); + svfloat32_t v248 = svmul_f32_x(svptrue_b32(), v186, v1532); + svfloat32_t v260 = svmul_f32_x(svptrue_b32(), v229, v1696); + svfloat32_t v326 = svmul_f32_x(svptrue_b32(), v187, v1696); + svfloat32_t v338 = svmul_f32_x(svptrue_b32(), v230, v1698); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v401, v444); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v401, v444); svfloat32_t v473 = svcmla_f32_x(pred_full, v453, v1741, v453, 90); svfloat32_t v474 = svcmla_f32_x(pred_full, v465, v1617, v465, 90); - svfloat32_t v574; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v528), "w"(v571)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v528), "w"(v571)); + svfloat32_t v574 = svsub_f32_x(svptrue_b32(), v528, v571); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v528, v571); svfloat32_t v600 = svcmla_f32_x(pred_full, v580, v1741, v580, 90); svfloat32_t v601 = svcmla_f32_x(pred_full, v592, v1617, v592, 90); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v445), "w"(v572)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v445), "w"(v572)); - svfloat32_t v861; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v861) : "w"(v446), "w"(v1614)); - svfloat32_t v873; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v573), "w"(v1616)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v130), "w"(v131)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v131), "w"(v130)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v103), "w"(v239)); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v103), "w"(v239)); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v445, v572); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), 
v445, v572); + svfloat32_t v861 = svmul_f32_x(svptrue_b32(), v446, v1614); + svfloat32_t v873 = svmul_f32_x(svptrue_b32(), v573, v1616); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v130, v131); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v131, v130); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v103, v239); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v103, v239); svfloat32_t v268 = svcmla_f32_x(pred_full, v248, v1533, v186, 90); svfloat32_t v269 = svcmla_f32_x(pred_full, v260, v1697, v229, 90); svfloat32_t v307 = svcmla_f32_x(pred_full, v287, v1741, v287, 90); svfloat32_t v308 = svcmla_f32_x(pred_full, v299, v1617, v299, 90); svfloat32_t v346 = svcmla_f32_x(pred_full, v326, v1697, v187, 90); svfloat32_t v347 = svcmla_f32_x(pred_full, v338, v1699, v230, 90); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v473), "w"(v474)); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v474), "w"(v473)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v600), "w"(v601)); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v601), "w"(v600)); - svfloat32_t zero623; - asm volatile("mov %0.s, #0" : "=w"(zero623)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v473, v474); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v474, v473); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v600, v601); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v601, v600); + svfloat32_t zero623 = svdup_n_f32(0); svfloat32_t v623 = svcmla_f32_x(pred_full, zero623, v1617, v616, 90); - svfloat32_t v624; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v240), "w"(v615)); - svfloat32_t v625; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v240), "w"(v615)); - svfloat32_t v727; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v447), "w"(v1532)); - svfloat32_t v739; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v574), "w"(v1696)); - svfloat32_t v995; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v995) : "w"(v448), 
"w"(v1696)); - svfloat32_t v1007; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1007) : "w"(v575), "w"(v1698)); - svfloat32_t zero140; - asm volatile("mov %0.s, #0" : "=w"(zero140)); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v240, v615); + svfloat32_t v625 = svsub_f32_x(svptrue_b32(), v240, v615); + svfloat32_t v727 = svmul_f32_x(svptrue_b32(), v447, v1532); + svfloat32_t v739 = svmul_f32_x(svptrue_b32(), v574, v1696); + svfloat32_t v995 = svmul_f32_x(svptrue_b32(), v448, v1696); + svfloat32_t v1007 = svmul_f32_x(svptrue_b32(), v575, v1698); + svfloat32_t zero140 = svdup_n_f32(0); svfloat32_t v140 = svcmla_f32_x(pred_full, zero140, v1741, v133, 90); - svfloat32_t v141; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v59), "w"(v132)); - svfloat32_t v142; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v59), "w"(v132)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v268), "w"(v269)); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v269), "w"(v268)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v307), "w"(v308)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v308), "w"(v307)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v346), "w"(v347)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v347), "w"(v346)); - svfloat32_t zero483; - asm volatile("mov %0.s, #0" : "=w"(zero483)); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v59, v132); + svfloat32_t v142 = svsub_f32_x(svptrue_b32(), v59, v132); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v268, v269); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v269, v268); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v307, v308); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v308, v307); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v346, v347); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v347, v346); + svfloat32_t zero483 = svdup_n_f32(0); svfloat32_t v483 = svcmla_f32_x(pred_full, 
zero483, v1741, v476, 90); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v402), "w"(v475)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v402), "w"(v475)); - svfloat32_t zero610; - asm volatile("mov %0.s, #0" : "=w"(zero610)); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v402, v475); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v402, v475); + svfloat32_t zero610 = svdup_n_f32(0); svfloat32_t v610 = svcmla_f32_x(pred_full, zero610, v1741, v603, 90); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v529), "w"(v602)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v529), "w"(v602)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v241), "w"(v623)); - svfloat32_t v627; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v241), "w"(v623)); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v529, v602); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v529, v602); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v241, v623); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v241, v623); svfloat32_t v747 = svcmla_f32_x(pred_full, v727, v1533, v447, 90); svfloat32_t v748 = svcmla_f32_x(pred_full, v739, v1697, v574, 90); svfloat32_t v881 = svcmla_f32_x(pred_full, v861, v1741, v861, 90); @@ -12754,156 +8052,92 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1016 = svcmla_f32_x(pred_full, v1007, v1699, v575, 90); svst1_f64(pred_full, (double *)(v1461), svreinterpret_f64_f32(v624)); svst1_f64(pred_full, (double *)(v1479), svreinterpret_f64_f32(v625)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v60), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v60), "w"(v140)); - svfloat32_t zero278; - asm volatile("mov %0.s, #0" : "=w"(zero278)); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v60, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v60, v140); + svfloat32_t 
zero278 = svdup_n_f32(0); svfloat32_t v278 = svcmla_f32_x(pred_full, zero278, v1741, v271, 90); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v141), "w"(v270)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v141), "w"(v270)); - svfloat32_t zero317; - asm volatile("mov %0.s, #0" : "=w"(zero317)); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v141, v270); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v141, v270); + svfloat32_t zero317 = svdup_n_f32(0); svfloat32_t v317 = svcmla_f32_x(pred_full, zero317, v1741, v310, 90); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v104), "w"(v309)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v104), "w"(v309)); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v104, v309); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v104, v309); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v1741, v349, 90); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v403), "w"(v483)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v403), "w"(v483)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v530), "w"(v610)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v530), "w"(v610)); - svfloat32_t v660; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v484), "w"(v1491)); - svfloat32_t v672; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v611), "w"(v1573)); - svfloat32_t v749; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v749) : "w"(v747), "w"(v748)); - svfloat32_t v750; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v750) : "w"(v748), "w"(v747)); - svfloat32_t v883; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v883) : "w"(v881), "w"(v882)); - svfloat32_t v884; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v884) : "w"(v882), "w"(v881)); - svfloat32_t v928; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v928) 
: "w"(v485), "w"(v1655)); - svfloat32_t v940; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v940) : "w"(v612), "w"(v1657)); - svfloat32_t v1017; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1017) : "w"(v1015), "w"(v1016)); - svfloat32_t v1018; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1018) : "w"(v1016), "w"(v1015)); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v403, v483); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v403, v483); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v530, v610); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v530, v610); + svfloat32_t v660 = svmul_f32_x(svptrue_b32(), v484, v1491); + svfloat32_t v672 = svmul_f32_x(svptrue_b32(), v611, v1573); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v747, v748); + svfloat32_t v750 = svsub_f32_x(svptrue_b32(), v748, v747); + svfloat32_t v883 = svadd_f32_x(svptrue_b32(), v881, v882); + svfloat32_t v884 = svsub_f32_x(svptrue_b32(), v882, v881); + svfloat32_t v928 = svmul_f32_x(svptrue_b32(), v485, v1655); + svfloat32_t v940 = svmul_f32_x(svptrue_b32(), v612, v1657); + svfloat32_t v1017 = svadd_f32_x(svptrue_b32(), v1015, v1016); + svfloat32_t v1018 = svsub_f32_x(svptrue_b32(), v1016, v1015); svst1_f64(pred_full, (double *)(v1470), svreinterpret_f64_f32(v626)); svst1_f64(pred_full, (double *)(v1488), svreinterpret_f64_f32(v627)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v142), "w"(v278)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v142), "w"(v278)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v105), "w"(v317)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v105), "w"(v317)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v143), "w"(v348)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v143), "w"(v348)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v144), "w"(v356)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v144), 
"w"(v356)); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v142, v278); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v142, v278); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v105, v317); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v105, v317); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v143, v348); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v143, v348); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v144, v356); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v144, v356); svfloat32_t v680 = svcmla_f32_x(pred_full, v660, v1658, v484, 90); svfloat32_t v681 = svcmla_f32_x(pred_full, v672, v1574, v611, 90); - svfloat32_t zero757; - asm volatile("mov %0.s, #0" : "=w"(zero757)); + svfloat32_t zero757 = svdup_n_f32(0); svfloat32_t v757 = svcmla_f32_x(pred_full, zero757, v1741, v750, 90); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v318), "w"(v749)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v318), "w"(v749)); - svfloat32_t v794; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v486), "w"(v1573)); - svfloat32_t v806; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v806) : "w"(v613), "w"(v1575)); - svfloat32_t zero891; - asm volatile("mov %0.s, #0" : "=w"(zero891)); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v318, v749); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v318, v749); + svfloat32_t v794 = svmul_f32_x(svptrue_b32(), v486, v1573); + svfloat32_t v806 = svmul_f32_x(svptrue_b32(), v613, v1575); + svfloat32_t zero891 = svdup_n_f32(0); svfloat32_t v891 = svcmla_f32_x(pred_full, zero891, v1741, v884, 90); - svfloat32_t v892; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v892) : "w"(v242), "w"(v883)); - svfloat32_t v893; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v242), "w"(v883)); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v242, v883); + svfloat32_t v893 = svsub_f32_x(svptrue_b32(), v242, v883); svfloat32_t v948 = svcmla_f32_x(pred_full, v928, v1656, v485, 90); svfloat32_t v949 = 
svcmla_f32_x(pred_full, v940, v1658, v612, 90); - svfloat32_t zero1025; - asm volatile("mov %0.s, #0" : "=w"(zero1025)); + svfloat32_t zero1025 = svdup_n_f32(0); svfloat32_t v1025 = svcmla_f32_x(pred_full, zero1025, v1741, v1018, 90); - svfloat32_t v1062; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1062) : "w"(v487), "w"(v1737)); - svfloat32_t v1074; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1074) : "w"(v614), "w"(v1739)); - svfloat32_t v682; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v680), "w"(v681)); - svfloat32_t v683; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v681), "w"(v680)); - svfloat32_t v760; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v319), "w"(v757)); - svfloat32_t v761; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v761) : "w"(v319), "w"(v757)); + svfloat32_t v1062 = svmul_f32_x(svptrue_b32(), v487, v1737); + svfloat32_t v1074 = svmul_f32_x(svptrue_b32(), v614, v1739); + svfloat32_t v682 = svadd_f32_x(svptrue_b32(), v680, v681); + svfloat32_t v683 = svsub_f32_x(svptrue_b32(), v681, v680); + svfloat32_t v760 = svsub_f32_x(svptrue_b32(), v319, v757); + svfloat32_t v761 = svadd_f32_x(svptrue_b32(), v319, v757); svfloat32_t v814 = svcmla_f32_x(pred_full, v794, v1574, v486, 90); svfloat32_t v815 = svcmla_f32_x(pred_full, v806, v1738, v613, 90); - svfloat32_t v894; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v243), "w"(v891)); - svfloat32_t v895; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v243), "w"(v891)); - svfloat32_t v950; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v950) : "w"(v948), "w"(v949)); - svfloat32_t v951; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v951) : "w"(v949), "w"(v948)); - svfloat32_t v1026; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1026) : "w"(v320), "w"(v1017)); - svfloat32_t v1027; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1027) : "w"(v320), "w"(v1017)); - svfloat32_t v1028; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1028) : "w"(v321), "w"(v1025)); - svfloat32_t v1029; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1029) : "w"(v321), "w"(v1025)); + 
svfloat32_t v894 = svsub_f32_x(svptrue_b32(), v243, v891); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v243, v891); + svfloat32_t v950 = svadd_f32_x(svptrue_b32(), v948, v949); + svfloat32_t v951 = svsub_f32_x(svptrue_b32(), v949, v948); + svfloat32_t v1026 = svadd_f32_x(svptrue_b32(), v320, v1017); + svfloat32_t v1027 = svsub_f32_x(svptrue_b32(), v320, v1017); + svfloat32_t v1028 = svsub_f32_x(svptrue_b32(), v321, v1025); + svfloat32_t v1029 = svadd_f32_x(svptrue_b32(), v321, v1025); svfloat32_t v1082 = svcmla_f32_x(pred_full, v1062, v1738, v487, 90); svfloat32_t v1083 = svcmla_f32_x(pred_full, v1074, v1740, v614, 90); svst1_f64(pred_full, (double *)(v1543), svreinterpret_f64_f32(v758)); svst1_f64(pred_full, (double *)(v1561), svreinterpret_f64_f32(v759)); svst1_f64(pred_full, (double *)(v1625), svreinterpret_f64_f32(v892)); svst1_f64(pred_full, (double *)(v1643), svreinterpret_f64_f32(v893)); - svfloat32_t zero690; - asm volatile("mov %0.s, #0" : "=w"(zero690)); + svfloat32_t zero690 = svdup_n_f32(0); svfloat32_t v690 = svcmla_f32_x(pred_full, zero690, v1741, v683, 90); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v279), "w"(v682)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v279), "w"(v682)); - svfloat32_t v816; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v816) : "w"(v814), "w"(v815)); - svfloat32_t v817; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v817) : "w"(v815), "w"(v814)); - svfloat32_t zero958; - asm volatile("mov %0.s, #0" : "=w"(zero958)); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v279, v682); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v279, v682); + svfloat32_t v816 = svadd_f32_x(svptrue_b32(), v814, v815); + svfloat32_t v817 = svsub_f32_x(svptrue_b32(), v815, v814); + svfloat32_t zero958 = svdup_n_f32(0); svfloat32_t v958 = svcmla_f32_x(pred_full, zero958, v1741, v951, 90); - svfloat32_t v959; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v959) : "w"(v281), "w"(v950)); - svfloat32_t v960; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v960) : "w"(v281), "w"(v950)); - svfloat32_t v1084; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1084) : "w"(v1082), "w"(v1083)); - svfloat32_t v1085; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1085) : "w"(v1083), "w"(v1082)); + svfloat32_t v959 = svadd_f32_x(svptrue_b32(), v281, v950); + svfloat32_t v960 = svsub_f32_x(svptrue_b32(), v281, v950); + svfloat32_t v1084 = svadd_f32_x(svptrue_b32(), v1082, v1083); + svfloat32_t v1085 = svsub_f32_x(svptrue_b32(), v1083, v1082); svst1_f64(pred_full, (double *)(v1552), svreinterpret_f64_f32(v760)); svst1_f64(pred_full, (double *)(v1570), svreinterpret_f64_f32(v761)); svst1_f64(pred_full, (double *)(v1634), svreinterpret_f64_f32(v894)); @@ -12912,40 +8146,26 @@ void armral_fft_cf32_cf32_cf32_ac_n_gu32(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v1716), svreinterpret_f64_f32(v1028)); svst1_f64(pred_full, (double *)(v1725), svreinterpret_f64_f32(v1027)); svst1_f64(pred_full, (double *)(v1734), svreinterpret_f64_f32(v1029)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v280), "w"(v690)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v280), "w"(v690)); - svfloat32_t zero824; - asm volatile("mov %0.s, #0" : "=w"(zero824)); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v280, v690); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v280, v690); + svfloat32_t zero824 = svdup_n_f32(0); svfloat32_t v824 = svcmla_f32_x(pred_full, zero824, v1741, v817, 90); - svfloat32_t v825; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v825) : "w"(v357), "w"(v816)); - svfloat32_t v826; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v826) : "w"(v357), "w"(v816)); - svfloat32_t v961; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v961) : "w"(v282), "w"(v958)); - svfloat32_t v962; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v962) : "w"(v282), "w"(v958)); - svfloat32_t zero1092; - asm volatile("mov %0.s, #0" : "=w"(zero1092)); + svfloat32_t v825 = svadd_f32_x(svptrue_b32(), v357, v816); + svfloat32_t v826 = 
svsub_f32_x(svptrue_b32(), v357, v816); + svfloat32_t v961 = svsub_f32_x(svptrue_b32(), v282, v958); + svfloat32_t v962 = svadd_f32_x(svptrue_b32(), v282, v958); + svfloat32_t zero1092 = svdup_n_f32(0); svfloat32_t v1092 = svcmla_f32_x(pred_full, zero1092, v1741, v1085, 90); - svfloat32_t v1093; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1093) : "w"(v359), "w"(v1084)); - svfloat32_t v1094; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1094) : "w"(v359), "w"(v1084)); + svfloat32_t v1093 = svadd_f32_x(svptrue_b32(), v359, v1084); + svfloat32_t v1094 = svsub_f32_x(svptrue_b32(), v359, v1084); svst1_f64(pred_full, (double *)(v1502), svreinterpret_f64_f32(v691)); svst1_f64(pred_full, (double *)(v1520), svreinterpret_f64_f32(v692)); svst1_f64(pred_full, (double *)(v1666), svreinterpret_f64_f32(v959)); svst1_f64(pred_full, (double *)(v1684), svreinterpret_f64_f32(v960)); - svfloat32_t v827; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v827) : "w"(v358), "w"(v824)); - svfloat32_t v828; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v828) : "w"(v358), "w"(v824)); - svfloat32_t v1095; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1095) : "w"(v360), "w"(v1092)); - svfloat32_t v1096; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1096) : "w"(v360), "w"(v1092)); + svfloat32_t v827 = svsub_f32_x(svptrue_b32(), v358, v824); + svfloat32_t v828 = svadd_f32_x(svptrue_b32(), v358, v824); + svfloat32_t v1095 = svsub_f32_x(svptrue_b32(), v360, v1092); + svfloat32_t v1096 = svadd_f32_x(svptrue_b32(), v360, v1092); svst1_f64(pred_full, (double *)(v1511), svreinterpret_f64_f32(v693)); svst1_f64(pred_full, (double *)(v1529), svreinterpret_f64_f32(v694)); svst1_f64(pred_full, (double *)(v1584), svreinterpret_f64_f32(v825)); diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h index 3c6966f3555bde2fb4fe238dc23003aadd24d1a6..4db92c18f7a83f84bc13fb321379c304e8e82135 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.h @@ -1,6 +1,8 
@@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -16,20 +18,8 @@ typedef void(cf32_cf32_cf32_ac_n_gu_fft_t)(const armral_cmplx_f32_t *x, int ostride, int howmany, int idist, float dir); -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu2; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu3; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu4; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu5; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu6; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu7; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu8; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu9; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu10; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu11; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu12; cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu13; cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu14; -cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu15; cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu16; cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu17; cf32_cf32_cf32_ac_n_gu_fft_t armral_fft_cf32_cf32_cf32_ac_n_gu18; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c index 237b077fa7950c58577d7ee69e1c068b35e9627c..0686272360a2979d6f6a6790ab50d3792b1700dc 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 
Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_cf32_cf32_ac_n_uu.h" @@ -68,10 +70,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu2(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v74)[0])); svfloat32_t v99 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v65)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v99), "w"(v101)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v99), "w"(v101)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v99, v101); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v99, v101); svst1_f64(pred_full, (double *)(v86), svreinterpret_f64_f32(v32)); svst1_f64(pred_full, (double *)(v95), svreinterpret_f64_f32(v33)); v5 += v11; @@ -188,21 +188,15 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu3(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v108)[0])); svfloat32_t v145 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v98)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v143), "w"(v145)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v143), "w"(v145)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v147)); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v147); + svfloat32_t zero58 = svdup_n_f32(0); svfloat32_t v58 = svcmla_f32_x(pred_full, zero58, v113, v33, 90); svfloat32_t v59 = svmla_f32_x(pred_full, v41, v32, v112); svst1_f64(pred_full, (double *)(v121), svreinterpret_f64_f32(v41)); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v59), "w"(v58)); - svfloat32_t v61; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v59), "w"(v58)); + svfloat32_t v60 = 
svadd_f32_x(svptrue_b32(), v59, v58); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v59, v58); svst1_f64(pred_full, (double *)(v130), svreinterpret_f64_f32(v61)); svst1_f64(pred_full, (double *)(v139), svreinterpret_f64_f32(v60)); v5 += v11; @@ -326,25 +320,16 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu4(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v120)[0])); svfloat32_t v189 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v138)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v183), "w"(v185)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v183), "w"(v185)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v187), "w"(v189)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v187), "w"(v189)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, zero73, v144, v49, 90); - svfloat32_t v74; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v74) : "w"(v33), "w"(v73)); - svfloat32_t v75; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v33), "w"(v73)); + svfloat32_t v74 = svadd_f32_x(svptrue_b32(), v33, v73); + svfloat32_t v75 = svsub_f32_x(svptrue_b32(), v33, v73); svst1_f64(pred_full, (double *)(v152), svreinterpret_f64_f32(v50)); svst1_f64(pred_full, (double *)(v170), svreinterpret_f64_f32(v51)); svst1_f64(pred_full, (double 
*)(v161), svreinterpret_f64_f32(v75)); @@ -550,43 +535,28 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu5(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v165)[0])); svfloat32_t v246 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v174)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v240), "w"(v242)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v240), "w"(v242)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v244), "w"(v246)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v244), "w"(v246)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v33), "w"(v49)); - svfloat32_t zero82; - asm volatile("mov %0.s, #0" : "=w"(zero82)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v240, v242); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v240, v242); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t zero82 = svdup_n_f32(0); svfloat32_t v82 = svcmla_f32_x(pred_full, zero82, v190, v33, 90); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v50), "w"(v248)); - svfloat32_t zero89; - asm volatile("mov %0.s, #0" : "=w"(zero89)); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v50, v248); + svfloat32_t zero89 = svdup_n_f32(0); svfloat32_t v89 = svcmla_f32_x(pred_full, zero89, v191, v52, 90); svfloat32_t v97 = svmla_f32_x(pred_full, v60, v50, v188); - svfloat32_t v100; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v82), "w"(v89)); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), 
v82, v89); svfloat32_t v101 = svcmla_f32_x(pred_full, v89, v192, v49, 90); svst1_f64(pred_full, (double *)(v200), svreinterpret_f64_f32(v60)); svfloat32_t v98 = svmla_f32_x(pred_full, v97, v51, v189); svfloat32_t v99 = svmls_f32_x(pred_full, v97, v51, v189); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v98), "w"(v100)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v98), "w"(v100)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v99), "w"(v101)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v99), "w"(v101)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v99, v101); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v99, v101); svst1_f64(pred_full, (double *)(v209), svreinterpret_f64_f32(v103)); svst1_f64(pred_full, (double *)(v218), svreinterpret_f64_f32(v105)); svst1_f64(pred_full, (double *)(v227), svreinterpret_f64_f32(v104)); @@ -771,48 +741,30 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu6(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v188)[0])); svfloat32_t v279 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v197)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v271), "w"(v273)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v271), "w"(v273)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v275), "w"(v277)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v275), "w"(v277)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v279), "w"(v281)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v279), "w"(v281)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v48), "w"(v64)); - svfloat32_t v67; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v48), 
"w"(v64)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v49), "w"(v65)); - svfloat32_t v90; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v49), "w"(v65)); - svfloat32_t v68; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v68) : "w"(v66), "w"(v32)); - svfloat32_t zero85; - asm volatile("mov %0.s, #0" : "=w"(zero85)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v279, v281); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v279, v281); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v48, v64); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v48, v64); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v49, v65); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v49, v65); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v66, v32); + svfloat32_t zero85 = svdup_n_f32(0); svfloat32_t v85 = svcmla_f32_x(pred_full, zero85, v214, v67, 90); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v89), "w"(v33)); - svfloat32_t zero108; - asm volatile("mov %0.s, #0" : "=w"(zero108)); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v89, v33); + svfloat32_t zero108 = svdup_n_f32(0); svfloat32_t v108 = svcmla_f32_x(pred_full, zero108, v214, v90, 90); svfloat32_t v86 = svmla_f32_x(pred_full, v68, v66, v213); svfloat32_t v109 = svmla_f32_x(pred_full, v91, v89, v213); svst1_f64(pred_full, (double *)(v222), svreinterpret_f64_f32(v68)); svst1_f64(pred_full, (double *)(v231), svreinterpret_f64_f32(v91)); - svfloat32_t v87; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v86), "w"(v85)); - svfloat32_t v88; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v86), "w"(v85)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v109), "w"(v108)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v109), 
"w"(v108)); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v86, v85); + svfloat32_t v88 = svsub_f32_x(svptrue_b32(), v86, v85); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v109, v108); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v109, v108); svst1_f64(pred_full, (double *)(v240), svreinterpret_f64_f32(v88)); svst1_f64(pred_full, (double *)(v249), svreinterpret_f64_f32(v111)); svst1_f64(pred_full, (double *)(v258), svreinterpret_f64_f32(v87)); @@ -1114,84 +1066,51 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu7(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v247)[0])); svfloat32_t v353 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v256)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v343), "w"(v345)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v343), "w"(v345)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v347), "w"(v349)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v347), "w"(v349)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v351), "w"(v353)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v351), "w"(v353)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v32), "w"(v48)); - svfloat32_t v76; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v76) : "w"(v32), "w"(v48)); - svfloat32_t v77; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v48), "w"(v64)); - svfloat32_t v78; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v78) : "w"(v64), "w"(v32)); - svfloat32_t v79; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v79) : "w"(v33), "w"(v49)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v33), "w"(v49)); - svfloat32_t v82; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v49), "w"(v65)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v65), "w"(v33)); - svfloat32_t v67; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v66), "w"(v64)); - 
svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v79), "w"(v65)); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v76 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v48, v64); + svfloat32_t v78 = svsub_f32_x(svptrue_b32(), v64, v32); + svfloat32_t v79 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v82 = svsub_f32_x(svptrue_b32(), v49, v65); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v65, v33); + svfloat32_t v67 = svadd_f32_x(svptrue_b32(), v66, v64); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v79, v65); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v275, v81, 90); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v276, v82, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v277, v83, 90); - svfloat32_t v75; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v67), "w"(v355)); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v67, v355); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v274, v80, 90); svfloat32_t v137 = svmla_f32_x(pred_full, v75, v67, v270); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v144) : "w"(v115), "w"(v122)); - svfloat32_t v146; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v115), "w"(v122)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v115), "w"(v129)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v115, v129); svst1_f64(pred_full, (double *)(v285), svreinterpret_f64_f32(v75)); svfloat32_t v138 = svmla_f32_x(pred_full, v137, v76, v271); svfloat32_t v140 = svmls_f32_x(pred_full, v137, v76, v271); svfloat32_t v142 = svmls_f32_x(pred_full, v137, v77, v272); - svfloat32_t v145; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v144), "w"(v129)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v146), "w"(v136)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v148), "w"(v136)); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v144, v129); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v146, v136); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v148, v136); svfloat32_t v139 = svmla_f32_x(pred_full, v138, v77, v272); svfloat32_t v141 = svmls_f32_x(pred_full, v140, v78, v273); svfloat32_t v143 = svmla_f32_x(pred_full, v142, v78, v273); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v139), "w"(v145)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v139), "w"(v145)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v141), "w"(v147)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v141), "w"(v147)); - svfloat32_t v154; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v143), "w"(v149)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v143), "w"(v149)); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v139, v145); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v139, v145); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v141, v147); + svfloat32_t v153 
= svsub_f32_x(svptrue_b32(), v141, v147); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v143, v149); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v143, v149); svst1_f64(pred_full, (double *)(v294), svreinterpret_f64_f32(v151)); svst1_f64(pred_full, (double *)(v303), svreinterpret_f64_f32(v153)); svst1_f64(pred_full, (double *)(v312), svreinterpret_f64_f32(v154)); @@ -1430,67 +1349,40 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu8(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v263)[0])); svfloat32_t v371 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v272)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v357), "w"(v359)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v357), "w"(v359)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v361), "w"(v363)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v361), "w"(v363)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v365), "w"(v367)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v365), "w"(v367)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v369), "w"(v371)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v369), "w"(v371)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v32), "w"(v48)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v32), "w"(v48)); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v64), "w"(v80)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v64), "w"(v80)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v65), "w"(v81)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v65), "w"(v81)); - svfloat32_t zero123; - asm volatile("mov %0.s, #0" : "=w"(zero123)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v357, v359); + svfloat32_t v33 = 
svsub_f32_x(svptrue_b32(), v357, v359); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v369, v371); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v369, v371); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v65, v81); + svfloat32_t zero123 = svdup_n_f32(0); svfloat32_t v123 = svcmla_f32_x(pred_full, zero123, v280, v49, 90); - svfloat32_t v86; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v86) : "w"(v82), "w"(v84)); - svfloat32_t v87; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v82), "w"(v84)); - svfloat32_t zero111; - asm volatile("mov %0.s, #0" : "=w"(zero111)); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v82, v84); + svfloat32_t v87 = svsub_f32_x(svptrue_b32(), v82, v84); + svfloat32_t zero111 = svdup_n_f32(0); svfloat32_t v111 = svcmla_f32_x(pred_full, zero111, v280, v85, 90); - svfloat32_t zero130; - asm volatile("mov %0.s, #0" : "=w"(zero130)); + svfloat32_t zero130 = svdup_n_f32(0); svfloat32_t v130 = svcmla_f32_x(pred_full, zero130, v281, v88, 90); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v83), "w"(v111)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v83), "w"(v111)); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v83, v111); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v83, v111); svfloat32_t v138 = svmla_f32_x(pred_full, v33, v89, v282); svfloat32_t v139 = svmls_f32_x(pred_full, v33, v89, v282); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v123), "w"(v130)); - 
svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v123), "w"(v130)); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v123, v130); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v123, v130); svst1_f64(pred_full, (double *)(v290), svreinterpret_f64_f32(v86)); svst1_f64(pred_full, (double *)(v326), svreinterpret_f64_f32(v87)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v138), "w"(v140)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v138), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v139), "w"(v141)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v139), "w"(v141)); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v139, v141); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v139, v141); svst1_f64(pred_full, (double *)(v308), svreinterpret_f64_f32(v137)); svst1_f64(pred_full, (double *)(v344), svreinterpret_f64_f32(v136)); svst1_f64(pred_full, (double *)(v299), svreinterpret_f64_f32(v143)); @@ -1849,88 +1741,51 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu9(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v314)[0])); svfloat32_t v444 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v323)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v430), "w"(v432)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v430), "w"(v432)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v434), "w"(v436)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v434), "w"(v436)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v438), "w"(v440)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v438), "w"(v440)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : 
"w"(v442), "w"(v444)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v442), "w"(v444)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v32), "w"(v48)); - svfloat32_t v93; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v33), "w"(v49)); - svfloat32_t v95; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v95) : "w"(v32), "w"(v48)); - svfloat32_t v96; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v48), "w"(v80)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v80), "w"(v32)); - svfloat32_t v98; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v33), "w"(v49)); - svfloat32_t v99; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v49), "w"(v81)); - svfloat32_t v100; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v81), "w"(v33)); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v48, v80); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v80, v32); + svfloat32_t v98 = svsub_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v81, v33); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v340, v65, 90); - svfloat32_t v83; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v82), "w"(v80)); - 
svfloat32_t v94; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v93), "w"(v81)); - svfloat32_t zero151; - asm volatile("mov %0.s, #0" : "=w"(zero151)); + svfloat32_t v83 = svadd_f32_x(svptrue_b32(), v82, v80); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v93, v81); + svfloat32_t zero151 = svdup_n_f32(0); svfloat32_t v151 = svcmla_f32_x(pred_full, zero151, v344, v98, 90); - svfloat32_t zero158; - asm volatile("mov %0.s, #0" : "=w"(zero158)); + svfloat32_t zero158 = svdup_n_f32(0); svfloat32_t v158 = svcmla_f32_x(pred_full, zero158, v345, v99, 90); - svfloat32_t zero165; - asm volatile("mov %0.s, #0" : "=w"(zero165)); + svfloat32_t zero165 = svdup_n_f32(0); svfloat32_t v165 = svcmla_f32_x(pred_full, zero165, v346, v100, 90); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v83), "w"(v64)); - svfloat32_t v110; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v83), "w"(v337)); - svfloat32_t zero117; - asm volatile("mov %0.s, #0" : "=w"(zero117)); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v83, v64); + svfloat32_t v110 = svmul_f32_x(svptrue_b32(), v83, v337); + svfloat32_t zero117 = svdup_n_f32(0); svfloat32_t v117 = svcmla_f32_x(pred_full, zero117, v340, v94, 90); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v129), "w"(v151)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v129), "w"(v158)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v129), "w"(v151)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v84), "w"(v446)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v110), "w"(v110)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v179), "w"(v158)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v181), "w"(v165)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v183), "w"(v165)); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v129, v151); + svfloat32_t v181 = 
svsub_f32_x(svptrue_b32(), v129, v158); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v129, v151); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v84, v446); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v110, v110); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v179, v158); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v181, v165); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v183, v165); svfloat32_t v167 = svmla_f32_x(pred_full, v166, v83, v337); svfloat32_t v171 = svmla_f32_x(pred_full, v92, v64, v339); svst1_f64(pred_full, (double *)(v354), svreinterpret_f64_f32(v92)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v92), "w"(v167)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v171), "w"(v166)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v168), "w"(v117)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v168), "w"(v117)); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v92, v167); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v171, v166); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v117); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v168, v117); svfloat32_t v173 = svmla_f32_x(pred_full, v172, v95, v341); svfloat32_t v175 = svmls_f32_x(pred_full, v172, v96, v342); svfloat32_t v177 = svmls_f32_x(pred_full, v172, v95, v341); @@ -1939,18 +1794,12 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu9(const armral_cmplx_f32_t *restrict x, svfloat32_t v178 = svmls_f32_x(pred_full, v177, v97, v343); svst1_f64(pred_full, (double *)(v381), svreinterpret_f64_f32(v170)); svst1_f64(pred_full, (double *)(v408), svreinterpret_f64_f32(v169)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v174), "w"(v180)); - svfloat32_t v186; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v174), "w"(v180)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v176), "w"(v182)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : 
"w"(v176), "w"(v182)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v178), "w"(v184)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v178), "w"(v184)); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v176, v182); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v176, v182); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v178, v184); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v178, v184); svst1_f64(pred_full, (double *)(v363), svreinterpret_f64_f32(v186)); svst1_f64(pred_full, (double *)(v372), svreinterpret_f64_f32(v187)); svst1_f64(pred_full, (double *)(v390), svreinterpret_f64_f32(v190)); @@ -2288,77 +2137,45 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu10(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v353)[0])); svfloat32_t v487 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v362)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v469), "w"(v471)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v469), "w"(v471)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v473), "w"(v475)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v473), "w"(v475)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v477), "w"(v479)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v477), "w"(v479)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v481), "w"(v483)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v481), "w"(v483)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v485), "w"(v487)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v485), "w"(v487)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v48), "w"(v96)); - svfloat32_t 
v99; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v48), "w"(v96)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v80), "w"(v64)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v80), "w"(v64)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v49), "w"(v97)); - svfloat32_t v152; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v49), "w"(v97)); - svfloat32_t v153; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v81), "w"(v65)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v81), "w"(v65)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v98), "w"(v100)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v98), "w"(v100)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v99), "w"(v101)); - svfloat32_t zero127; - asm volatile("mov %0.s, #0" : "=w"(zero127)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v469, v471); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v469, v471); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v473, v475); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v473, v475); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v481, v483); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v481, v483); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v485, v487); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v485, v487); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v48, v96); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v48, v96); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v80, v64); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v80, v64); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v49, v97); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v49, v97); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v81, v65); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v81, v65); + 
svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v99, v101); + svfloat32_t zero127 = svdup_n_f32(0); svfloat32_t v127 = svcmla_f32_x(pred_full, zero127, v374, v99, 90); - svfloat32_t v155; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v151), "w"(v153)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v151), "w"(v153)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v152), "w"(v154)); - svfloat32_t zero180; - asm volatile("mov %0.s, #0" : "=w"(zero180)); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v152, v154); + svfloat32_t zero180 = svdup_n_f32(0); svfloat32_t v180 = svcmla_f32_x(pred_full, zero180, v374, v152, 90); - svfloat32_t v105; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v102), "w"(v32)); - svfloat32_t zero134; - asm volatile("mov %0.s, #0" : "=w"(zero134)); + svfloat32_t v105 = svadd_f32_x(svptrue_b32(), v102, v32); + svfloat32_t zero134 = svdup_n_f32(0); svfloat32_t v134 = svcmla_f32_x(pred_full, zero134, v375, v104, 90); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v155), "w"(v33)); - svfloat32_t zero187; - asm volatile("mov %0.s, #0" : "=w"(zero187)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v155, v33); + svfloat32_t zero187 = svdup_n_f32(0); svfloat32_t v187 = svcmla_f32_x(pred_full, zero187, v375, v157, 90); svfloat32_t v142 = svmla_f32_x(pred_full, v105, v102, v372); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v127), "w"(v134)); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v127, v134); svfloat32_t v146 = svcmla_f32_x(pred_full, v134, v376, v101, 90); svfloat32_t v195 = svmla_f32_x(pred_full, v158, v155, v372); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v180), "w"(v187)); + 
svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v180, v187); svfloat32_t v199 = svcmla_f32_x(pred_full, v187, v376, v154, 90); svst1_f64(pred_full, (double *)(v384), svreinterpret_f64_f32(v105)); svst1_f64(pred_full, (double *)(v393), svreinterpret_f64_f32(v158)); @@ -2366,22 +2183,14 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu10(const armral_cmplx_f32_t *restrict x, svfloat32_t v144 = svmls_f32_x(pred_full, v142, v103, v373); svfloat32_t v196 = svmla_f32_x(pred_full, v195, v156, v373); svfloat32_t v197 = svmls_f32_x(pred_full, v195, v156, v373); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v143), "w"(v145)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v143), "w"(v145)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v144), "w"(v146)); - svfloat32_t v150; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v196), "w"(v198)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v196), "w"(v198)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v197), "w"(v199)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v197), "w"(v199)); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v197, v199); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v197, v199); svst1_f64(pred_full, (double *)(v402), svreinterpret_f64_f32(v148)); svst1_f64(pred_full, (double *)(v411), svreinterpret_f64_f32(v201)); svst1_f64(pred_full, (double *)(v420), svreinterpret_f64_f32(v150)); @@ -2981,105 +2790,58 @@ void 
armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v459)[0])); svfloat32_t v621 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v468)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v603), "w"(v605)); - svfloat32_t v47; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v607), "w"(v609)); - svfloat32_t v62; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v611), "w"(v613)); - svfloat32_t v77; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v615), "w"(v617)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v619), "w"(v621)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v603), "w"(v605)); - svfloat32_t v94; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v607), "w"(v609)); - svfloat32_t v95; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v95) : "w"(v611), "w"(v613)); - svfloat32_t v96; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v615), "w"(v617)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v619), "w"(v621)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v32), "w"(v47)); - svfloat32_t v99; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v62), "w"(v92)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v94), "w"(v95)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v93), "w"(v97)); - svfloat32_t v114; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v47), "w"(v77)); - svfloat32_t v115; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v32), "w"(v77)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v47), "w"(v32)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v92), "w"(v77)); - svfloat32_t v118; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v62), "w"(v77)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v92), "w"(v62)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v120) : "w"(v47), "w"(v92)); - svfloat32_t v121; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v32), "w"(v62)); - svfloat32_t v123; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v94), "w"(v96)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v93), "w"(v96)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v93), "w"(v94)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v96), "w"(v97)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v95), "w"(v96)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v95), "w"(v97)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v94), "w"(v97)); - svfloat32_t v130; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v93), "w"(v95)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v77), "w"(v98)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v101), "w"(v102)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v99), "w"(v98)); - svfloat32_t v131; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v101), "w"(v102)); - svfloat32_t v158; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v115), "w"(v485)); - svfloat32_t v163; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v116), "w"(v486)); - svfloat32_t v173; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v118), "w"(v488)); - svfloat32_t v178; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v119), "w"(v489)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v607, v609); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v611, v613); + svfloat32_t v77 = svadd_f32_x(svptrue_b32(), v615, v617); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v607, 
v609); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v611, v613); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v615, v617); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v32, v47); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v62, v92); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v94, v95); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v97); + svfloat32_t v114 = svsub_f32_x(svptrue_b32(), v47, v77); + svfloat32_t v115 = svsub_f32_x(svptrue_b32(), v32, v77); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v47, v32); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v92, v77); + svfloat32_t v118 = svsub_f32_x(svptrue_b32(), v62, v77); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v92, v62); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v47, v92); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v32, v62); + svfloat32_t v123 = svadd_f32_x(svptrue_b32(), v94, v96); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v93, v96); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v93, v94); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v96, v97); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v95, v96); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v95, v97); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v94, v97); + svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v93, v95); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v77, v98); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v101, v102); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v99, v98); + svfloat32_t v131 = svadd_f32_x(svptrue_b32(), v101, v102); + svfloat32_t v158 = svmul_f32_x(svptrue_b32(), v115, v485); + svfloat32_t v163 = svmul_f32_x(svptrue_b32(), v116, v486); + svfloat32_t v173 = svmul_f32_x(svptrue_b32(), v118, v488); + svfloat32_t v178 = svmul_f32_x(svptrue_b32(), v119, v489); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v493, v123, 90); - svfloat32_t zero214; - asm 
volatile("mov %0.s, #0" : "=w"(zero214)); + svfloat32_t zero214 = svdup_n_f32(0); svfloat32_t v214 = svcmla_f32_x(pred_full, zero214, v495, v125, 90); - svfloat32_t zero221; - asm volatile("mov %0.s, #0" : "=w"(zero221)); + svfloat32_t zero221 = svdup_n_f32(0); svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v496, v126, 90); - svfloat32_t zero235; - asm volatile("mov %0.s, #0" : "=w"(zero235)); + svfloat32_t zero235 = svdup_n_f32(0); svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v498, v128, 90); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v499, v129, 90); - svfloat32_t v103; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v100), "w"(v99)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v112), "w"(v96)); - svfloat32_t v193; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v122), "w"(v492)); - svfloat32_t zero256; - asm volatile("mov %0.s, #0" : "=w"(zero256)); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v100, v99); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v112, v96); + svfloat32_t v193 = svmul_f32_x(svptrue_b32(), v122, v492); + svfloat32_t zero256 = svdup_n_f32(0); svfloat32_t v256 = svcmla_f32_x(pred_full, zero256, v501, v131, 90); svfloat32_t v258 = svmla_f32_x(pred_full, v158, v114, v484); svfloat32_t v259 = svmla_f32_x(pred_full, v163, v115, v485); @@ -3088,87 +2850,50 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu11(const armral_cmplx_f32_t *restrict x, svfloat32_t v262 = svmla_f32_x(pred_full, v178, v118, v488); svfloat32_t v263 = svnmls_f32_x(pred_full, v178, v117, v487); svfloat32_t v266 = svcmla_f32_x(pred_full, v214, v494, v124, 90); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v200), "w"(v214)); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v200, v214); svfloat32_t v268 = svcmla_f32_x(pred_full, v235, v497, v127, 90); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v269) : "w"(v221), "w"(v235)); - svfloat32_t v111; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v623), "w"(v103)); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v221, v235); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v623, v103); + svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v483, v113, 90); svfloat32_t v264 = svmla_f32_x(pred_full, v193, v121, v491); svfloat32_t v265 = svmla_f32_x(pred_full, v193, v120, v490); svfloat32_t v270 = svcmla_f32_x(pred_full, v256, v500, v130, 90); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v242), "w"(v256)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v266), "w"(v267)); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v242, v256); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v266, v267); svfloat32_t v257 = svmls_f32_x(pred_full, v111, v103, v482); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v262), "w"(v264)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v148), "w"(v268)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v270), "w"(v266)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v148), "w"(v271)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v271), "w"(v267)); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v290), "w"(v268)); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v262, v264); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v148, v268); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v270, v266); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v148, v271); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v271, v267); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v290, v268); svst1_f64(pred_full, (double *)(v509), svreinterpret_f64_f32(v111)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) 
: "w"(v272), "w"(v257)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v257), "w"(v259)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v257), "w"(v263)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v257), "w"(v260)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v257), "w"(v258)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v270)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v148)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v286), "w"(v269)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), "w"(v148)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v269)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v274), "w"(v264)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v276), "w"(v265)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v278), "w"(v265)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v261)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v292), "w"(v148)); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v273), "w"(v283)); - svfloat32_t v302; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v273), "w"(v283)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v281), "w"(v293)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v275), "w"(v285)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v277), "w"(v287)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v279), "w"(v289)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v279), "w"(v289)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v277), "w"(v287)); - svfloat32_t v301; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v301) : "w"(v275), "w"(v285)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v281), "w"(v293)); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v272, v257); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v257, v259); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v257, v260); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v257, v258); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v270); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v284, v148); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v286, v269); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v288, v148); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, v269); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v274, v264); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v276, v265); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v278, v265); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v280, v261); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v292, v148); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v281, v293); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v277, v287); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v279, v289); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v279, v289); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v277, v287); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v281, v293); svst1_f64(pred_full, (double *)(v527), svreinterpret_f64_f32(v295)); svst1_f64(pred_full, (double *)(v590), svreinterpret_f64_f32(v302)); svst1_f64(pred_full, (double *)(v518), svreinterpret_f64_f32(v294)); @@ -3533,115 +3258,68 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu12(const armral_cmplx_f32_t *restrict x, 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v401)[0])); svfloat32_t v557 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v410)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v535), "w"(v537)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v535), "w"(v537)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v541), "w"(v543)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v541), "w"(v543)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v547), "w"(v549)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v547), "w"(v549)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v553), "w"(v555)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v553), "w"(v555)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v539)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v545)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v551)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v557)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v32), "w"(v80)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v32), "w"(v80)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v56), "w"(v104)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v56), "w"(v104)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v33), "w"(v81)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v33), "w"(v81)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v57), "w"(v105)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v57), "w"(v105)); - svfloat32_t v114; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v41), "w"(v89)); - svfloat32_t v115; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v41), "w"(v89)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v65), "w"(v113)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v65), "w"(v113)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v144), "w"(v146)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v144), "w"(v146)); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v535, v537); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v535, v537); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v541, v543); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v541, v543); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v539); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v545); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v551); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v557); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v32, v80); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v32, v80); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v56, v104); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v56, v104); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v33, v81); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v33, v81); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v57, v105); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v57, v105); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v41, v89); + svfloat32_t v115 = svsub_f32_x(svptrue_b32(), v41, v89); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v149 
= svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x(pred_full, zero171, v420, v147, 90); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v174), "w"(v176)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v174), "w"(v176)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v174, v176); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v174, v176); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v423, v175, 90); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v114), "w"(v116)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v114), "w"(v116)); - svfloat32_t zero141; - asm volatile("mov %0.s, #0" : "=w"(zero141)); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v114, v116); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v114, v116); + svfloat32_t zero141 = svdup_n_f32(0); svfloat32_t v141 = svcmla_f32_x(pred_full, zero141, v416, v117, 90); svfloat32_t v172 = svmla_f32_x(pred_full, v171, v145, v419); svfloat32_t v173 = svnmls_f32_x(pred_full, v171, v145, v419); - svfloat32_t zero186; - asm volatile("mov %0.s, #0" : "=w"(zero186)); + svfloat32_t zero186 = svdup_n_f32(0); svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v423, v178, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); + svfloat32_t zero193 = svdup_n_f32(0); svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v423, v179, 90); svfloat32_t v206 = svmla_f32_x(pred_full, v200, v177, v424); svfloat32_t v207 = svmls_f32_x(pred_full, v200, v177, v424); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v115), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v115), "w"(v141)); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v115, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), 
v115, v141); svfloat32_t v208 = svmla_f32_x(pred_full, v118, v148, v419); svfloat32_t v256 = svmla_f32_x(pred_full, v119, v149, v419); svst1_f64(pred_full, (double *)(v432), svreinterpret_f64_f32(v118)); svst1_f64(pred_full, (double *)(v486), svreinterpret_f64_f32(v119)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v186)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v208), "w"(v186)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v143), "w"(v173)); - svfloat32_t v257; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v256), "w"(v193)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v256), "w"(v193)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v142), "w"(v172)); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v186); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v208, v186); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v143, v173); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v256, v193); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v256, v193); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v142, v172); svst1_f64(pred_full, (double *)(v459), svreinterpret_f64_f32(v143)); svst1_f64(pred_full, (double *)(v513), svreinterpret_f64_f32(v142)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v232), "w"(v207)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v232), "w"(v207)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v206)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v280), "w"(v206)); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v232, v207); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v232, v207); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v206); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v280, v206); svst1_f64(pred_full, (double *)(v441), svreinterpret_f64_f32(v210)); svst1_f64(pred_full, (double 
*)(v450), svreinterpret_f64_f32(v209)); svst1_f64(pred_full, (double *)(v495), svreinterpret_f64_f32(v258)); @@ -4299,218 +3977,125 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu13(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v519)[0])); svfloat32_t v703 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v528)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v681), "w"(v683)); - svfloat32_t v47; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v685), "w"(v687)); - svfloat32_t v62; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v689), "w"(v691)); - svfloat32_t v77; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v693), "w"(v695)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v697), "w"(v699)); - svfloat32_t v107; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v701), "w"(v703)); - svfloat32_t v108; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v681), "w"(v683)); - svfloat32_t v109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v685), "w"(v687)); - svfloat32_t v110; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v689), "w"(v691)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v693), "w"(v695)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v697), "w"(v699)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v701), "w"(v703)); - svfloat32_t v114; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v47), "w"(v92)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v32), "w"(v62)); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v109), "w"(v112)); - svfloat32_t v121; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v108), "w"(v110)); - svfloat32_t v123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v47), "w"(v107)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v62), "w"(v77)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v125) : "w"(v32), "w"(v77)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v92), "w"(v107)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v109), "w"(v113)); - svfloat32_t v132; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v108), "w"(v110)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v109), "w"(v112)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v108), "w"(v111)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v112), "w"(v113)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v110), "w"(v111)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v114), "w"(v107)); - svfloat32_t v117; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v116), "w"(v77)); - svfloat32_t v120; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v119), "w"(v113)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v121), "w"(v111)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v123), "w"(v124)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v125), "w"(v126)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v123), "w"(v124)); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v125), "w"(v126)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v131), "w"(v132)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v133), "w"(v134)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v135), "w"(v136)); - svfloat32_t zero224; - asm volatile("mov %0.s, #0" : "=w"(zero224)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v685, v687); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v689, v691); + svfloat32_t v77 = svadd_f32_x(svptrue_b32(), v693, v695); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v697, v699); + 
svfloat32_t v107 = svadd_f32_x(svptrue_b32(), v701, v703); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v685, v687); + svfloat32_t v110 = svsub_f32_x(svptrue_b32(), v689, v691); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v693, v695); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v701, v703); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v47, v92); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v32, v62); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v109, v112); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v47, v107); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v62, v77); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v32, v77); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v92, v107); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v109, v113); + svfloat32_t v132 = svsub_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v109, v112); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v108, v111); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v112, v113); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v110, v111); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v114, v107); + svfloat32_t v117 = svadd_f32_x(svptrue_b32(), v116, v77); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v119, v113); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v121, v111); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v123, v124); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v125, v126); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v123, v124); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v125, v126); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v131, v132); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v133, v134); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v135, v136); + svfloat32_t zero224 = svdup_n_f32(0); svfloat32_t v224 = 
svcmla_f32_x(pred_full, zero224, v553, v131, 90); - svfloat32_t zero231; - asm volatile("mov %0.s, #0" : "=w"(zero231)); + svfloat32_t zero231 = svdup_n_f32(0); svfloat32_t v231 = svcmla_f32_x(pred_full, zero231, v554, v132, 90); - svfloat32_t zero245; - asm volatile("mov %0.s, #0" : "=w"(zero245)); + svfloat32_t zero245 = svdup_n_f32(0); svfloat32_t v245 = svcmla_f32_x(pred_full, zero245, v556, v133, 90); - svfloat32_t zero252; - asm volatile("mov %0.s, #0" : "=w"(zero252)); + svfloat32_t zero252 = svdup_n_f32(0); svfloat32_t v252 = svcmla_f32_x(pred_full, zero252, v557, v134, 90); - svfloat32_t zero266; - asm volatile("mov %0.s, #0" : "=w"(zero266)); + svfloat32_t zero266 = svdup_n_f32(0); svfloat32_t v266 = svcmla_f32_x(pred_full, zero266, v559, v135, 90); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v115), "w"(v117)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v117), "w"(v115)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v120), "w"(v122)); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v127), "w"(v128)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v129), "w"(v130)); - svfloat32_t zero173; - asm volatile("mov %0.s, #0" : "=w"(zero173)); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v115, v117); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v117, v115); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v120, v122); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v127, v128); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v129, v130); + svfloat32_t zero173 = svdup_n_f32(0); svfloat32_t v173 = svcmla_f32_x(pred_full, zero173, v544, v120, 90); - svfloat32_t zero180; - asm volatile("mov %0.s, #0" : "=w"(zero180)); + svfloat32_t zero180 = svdup_n_f32(0); svfloat32_t v180 = svcmla_f32_x(pred_full, zero180, v545, v122, 90); - svfloat32_t v192; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v127), "w"(v547)); - svfloat32_t zero238; - asm 
volatile("mov %0.s, #0" : "=w"(zero238)); + svfloat32_t v192 = svmul_f32_x(svptrue_b32(), v127, v547); + svfloat32_t zero238 = svdup_n_f32(0); svfloat32_t v238 = svcmla_f32_x(pred_full, zero238, v555, v149, 90); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x(pred_full, zero259, v558, v150, 90); - svfloat32_t zero280; - asm volatile("mov %0.s, #0" : "=w"(zero280)); + svfloat32_t zero280 = svdup_n_f32(0); svfloat32_t v280 = svcmla_f32_x(pred_full, zero280, v561, v151, 90); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v705), "w"(v118)); - svfloat32_t zero187; - asm volatile("mov %0.s, #0" : "=w"(zero187)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v705, v118); + svfloat32_t zero187 = svdup_n_f32(0); svfloat32_t v187 = svcmla_f32_x(pred_full, zero187, v546, v146, 90); - svfloat32_t v202; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v147), "w"(v549)); + svfloat32_t v202 = svmul_f32_x(svptrue_b32(), v147, v549); svfloat32_t v282 = svmla_f32_x(pred_full, v192, v128, v548); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v224), "w"(v238)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v231), "w"(v238)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v245), "w"(v259)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v252), "w"(v259)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v266), "w"(v280)); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v224, v238); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v245, v259); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v252, v259); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v266, v280); svfloat32_t v299 = svcmla_f32_x(pred_full, v280, v560, v136, 90); svfloat32_t v281 = svmls_f32_x(pred_full, v144, v118, v542); svfloat32_t v283 
= svmls_f32_x(pred_full, v282, v145, v543); svfloat32_t v284 = svmla_f32_x(pred_full, v202, v128, v548); svfloat32_t v286 = svnmls_f32_x(pred_full, v192, v147, v549); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v173), "w"(v187)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v180), "w"(v187)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v294), "w"(v298)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v296), "w"(v298)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v295), "w"(v299)); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v173, v187); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v180, v187); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v294, v298); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v295, v299); svst1_f64(pred_full, (double *)(v569), svreinterpret_f64_f32(v144)); svfloat32_t v285 = svmla_f32_x(pred_full, v284, v145, v543); svfloat32_t v287 = svmls_f32_x(pred_full, v286, v145, v543); svfloat32_t v288 = svmla_f32_x(pred_full, v281, v129, v550); svfloat32_t v290 = svmls_f32_x(pred_full, v281, v130, v551); svfloat32_t v292 = svmls_f32_x(pred_full, v281, v129, v550); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v301), "w"(v294)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v299), "w"(v300)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v301)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v314), "w"(v301)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v316), "w"(v300)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v300), "w"(v295)); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v301, v294); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v299, v300); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v312, v301); + 
svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v314, v301); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v316, v300); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v300, v295); svfloat32_t v289 = svmla_f32_x(pred_full, v288, v130, v551); svfloat32_t v291 = svmls_f32_x(pred_full, v290, v148, v552); svfloat32_t v293 = svmla_f32_x(pred_full, v292, v148, v552); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v308), "w"(v296)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v310), "w"(v297)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v318), "w"(v297)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v283), "w"(v289)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v285), "w"(v291)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v291), "w"(v285)); - svfloat32_t v305; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v287), "w"(v293)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v289), "w"(v283)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v293), "w"(v287)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v302), "w"(v309)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v303), "w"(v311)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v304), "w"(v313)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v305), "w"(v315)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v306), "w"(v317)); - svfloat32_t v325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v307), "w"(v319)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v307), "w"(v319)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v306), "w"(v317)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v305), "w"(v315)); - svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v329) : "w"(v304), "w"(v313)); - svfloat32_t v330; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v303), "w"(v311)); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v302), "w"(v309)); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v296); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v310, v297); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v318, v297); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v285, v291); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v291, v285); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v287, v293); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v289, v283); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v293, v287); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v302, v309); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v304, v313); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v305, v315); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v306, v317); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v307, v319); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v307, v319); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v306, v317); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v305, v315); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v304, v313); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v302, v309); svst1_f64(pred_full, (double *)(v578), svreinterpret_f64_f32(v320)); svst1_f64(pred_full, (double *)(v587), svreinterpret_f64_f32(v321)); svst1_f64(pred_full, (double *)(v596), svreinterpret_f64_f32(v322)); @@ -5028,190 +4613,110 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu14(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v521)[0])); svfloat32_t v705 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v530)[0])); - svfloat32_t v32; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v679), "w"(v681)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v679), "w"(v681)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v683), "w"(v685)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v683), "w"(v685)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v687), "w"(v689)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v687), "w"(v689)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v691), "w"(v693)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v691), "w"(v693)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v695), "w"(v697)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v695), "w"(v697)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v699), "w"(v701)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v699), "w"(v701)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v703), "w"(v705)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v703), "w"(v705)); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v48), "w"(v128)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v48), "w"(v128)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v96), "w"(v80)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v96), "w"(v80)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v64), "w"(v112)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v64), "w"(v112)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v49), "w"(v129)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v49), "w"(v129)); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v97), "w"(v81)); - svfloat32_t v222; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v97), "w"(v81)); - svfloat32_t v223; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v65), "w"(v113)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v65), "w"(v113)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v130), "w"(v132)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v130), "w"(v132)); - svfloat32_t v140; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v132), "w"(v134)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v134), "w"(v130)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v131), "w"(v133)); - svfloat32_t v144; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v131), "w"(v133)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v133), "w"(v135)); - svfloat32_t v146; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v135), "w"(v131)); - svfloat32_t v225; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v219), "w"(v221)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v219), "w"(v221)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v221), "w"(v223)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v223), "w"(v219)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v220), "w"(v222)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v220), "w"(v222)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v222), "w"(v224)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v224), "w"(v220)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v136), "w"(v134)); - svfloat32_t v143; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v142), "w"(v135)); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v679, v681); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), 
v679, v681); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v683, v685); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v683, v685); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v687, v689); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v687, v689); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v703, v705); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v703, v705); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v48, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v48, v128); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v96, v80); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v96, v80); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v49, v129); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v49, v129); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v97, v81); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v97, v81); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v130, v132); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v130, v132); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v134, v130); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v131, v133); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v131, v133); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v135, v131); + svfloat32_t v225 = svadd_f32_x(svptrue_b32(), v219, 
v221); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v219, v221); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v223, v219); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v220, v222); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v220, v222); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v222, v224); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v224, v220); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v136, v134); + svfloat32_t v143 = svadd_f32_x(svptrue_b32(), v142, v135); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, zero185, v548, v144, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, zero192, v549, v145, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v550, v146, 90); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v225), "w"(v223)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v231), "w"(v224)); - svfloat32_t zero274; - asm volatile("mov %0.s, #0" : "=w"(zero274)); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v225, v223); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v231, v224); + svfloat32_t zero274 = svdup_n_f32(0); svfloat32_t v274 = svcmla_f32_x(pred_full, zero274, v548, v233, 90); - svfloat32_t zero281; - asm volatile("mov %0.s, #0" : "=w"(zero281)); + svfloat32_t zero281 = svdup_n_f32(0); svfloat32_t v281 = svcmla_f32_x(pred_full, zero281, v549, v234, 90); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); + svfloat32_t zero288 = svdup_n_f32(0); svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v550, v235, 90); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v137), "w"(v32)); - svfloat32_t zero178; - asm volatile("mov %0.s, 
#0" : "=w"(zero178)); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v137, v32); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v547, v143, 90); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v226), "w"(v33)); - svfloat32_t zero267; - asm volatile("mov %0.s, #0" : "=w"(zero267)); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v226, v33); + svfloat32_t zero267 = svdup_n_f32(0); svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v547, v232, 90); svfloat32_t v200 = svmla_f32_x(pred_full, v138, v137, v543); - svfloat32_t v207; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v178), "w"(v185)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v178), "w"(v185)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v178), "w"(v192)); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v178, v192); svfloat32_t v289 = svmla_f32_x(pred_full, v227, v226, v543); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v267), "w"(v274)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v267), "w"(v274)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v267), "w"(v281)); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v267, v274); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v267, v274); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v267, v281); svst1_f64(pred_full, (double *)(v558), svreinterpret_f64_f32(v138)); svst1_f64(pred_full, (double *)(v567), svreinterpret_f64_f32(v227)); svfloat32_t v201 = svmla_f32_x(pred_full, v200, v139, v544); svfloat32_t v203 = svmls_f32_x(pred_full, v200, v139, v544); svfloat32_t v205 = svmls_f32_x(pred_full, v200, v140, v545); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v207), "w"(v192)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v210) : "w"(v209), "w"(v199)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v211), "w"(v199)); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v207, v192); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v199); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v211, v199); svfloat32_t v290 = svmla_f32_x(pred_full, v289, v228, v544); svfloat32_t v292 = svmls_f32_x(pred_full, v289, v228, v544); svfloat32_t v294 = svmls_f32_x(pred_full, v289, v229, v545); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v296), "w"(v281)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v298), "w"(v288)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v300), "w"(v288)); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v296, v281); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v298, v288); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v300, v288); svfloat32_t v202 = svmla_f32_x(pred_full, v201, v140, v545); svfloat32_t v204 = svmls_f32_x(pred_full, v203, v141, v546); svfloat32_t v206 = svmla_f32_x(pred_full, v205, v141, v546); svfloat32_t v291 = svmla_f32_x(pred_full, v290, v229, v545); svfloat32_t v293 = svmls_f32_x(pred_full, v292, v230, v546); svfloat32_t v295 = svmla_f32_x(pred_full, v294, v230, v546); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v202), "w"(v208)); - svfloat32_t v214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v202), "w"(v208)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v204), "w"(v210)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v204), "w"(v210)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v206), "w"(v212)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v206), "w"(v212)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v291), "w"(v297)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v291), 
"w"(v297)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v293), "w"(v299)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v293), "w"(v299)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v295), "w"(v301)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v295), "w"(v301)); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v204, v210); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v204, v210); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v206, v212); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v206, v212); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v293, v299); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v293, v299); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v295, v301); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v295, v301); svst1_f64(pred_full, (double *)(v576), svreinterpret_f64_f32(v214)); svst1_f64(pred_full, (double *)(v585), svreinterpret_f64_f32(v303)); svst1_f64(pred_full, (double *)(v594), svreinterpret_f64_f32(v216)); @@ -5823,116 +5328,66 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v541)[0])); svfloat32_t v736 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v550)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v708), "w"(v710)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v708), "w"(v710)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v714), "w"(v716)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v714), "w"(v716)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v720), 
"w"(v722)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v720), "w"(v722)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v726), "w"(v728)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v726), "w"(v728)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v732), "w"(v734)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v732), "w"(v734)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v712)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v718)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v724)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v730)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v736)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v56), "w"(v128)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v56), "w"(v128)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v104), "w"(v80)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v104), "w"(v80)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v57), "w"(v129)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v57), "w"(v129)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v105), "w"(v81)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v105), "w"(v81)); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v65), "w"(v137)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v65), "w"(v137)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v113), "w"(v89)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v113), "w"(v89)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : 
"w"(v191), "w"(v193)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v191), "w"(v193)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v192), "w"(v194)); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v708, v710); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v708, v710); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v714, v716); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v714, v716); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v720, v722); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v720, v722); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v726, v728); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v726, v728); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v732, v734); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v732, v734); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v712); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v718); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v724); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v730); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v736); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v56, v128); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v56, v128); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v104, v80); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v104, v80); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v57, v129); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v57, v129); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v105, v81); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v105, v81); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v65, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v65, v137); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v113, v89); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v113, v89); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v196 = 
svsub_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v192, v194); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v562, v192, 90); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v244), "w"(v246)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v244), "w"(v246)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v245), "w"(v247)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v247), "w"(v570)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v138), "w"(v140)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v138), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v139), "w"(v141)); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v245, v247); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v247, v570); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v139, v141); + svfloat32_t zero167 = svdup_n_f32(0); svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v556, v139, 90); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v195), "w"(v32)); - svfloat32_t v208; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v195), "w"(v560)); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v195, v32); + svfloat32_t v208 = svmul_f32_x(svptrue_b32(), v195, v560); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v563, v197, 90); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v248), "w"(v33)); - 
svfloat32_t zero272; - asm volatile("mov %0.s, #0" : "=w"(zero272)); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v248, v33); + svfloat32_t zero272 = svdup_n_f32(0); svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v567, v249, 90); - svfloat32_t v282; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v250), "w"(v569)); - svfloat32_t v145; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v142), "w"(v41)); - svfloat32_t zero174; - asm volatile("mov %0.s, #0" : "=w"(zero174)); + svfloat32_t v282 = svmul_f32_x(svptrue_b32(), v250, v569); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v142, v41); + svfloat32_t zero174 = svdup_n_f32(0); svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v557, v144, 90); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v220), "w"(v227)); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v220, v227); svfloat32_t v239 = svcmla_f32_x(pred_full, v227, v564, v194, 90); - svfloat32_t zero258; - asm volatile("mov %0.s, #0" : "=w"(zero258)); + svfloat32_t zero258 = svdup_n_f32(0); svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v565, v251, 90); svfloat32_t v291 = svnmls_f32_x(pred_full, v282, v245, v568); svfloat32_t v292 = svmla_f32_x(pred_full, v287, v250, v569); svfloat32_t v182 = svmla_f32_x(pred_full, v145, v142, v554); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v167), "w"(v174)); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v167, v174); svfloat32_t v186 = svcmla_f32_x(pred_full, v174, v558, v141, 90); svfloat32_t v235 = svmla_f32_x(pred_full, v208, v198, v559); svfloat32_t v288 = svcmla_f32_x(pred_full, v258, v566, v248, 90); @@ -5942,68 +5397,40 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v184 = svmls_f32_x(pred_full, v182, v143, v555); svfloat32_t v236 = svmla_f32_x(pred_full, v235, v196, v561); svfloat32_t v237 = svmls_f32_x(pred_full, v235, v196, v561); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), 
"w"(v272)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v288), "w"(v272)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v297), "w"(v258)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v297), "w"(v258)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v183), "w"(v185)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v183), "w"(v185)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v184), "w"(v186)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v184), "w"(v186)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v236), "w"(v238)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v236), "w"(v238)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v237), "w"(v239)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v237), "w"(v239)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v289), "w"(v291)); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v289), "w"(v291)); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v290), "w"(v292)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v290), "w"(v292)); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v288, v272); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v288, v272); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v258); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v297, v258); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v237, 
v239); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v289, v291); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v289, v291); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v290, v292); svst1_f64(pred_full, (double *)(v587), svreinterpret_f64_f32(v299)); svst1_f64(pred_full, (double *)(v596), svreinterpret_f64_f32(v298)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v188), "w"(v241)); - svfloat32_t v345; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v190), "w"(v243)); - svfloat32_t v369; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v189), "w"(v242)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v187), "w"(v240)); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v188, v241); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v190, v243); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v189, v242); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v187, v240); svst1_f64(pred_full, (double *)(v605), svreinterpret_f64_f32(v188)); svst1_f64(pred_full, (double *)(v632), svreinterpret_f64_f32(v190)); svst1_f64(pred_full, (double *)(v659), svreinterpret_f64_f32(v189)); svst1_f64(pred_full, (double *)(v686), svreinterpret_f64_f32(v187)); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v321), "w"(v294)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v321), "w"(v294)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v345), "w"(v296)); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v345), "w"(v296)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v369), "w"(v295)); - svfloat32_t v371; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v369), "w"(v295)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v293)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v395) : "w"(v393), "w"(v293)); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v321, v294); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v321, v294); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v345, v296); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v345, v296); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v369, v295); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v369, v295); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v293); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v393, v293); svst1_f64(pred_full, (double *)(v614), svreinterpret_f64_f32(v323)); svst1_f64(pred_full, (double *)(v623), svreinterpret_f64_f32(v322)); svst1_f64(pred_full, (double *)(v641), svreinterpret_f64_f32(v347)); @@ -6533,192 +5960,109 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu16(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v565)[0])); svfloat32_t v771 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v574)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v741), "w"(v743)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v741), "w"(v743)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v745), "w"(v747)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v745), "w"(v747)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v749), "w"(v751)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v749), "w"(v751)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v753), "w"(v755)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v753), "w"(v755)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v757), "w"(v759)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v757), "w"(v759)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v761), "w"(v763)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v113) : "w"(v761), "w"(v763)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v765), "w"(v767)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v765), "w"(v767)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v769), "w"(v771)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v769), "w"(v771)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v32), "w"(v48)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v32), "w"(v48)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v64), "w"(v80)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v64), "w"(v80)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v96), "w"(v112)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v96), "w"(v112)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v128), "w"(v144)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v128), "w"(v144)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v65), "w"(v81)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v65), "w"(v81)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v97), "w"(v145)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v97), "w"(v145)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v113), "w"(v129)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v113), "w"(v129)); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v741, v743); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v741, v743); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v745, v747); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v745, v747); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v749, v751); + svfloat32_t v65 = 
svsub_f32_x(svptrue_b32(), v749, v751); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v753, v755); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v753, v755); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v757, v759); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v757, v759); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v761, v763); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v761, v763); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v765, v767); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v765, v767); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v769, v771); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v769, v771); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v128, v144); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v128, v144); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v113, v129); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v586, v49, 90); - svfloat32_t v154; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v146), "w"(v148)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v146), "w"(v148)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v150), "w"(v152)); - svfloat32_t v157; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v150), "w"(v152)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v160) : "w"(v151), "w"(v153)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v153)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v164), "w"(v166)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v165), "w"(v167)); - svfloat32_t zero203; - asm volatile("mov %0.s, #0" : "=w"(zero203)); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v164, v166); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t zero203 = svdup_n_f32(0); svfloat32_t v203 = svcmla_f32_x(pred_full, zero203, v586, v149, 90); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, zero234, v587, v162, 90); - svfloat32_t zero260; - asm volatile("mov %0.s, #0" : "=w"(zero260)); + svfloat32_t zero260 = svdup_n_f32(0); svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v591, v166, 90); - svfloat32_t v270; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v165), "w"(v593)); - svfloat32_t v275; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v167), "w"(v594)); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v154), "w"(v156)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t zero191; - asm volatile("mov %0.s, #0" : "=w"(zero191)); + svfloat32_t v270 = svmul_f32_x(svptrue_b32(), v165, v593); + svfloat32_t v275 = svmul_f32_x(svptrue_b32(), v167, v594); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v154, v156); + 
svfloat32_t zero191 = svdup_n_f32(0); svfloat32_t v191 = svcmla_f32_x(pred_full, zero191, v586, v157, 90); - svfloat32_t zero210; - asm volatile("mov %0.s, #0" : "=w"(zero210)); + svfloat32_t zero210 = svdup_n_f32(0); svfloat32_t v210 = svcmla_f32_x(pred_full, zero210, v587, v160, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); + svfloat32_t zero246 = svdup_n_f32(0); svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v589, v168, 90); - svfloat32_t v265; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v169), "w"(v592)); + svfloat32_t v265 = svmul_f32_x(svptrue_b32(), v169, v592); svfloat32_t v286 = svmla_f32_x(pred_full, v33, v163, v588); svfloat32_t v287 = svmls_f32_x(pred_full, v33, v163, v588); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v227), "w"(v234)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v227), "w"(v234)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v155), "w"(v191)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v155), "w"(v191)); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v227, v234); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v227, v234); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v155, v191); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v155, v191); svfloat32_t v278 = svmla_f32_x(pred_full, v147, v161, v588); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v203), "w"(v210)); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v203, v210); svfloat32_t v280 = svmls_f32_x(pred_full, v147, v161, v588); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v210), "w"(v203)); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v210, v203); svfloat32_t v290 = svcmla_f32_x(pred_full, v246, v590, v164, 90); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v246), "w"(v260)); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v246, v260); svfloat32_t v292 = 
svnmls_f32_x(pred_full, v265, v165, v593); svfloat32_t v293 = svnmls_f32_x(pred_full, v265, v167, v594); svfloat32_t v294 = svnmls_f32_x(pred_full, v270, v169, v592); svfloat32_t v295 = svnmls_f32_x(pred_full, v275, v169, v592); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v287), "w"(v289)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v287), "w"(v289)); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v287, v289); svst1_f64(pred_full, (double *)(v602), svreinterpret_f64_f32(v158)); svst1_f64(pred_full, (double *)(v674), svreinterpret_f64_f32(v159)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v278), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v280), "w"(v281)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v280), "w"(v281)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v278), "w"(v279)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v286), "w"(v292)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v286), "w"(v292)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v286), "w"(v294)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v286), "w"(v294)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v287), "w"(v295)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v287), "w"(v295)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v290), "w"(v288)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v290), "w"(v288)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v291), "w"(v293)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v291), "w"(v293)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v291), "w"(v289)); - svfloat32_t 
v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v291), "w"(v289)); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v286, v294); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v286, v294); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v287, v295); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v287, v295); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v290, v288); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v290, v288); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v291, v289); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v291, v289); svst1_f64(pred_full, (double *)(v638), svreinterpret_f64_f32(v277)); svst1_f64(pred_full, (double *)(v710), svreinterpret_f64_f32(v276)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v296), "w"(v306)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v297), "w"(v307)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v298), "w"(v307)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v299), "w"(v306)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v300), "w"(v308)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v301), "w"(v309)); - svfloat32_t v318; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v302), "w"(v311)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v310)); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v296, v306); + svfloat32_t v313 = 
svadd_f32_x(svptrue_b32(), v297, v307); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v298, v307); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v299, v306); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v300, v308); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v301, v309); + svfloat32_t v318 = svsub_f32_x(svptrue_b32(), v302, v311); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v310); svst1_f64(pred_full, (double *)(v620), svreinterpret_f64_f32(v285)); svst1_f64(pred_full, (double *)(v656), svreinterpret_f64_f32(v284)); svst1_f64(pred_full, (double *)(v692), svreinterpret_f64_f32(v283)); @@ -7752,167 +7096,91 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v767)[0])); svfloat32_t v1010 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v776)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v980), "w"(v982)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v980), "w"(v982)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v984), "w"(v986)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v984), "w"(v986)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v988), "w"(v990)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v988), "w"(v990)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v992), "w"(v994)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v992), "w"(v994)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v996), "w"(v998)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v996), "w"(v998)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v1000), "w"(v1002)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v1000), "w"(v1002)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1004), 
"w"(v1006)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1004), "w"(v1006)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v1008), "w"(v1010)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v1008), "w"(v1010)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v32), "w"(v96)); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v48), "w"(v112)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v64), "w"(v128)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v80), "w"(v144)); - svfloat32_t v152; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v32), "w"(v96)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v48), "w"(v112)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v64), "w"(v128)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v80), "w"(v144)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v33), "w"(v65)); - svfloat32_t v167; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v49), "w"(v81)); - svfloat32_t v168; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v33), "w"(v65)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v145), "w"(v113)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v97), "w"(v129)); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v113), "w"(v145)); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v97), "w"(v129)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v49), "w"(v81)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v33), "w"(v97)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v81), "w"(v145)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v146), "w"(v148)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : 
"w"(v147), "w"(v149)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v146), "w"(v148)); - svfloat32_t v157; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v147), "w"(v149)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v153), "w"(v155)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v154)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v154), "w"(v155)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v152), "w"(v153)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v166), "w"(v167)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v170), "w"(v171)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v166), "w"(v167)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v170), "w"(v171)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v168), "w"(v169)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v172), "w"(v173)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v168), "w"(v169)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v172), "w"(v173)); - svfloat32_t v223; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v154), "w"(v792)); - svfloat32_t zero390; - asm volatile("mov %0.s, #0" : "=w"(zero390)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v980, v982); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v980, v982); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v984, v986); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v984, v986); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v988, v990); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v988, v990); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v992, v994); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v992, v994); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v996, v998); + svfloat32_t v97 = 
svsub_f32_x(svptrue_b32(), v996, v998); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v1000, v1002); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v1000, v1002); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1004, v1006); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1004, v1006); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v1008, v1010); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v1008, v1010); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v32, v96); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v48, v112); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v64, v128); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v80, v144); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v32, v96); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v48, v112); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v64, v128); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v80, v144); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v33, v65); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v33, v65); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v145, v113); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v97, v129); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v113, v145); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v97, v129); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v33, v97); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v81, v145); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v154); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v154, v155); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v152, v153); + svfloat32_t v174 
= svadd_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v223 = svmul_f32_x(svptrue_b32(), v154, v792); + svfloat32_t zero390 = svdup_n_f32(0); svfloat32_t v390 = svcmla_f32_x(pred_full, zero390, v819, v187, 90); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v150), "w"(v151)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v150), "w"(v151)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v161), "w"(v160)); - svfloat32_t v165; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v156), "w"(v157)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v174), "w"(v175)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v177), "w"(v178)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v180), "w"(v181)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v183), "w"(v184)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v181), "w"(v175)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v174), "w"(v180)); - svfloat32_t v233; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v156), "w"(v794)); - svfloat32_t v238; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v157), "w"(v795)); - svfloat32_t v268; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v163), "w"(v801)); - svfloat32_t v273; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v164), "w"(v802)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v33)); - svfloat32_t v192; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v192) : "w"(v191), "w"(v81)); - svfloat32_t v203; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v1012), "w"(v158)); - svfloat32_t v263; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v162), "w"(v800)); - svfloat32_t zero299; - asm volatile("mov %0.s, #0" : "=w"(zero299)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v150, v151); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v150, v151); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v161, v160); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v156, v157); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v174, v175); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v177, v178); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v180, v181); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v183, v184); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v181, v175); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v233 = svmul_f32_x(svptrue_b32(), v156, v794); + svfloat32_t v238 = svmul_f32_x(svptrue_b32(), v157, v795); + svfloat32_t v268 = svmul_f32_x(svptrue_b32(), v163, v801); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v164, v802); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v33); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v191, v81); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v1012, v158); + svfloat32_t v263 = svmul_f32_x(svptrue_b32(), v162, v800); + svfloat32_t zero299 = svdup_n_f32(0); svfloat32_t v299 = svcmla_f32_x(pred_full, zero299, v806, v176, 90); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v809, v179, 90); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v812, v182, 90); - svfloat32_t zero362; - asm volatile("mov %0.s, #0" : "=w"(zero362)); + svfloat32_t zero362 = svdup_n_f32(0); svfloat32_t v362 = 
svcmla_f32_x(pred_full, zero362, v815, v185, 90); svfloat32_t v428 = svmla_f32_x(pred_full, v268, v155, v793); svfloat32_t v429 = svnmls_f32_x(pred_full, v223, v163, v801); svfloat32_t v430 = svmla_f32_x(pred_full, v273, v153, v791); svfloat32_t v431 = svnmls_f32_x(pred_full, v273, v152, v790); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v189), "w"(v187)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v97)); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v189, v187); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v97); svfloat32_t v426 = svmla_f32_x(pred_full, v263, v160, v798); svfloat32_t v427 = svnmls_f32_x(pred_full, v263, v161, v799); svfloat32_t v432 = svnmls_f32_x(pred_full, v238, v165, v803); @@ -7927,160 +7195,91 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu17(const armral_cmplx_f32_t *restrict x, svfloat32_t v459 = svcmla_f32_x(pred_full, v362, v813, v183, 90); svfloat32_t v460 = svcmla_f32_x(pred_full, v362, v814, v184, 90); svst1_f64(pred_full, (double *)(v832), svreinterpret_f64_f32(v203)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v193), "w"(v145)); - svfloat32_t zero411; - asm volatile("mov %0.s, #0" : "=w"(zero411)); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v193, v145); + svfloat32_t zero411 = svdup_n_f32(0); svfloat32_t v411 = svcmla_f32_x(pred_full, zero411, v822, v190, 90); svfloat32_t v435 = svmla_f32_x(pred_full, v434, v159, v797); svfloat32_t v436 = svmls_f32_x(pred_full, v434, v159, v797); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v426), "w"(v428)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v427), "w"(v429)); - svfloat32_t v441; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v426), "w"(v430)); - svfloat32_t v443; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v427), "w"(v431)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v453), "w"(v455)); - svfloat32_t v465; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v453), "w"(v455)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v454), "w"(v456)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v454), "w"(v456)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v457), "w"(v459)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v459), "w"(v457)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v458), "w"(v460)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v460), "w"(v458)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v190), "w"(v194)); - svfloat32_t zero418; - asm volatile("mov %0.s, #0" : "=w"(zero418)); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v426, v430); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v427, v431); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v453, v455); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v453, v455); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v459, v457); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v458, v460); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v460, v458); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v190, v194); + svfloat32_t zero418 = svdup_n_f32(0); svfloat32_t v418 = svcmla_f32_x(pred_full, zero418, v823, v194, 90); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v432), "w"(v435)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v433), "w"(v436)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v435), "w"(v432)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v436), 
"w"(v433)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v466), "w"(v470)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v465), "w"(v471)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v464), "w"(v468)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v471), "w"(v465)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v464), "w"(v468)); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v469), "w"(v467)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v470), "w"(v466)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v467), "w"(v469)); - svfloat32_t v445; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v437), "w"(v438)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v439), "w"(v440)); - svfloat32_t v447; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v441), "w"(v442)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v443), "w"(v444)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v438), "w"(v437)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v440), "w"(v439)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v442), "w"(v441)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v444), "w"(v443)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v411), "w"(v418)); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v432, v435); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v433, v436); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v435, v432); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v436, v433); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v466, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v465, v471); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v464, v468); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v471, v465); + 
svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v464, v468); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v469, v467); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v470, v466); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v467, v469); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v437, v438); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v439, v440); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v441, v442); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v443, v444); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v438, v437); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v440, v439); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v442, v441); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v444, v443); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v411, v418); svfloat32_t v461 = svcmla_f32_x(pred_full, v418, v824, v195, 90); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v472), "w"(v472)); - svfloat32_t v499; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v472)); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v472, v472); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v498, v472); svfloat32_t v462 = svcmla_f32_x(pred_full, v461, v816, v186, 90); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v390), "w"(v474)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v461), "w"(v461)); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v474)); - svfloat32_t v539; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v452), "w"(v499)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v452), "w"(v499)); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v390, v474); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v461, v461); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v495, v474); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v452, v499); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v452, v499); svfloat32_t v463 = 
svcmla_f32_x(pred_full, v462, v817, v33, 90); svfloat32_t v473 = svcmla_f32_x(pred_full, v462, v818, v97, 90); svfloat32_t v476 = svcmla_f32_x(pred_full, v475, v820, v81, 90); svfloat32_t v477 = svcmla_f32_x(pred_full, v475, v821, v145, 90); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v478)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v472), "w"(v478)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v485), "w"(v478)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v496), "w"(v478)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v478); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v472, v478); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v485, v478); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v478); svst1_f64(pred_full, (double *)(v877), svreinterpret_f64_f32(v539)); svst1_f64(pred_full, (double *)(v886), svreinterpret_f64_f32(v547)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v481), "w"(v473)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v483), "w"(v476)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v480)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v463)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v492), "w"(v477)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v447), "w"(v486)); - svfloat32_t v531; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v447), "w"(v486)); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v451), "w"(v497)); - svfloat32_t v627; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v451), "w"(v497)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v490), "w"(v472)); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v493), "w"(v479)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v507) : "w"(v445), "w"(v482)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v445), "w"(v482)); - svfloat32_t v571; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v448), "w"(v488)); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v448), "w"(v488)); - svfloat32_t v587; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v446), "w"(v484)); - svfloat32_t v595; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v446), "w"(v484)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v473); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v483, v476); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v487, v480); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v463); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v492, v477); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v447, v486); + svfloat32_t v531 = svsub_f32_x(svptrue_b32(), v447, v486); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v451, v497); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v451, v497); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v490, v472); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v493, v479); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v445, v482); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v445, v482); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v448, v488); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v448, v488); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v446, v484); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v446, v484); svst1_f64(pred_full, (double *)(v859), svreinterpret_f64_f32(v523)); svst1_f64(pred_full, (double *)(v868), svreinterpret_f64_f32(v531)); svst1_f64(pred_full, (double *)(v967), svreinterpret_f64_f32(v619)); svst1_f64(pred_full, (double *)(v976), svreinterpret_f64_f32(v627)); - svfloat32_t v555; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v449), "w"(v491)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v449), "w"(v491)); - svfloat32_t v603; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v603) : "w"(v450), "w"(v494)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v450), "w"(v494)); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v449, v491); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v449, v491); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v450, v494); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v450, v494); svst1_f64(pred_full, (double *)(v841), svreinterpret_f64_f32(v507)); svst1_f64(pred_full, (double *)(v850), svreinterpret_f64_f32(v515)); svst1_f64(pred_full, (double *)(v913), svreinterpret_f64_f32(v571)); @@ -8707,209 +7906,117 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu18(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v659)[0])); svfloat32_t v891 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v668)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v857), "w"(v859)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v857), "w"(v859)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v861), "w"(v863)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v861), "w"(v863)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v865), "w"(v867)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v865), "w"(v867)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v869), "w"(v871)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v869), "w"(v871)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v873), "w"(v875)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v873), "w"(v875)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v877), "w"(v879)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v877), "w"(v879)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v881), "w"(v883)); - 
svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v881), "w"(v883)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v885), "w"(v887)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v885), "w"(v887)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v889), "w"(v891)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v889), "w"(v891)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v48), "w"(v160)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v48), "w"(v160)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v144), "w"(v64)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v144), "w"(v64)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v80), "w"(v128)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v80), "w"(v128)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v96), "w"(v112)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v96), "w"(v112)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v49), "w"(v161)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v49), "w"(v161)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v145), "w"(v65)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v145), "w"(v65)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v81), "w"(v129)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v81), "w"(v129)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v97), "w"(v113)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v97), "w"(v113)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v162), "w"(v164)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v163), 
"w"(v165)); - svfloat32_t v176; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v162), "w"(v164)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v164), "w"(v168)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v168), "w"(v162)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v163), "w"(v165)); - svfloat32_t v180; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v165), "w"(v169)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v169), "w"(v163)); - svfloat32_t zero210; - asm volatile("mov %0.s, #0" : "=w"(zero210)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v857, v859); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v857, v859); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v861, v863); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v861, v863); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v865, v867); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v865, v867); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v869, v871); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v869, v871); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v873, v875); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v873, v875); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v877, v879); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v877, v879); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v881, v883); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v881, v883); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v885, v887); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v885, v887); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v889, v891); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v889, v891); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v48, v160); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v48, v160); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v144, v64); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v144, v64); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v80, v128); 
+ svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v49, v161); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v49, v161); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v145, v65); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v145, v65); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v97, v113); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v97, v113); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v164, v168); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v168, v162); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v165, v169); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v169, v163); + svfloat32_t zero210 = svdup_n_f32(0); svfloat32_t v210 = svcmla_f32_x(pred_full, zero210, v686, v167, 90); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v272), "w"(v274)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v273), "w"(v275)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v272), "w"(v274)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v274), "w"(v278)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v278), "w"(v272)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v273), "w"(v275)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v275), "w"(v279)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v279), "w"(v273)); - svfloat32_t zero320; - asm 
volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v274, v278); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v278, v272); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v275, v279); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v279, v273); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v686, v277, 90); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v170), "w"(v168)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v174), "w"(v169)); - svfloat32_t zero232; - asm volatile("mov %0.s, #0" : "=w"(zero232)); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v170, v168); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v174, v169); + svfloat32_t zero232 = svdup_n_f32(0); svfloat32_t v232 = svcmla_f32_x(pred_full, zero232, v690, v179, 90); - svfloat32_t zero239; - asm volatile("mov %0.s, #0" : "=w"(zero239)); + svfloat32_t zero239 = svdup_n_f32(0); svfloat32_t v239 = svcmla_f32_x(pred_full, zero239, v691, v180, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); + svfloat32_t zero246 = svdup_n_f32(0); svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v692, v181, 90); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v278)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v279)); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v278); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v279); + svfloat32_t zero342 = svdup_n_f32(0); svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v690, v289, 90); - svfloat32_t zero349; - asm volatile("mov 
%0.s, #0" : "=w"(zero349)); + svfloat32_t zero349 = svdup_n_f32(0); svfloat32_t v349 = svcmla_f32_x(pred_full, zero349, v691, v290, 90); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v692, v291, 90); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v171), "w"(v166)); - svfloat32_t v191; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v171), "w"(v683)); - svfloat32_t zero198; - asm volatile("mov %0.s, #0" : "=w"(zero198)); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v171, v166); + svfloat32_t v191 = svmul_f32_x(svptrue_b32(), v171, v683); + svfloat32_t zero198 = svdup_n_f32(0); svfloat32_t v198 = svcmla_f32_x(pred_full, zero198, v686, v175, 90); - svfloat32_t v260; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v210), "w"(v232)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v210), "w"(v239)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v210), "w"(v232)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v281), "w"(v276)); - svfloat32_t v301; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v281), "w"(v683)); - svfloat32_t zero308; - asm volatile("mov %0.s, #0" : "=w"(zero308)); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v210, v232); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v210, v239); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v210, v232); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v281, v276); + svfloat32_t v301 = svmul_f32_x(svptrue_b32(), v281, v683); + svfloat32_t zero308 = svdup_n_f32(0); svfloat32_t v308 = svcmla_f32_x(pred_full, zero308, v686, v285, 90); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v320), "w"(v342)); - svfloat32_t v372; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v320), "w"(v349)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v320), "w"(v342)); - svfloat32_t 
v173; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v172), "w"(v32)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v191), "w"(v191)); - svfloat32_t v261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v260), "w"(v239)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v262), "w"(v246)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v264), "w"(v246)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v33)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v301), "w"(v301)); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v349)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v372), "w"(v356)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v374), "w"(v356)); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v320, v342); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v320, v349); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v320, v342); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v172, v32); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v191, v191); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v260, v239); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v262, v246); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v264, v246); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v33); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v301, v301); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v349); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v372, v356); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v374, v356); svfloat32_t v248 = svmla_f32_x(pred_full, v247, v171, v683); svfloat32_t v252 = svmla_f32_x(pred_full, v173, v166, v685); svfloat32_t v358 = svmla_f32_x(pred_full, v357, v281, v683); svfloat32_t v362 = svmla_f32_x(pred_full, v283, v276, v685); svst1_f64(pred_full, (double *)(v700), svreinterpret_f64_f32(v173)); svst1_f64(pred_full, (double 
*)(v709), svreinterpret_f64_f32(v283)); - svfloat32_t v249; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v173), "w"(v248)); - svfloat32_t v253; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v252), "w"(v247)); - svfloat32_t v359; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v283), "w"(v358)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v362), "w"(v357)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v249), "w"(v198)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v249), "w"(v198)); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v173, v248); + svfloat32_t v253 = svadd_f32_x(svptrue_b32(), v252, v247); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v283, v358); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v362, v357); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v249, v198); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v249, v198); svfloat32_t v254 = svmla_f32_x(pred_full, v253, v176, v687); svfloat32_t v256 = svmls_f32_x(pred_full, v253, v177, v688); svfloat32_t v258 = svmls_f32_x(pred_full, v253, v176, v687); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v359), "w"(v308)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v359), "w"(v308)); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v359, v308); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v359, v308); svfloat32_t v364 = svmla_f32_x(pred_full, v363, v286, v687); svfloat32_t v366 = svmls_f32_x(pred_full, v363, v287, v688); svfloat32_t v368 = svmls_f32_x(pred_full, v363, v286, v687); @@ -8923,30 +8030,18 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu18(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v763), svreinterpret_f64_f32(v361)); svst1_f64(pred_full, (double *)(v808), svreinterpret_f64_f32(v250)); svst1_f64(pred_full, (double *)(v817), svreinterpret_f64_f32(v360)); - svfloat32_t v266; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v255), "w"(v261)); - 
svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v255), "w"(v261)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v257), "w"(v263)); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v257), "w"(v263)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v259), "w"(v265)); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v259), "w"(v265)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v365), "w"(v371)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v365), "w"(v371)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v367), "w"(v373)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v367), "w"(v373)); - svfloat32_t v380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v369), "w"(v375)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v369), "w"(v375)); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v365, v371); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v365, v371); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v367, v373); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v367, v373); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v369, v375); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v369, v375); svst1_f64(pred_full, (double *)(v718), svreinterpret_f64_f32(v267)); svst1_f64(pred_full, (double *)(v727), svreinterpret_f64_f32(v377)); svst1_f64(pred_full, (double *)(v736), svreinterpret_f64_f32(v268)); @@ -10071,403 +9166,227 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu19(const armral_cmplx_f32_t 
*restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v853)[0])); svfloat32_t v1121 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v862)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1087), "w"(v1089)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1087), "w"(v1089)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1091), "w"(v1093)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1093), "w"(v1091)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v1095), "w"(v1097)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v1095), "w"(v1097)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1099), "w"(v1101)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1101), "w"(v1099)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v1103), "w"(v1105)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v1103), "w"(v1105)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v1107), "w"(v1109)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v1109), "w"(v1107)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1111), "w"(v1113)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1111), "w"(v1113)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v1115), "w"(v1117)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v1117), "w"(v1115)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v1119), "w"(v1121)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v1119), "w"(v1121)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v32), "w"(v128)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v48), "w"(v144)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v164) : "w"(v64), "w"(v160)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v80), "w"(v128)); - svfloat32_t v166; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v96), "w"(v144)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v112), "w"(v160)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v32), "w"(v80)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v48), "w"(v96)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v64), "w"(v112)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v33), "w"(v129)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v49), "w"(v145)); - svfloat32_t v204; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v65), "w"(v161)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v81), "w"(v129)); - svfloat32_t v206; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v97), "w"(v145)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v113), "w"(v161)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v33), "w"(v81)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v49), "w"(v97)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v65), "w"(v113)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v168), "w"(v128)); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v170), "w"(v144)); - svfloat32_t v173; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v172), "w"(v160)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v162), "w"(v164)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v165), "w"(v167)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v162), "w"(v165)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v164), "w"(v167)); - svfloat32_t v209; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v129)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v210), "w"(v145)); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v212), "w"(v161)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v202), "w"(v204)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v205), "w"(v207)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v202), "w"(v205)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v204), "w"(v207)); - svfloat32_t zero389; - asm volatile("mov %0.s, #0" : "=w"(zero389)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1087, v1089); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1087, v1089); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1091, v1093); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1093, v1091); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v1095, v1097); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v1095, v1097); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1099, v1101); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1101, v1099); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v1103, v1105); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v1103, v1105); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v1107, v1109); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v1109, v1107); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v1115, v1117); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v1117, v1115); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v1119, v1121); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v1119, v1121); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v48, v144); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v64, v160); + svfloat32_t v165 = 
svsub_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v112, v160); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v32, v80); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v48, v96); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v49, v145); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v113, v161); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v33, v81); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v49, v97); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v128); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v170, v144); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v172, v160); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v162, v165); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v164, v167); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v129); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v210, v145); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v212, v161); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v202, v204); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v205, v207); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v202, v205); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v204, v207); + svfloat32_t zero389 = svdup_n_f32(0); svfloat32_t v389 = svcmla_f32_x(pred_full, zero389, v902, v205, 90); - svfloat32_t zero410; - asm volatile("mov %0.s, #0" : "=w"(zero410)); + svfloat32_t zero410 = svdup_n_f32(0); svfloat32_t v410 = svcmla_f32_x(pred_full, 
zero410, v905, v207, 90); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v169), "w"(v171)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v175), "w"(v166)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v174), "w"(v163)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v175), "w"(v166)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v174), "w"(v163)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v162), "w"(v193)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v192), "w"(v167)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v169), "w"(v173)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v171), "w"(v173)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v209), "w"(v211)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v215), "w"(v206)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v214), "w"(v203)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v215), "w"(v206)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v214), "w"(v203)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v202), "w"(v225)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v224), "w"(v207)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v209), "w"(v213)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v211), "w"(v213)); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v176), "w"(v173)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v187), "w"(v186)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v189)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v166)); - svfloat32_t v197; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v197) : "w"(v196), "w"(v163)); - svfloat32_t v201; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v199), "w"(v200)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v216), "w"(v213)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v219), "w"(v218)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v222), "w"(v221)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v226), "w"(v206)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v228), "w"(v203)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v231), "w"(v232)); - svfloat32_t v253; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v187), "w"(v878)); - svfloat32_t v268; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v190), "w"(v881)); - svfloat32_t zero347; - asm volatile("mov %0.s, #0" : "=w"(zero347)); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v169, v171); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v175, v166); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v174, v163); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v175, v166); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v174, v163); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v162, v193); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v192, v167); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v169, v173); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v171, v173); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v209, v211); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v215, v206); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v214, v203); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v215, v206); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v214, v203); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v202, v225); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v224, v207); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v209, v213); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), 
v211, v213); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v176, v173); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v187, v186); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v190, v189); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v194, v166); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v196, v163); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v216, v213); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v219, v218); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v222, v221); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v226, v206); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v228, v203); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v231, v232); + svfloat32_t v253 = svmul_f32_x(svptrue_b32(), v187, v878); + svfloat32_t v268 = svmul_f32_x(svptrue_b32(), v190, v881); + svfloat32_t zero347 = svdup_n_f32(0); svfloat32_t v347 = svcmla_f32_x(pred_full, zero347, v896, v218, 90); - svfloat32_t zero368; - asm volatile("mov %0.s, #0" : "=w"(zero368)); + svfloat32_t zero368 = svdup_n_f32(0); svfloat32_t v368 = svcmla_f32_x(pred_full, zero368, v899, v221, 90); - svfloat32_t zero452; - asm volatile("mov %0.s, #0" : "=w"(zero452)); + svfloat32_t zero452 = svdup_n_f32(0); svfloat32_t v452 = svcmla_f32_x(pred_full, zero452, v911, v231, 90); - svfloat32_t zero459; - asm volatile("mov %0.s, #0" : "=w"(zero459)); + svfloat32_t zero459 = svdup_n_f32(0); svfloat32_t v459 = svcmla_f32_x(pred_full, zero459, v912, v232, 90); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v1123), "w"(v177)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v195), "w"(v197)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v227), "w"(v229)); - svfloat32_t v258; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v188), "w"(v879)); - svfloat32_t v273; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v191), "w"(v882)); - svfloat32_t v333; - asm("fmul %0.s, 
%1.s, %2.s" : "=w"(v333) : "w"(v201), "w"(v894)); - svfloat32_t zero340; - asm volatile("mov %0.s, #0" : "=w"(zero340)); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v1123, v177); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v227, v229); + svfloat32_t v258 = svmul_f32_x(svptrue_b32(), v188, v879); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v191, v882); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v201, v894); + svfloat32_t zero340 = svdup_n_f32(0); svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v895, v217, 90); - svfloat32_t zero466; - asm volatile("mov %0.s, #0" : "=w"(zero466)); + svfloat32_t zero466 = svdup_n_f32(0); svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v913, v233, 90); svfloat32_t v467 = svmla_f32_x(pred_full, v253, v186, v877); svfloat32_t v468 = svmla_f32_x(pred_full, v268, v189, v880); svfloat32_t v498 = svcmla_f32_x(pred_full, v347, v897, v219, 90); svfloat32_t v499 = svcmla_f32_x(pred_full, v368, v900, v222, 90); - svfloat32_t v318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v198), "w"(v891)); - svfloat32_t zero445; - asm volatile("mov %0.s, #0" : "=w"(zero445)); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v198, v891); + svfloat32_t zero445 = svdup_n_f32(0); svfloat32_t v445 = svcmla_f32_x(pred_full, zero445, v910, v230, 90); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v467), "w"(v468)); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v467, v468); svfloat32_t v471 = svmla_f32_x(pred_full, v258, v186, v877); svfloat32_t v472 = svmla_f32_x(pred_full, v273, v189, v880); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v467), "w"(v468)); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v467, v468); svfloat32_t v491 = svnmls_f32_x(pred_full, v333, v199, v892); svfloat32_t v492 = svnmls_f32_x(pred_full, v333, v200, v893); svfloat32_t v493 = svmla_f32_x(pred_full, v185, v177, v876); - svfloat32_t v501; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v501) : "w"(v498), "w"(v499)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v498, v499); svfloat32_t v502 = svcmla_f32_x(pred_full, v347, v898, v220, 90); svfloat32_t v503 = svcmla_f32_x(pred_full, v368, v901, v223, 90); - svfloat32_t v520; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v498), "w"(v499)); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v452), "w"(v466)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v459), "w"(v466)); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v498, v499); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v452, v466); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v459, v466); svst1_f64(pred_full, (double *)(v921), svreinterpret_f64_f32(v185)); svfloat32_t v469 = svmla_f32_x(pred_full, v318, v197, v890); svfloat32_t v473 = svmla_f32_x(pred_full, v318, v195, v889); svfloat32_t v474 = svnmls_f32_x(pred_full, v470, v165, v883); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v471), "w"(v472)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v471), "w"(v472)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v471, v472); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v471, v472); svfloat32_t v486 = svmla_f32_x(pred_full, v470, v164, v888); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v493), "w"(v491)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v493), "w"(v491)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v493), "w"(v492)); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v493, v491); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v493, v491); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v493, v492); svfloat32_t v500 = svcmla_f32_x(pred_full, v445, v909, v229, 90); svfloat32_t v504 = svcmla_f32_x(pred_full, v445, v908, v227, 90); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v389), "w"(v501)); - svfloat32_t v506; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v502), "w"(v503)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v502), "w"(v503)); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v389, v501); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v502, v503); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v502, v503); svfloat32_t v517 = svcmla_f32_x(pred_full, v501, v907, v204, 90); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v340), "w"(v522)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v340), "w"(v522)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v340), "w"(v523)); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v340, v522); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v340, v522); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v340, v523); svfloat32_t v476 = svnmls_f32_x(pred_full, v473, v167, v886); svfloat32_t v477 = svmla_f32_x(pred_full, v469, v192, v884); svfloat32_t v479 = svmla_f32_x(pred_full, v475, v193, v887); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v481), "w"(v469)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v474), "w"(v475)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v473)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v492)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v410), "w"(v504)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v469); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v474, v475); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v473); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v495, v492); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v410, v504); svfloat32_t v508 = svcmla_f32_x(pred_full, v500, v903, v224, 90); svfloat32_t v510 = svcmla_f32_x(pred_full, v506, v906, v225, 90); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v500)); 
- svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v505), "w"(v506)); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v504)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v525), "w"(v523)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v477), "w"(v474)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v479), "w"(v476)); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v500); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v505, v506); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v504); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v525, v523); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v477, v474); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v479, v476); svfloat32_t v484 = svmla_f32_x(pred_full, v483, v162, v885); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v486), "w"(v476)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v508), "w"(v505)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v510), "w"(v507)); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v486, v476); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v505); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v510, v507); svfloat32_t v515 = svcmla_f32_x(pred_full, v514, v904, v202, 90); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v517), "w"(v507)); - svfloat32_t v532; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v532) : "w"(v490), "w"(v482)); - svfloat32_t v536; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v497), "w"(v490)); - svfloat32_t v539; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v482), "w"(v497)); - svfloat32_t v544; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v544) : "w"(v521), "w"(v513)); - svfloat32_t v548; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v521), "w"(v527)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v513), "w"(v527)); - 
svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v484), "w"(v473)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v469)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v515), "w"(v504)); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v518), "w"(v500)); - svfloat32_t v533; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v533) : "w"(v532), "w"(v497)); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v478), "w"(v494)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v480), "w"(v496)); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v544), "w"(v527)); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v509), "w"(v524)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v511), "w"(v526)); - svfloat32_t v575; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v539), "w"(v551)); - svfloat32_t v583; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v539), "w"(v551)); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v536), "w"(v548)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v536), "w"(v548)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v485), "w"(v478)); - svfloat32_t v530; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v488), "w"(v480)); - svfloat32_t v534; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v534) : "w"(v494), "w"(v485)); - svfloat32_t v535; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v496), "w"(v488)); - svfloat32_t v540; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v516), "w"(v509)); - svfloat32_t v542; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v542) : "w"(v519), "w"(v511)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v524), "w"(v516)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v526), "w"(v519)); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : 
"w"(v538), "w"(v550)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v538), "w"(v550)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v533), "w"(v545)); - svfloat32_t v631; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v533), "w"(v545)); - svfloat32_t v671; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v537), "w"(v549)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v537), "w"(v549)); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v517, v507); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v490, v482); + svfloat32_t v536 = svsub_f32_x(svptrue_b32(), v497, v490); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v482, v497); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v521, v513); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v521, v527); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v513, v527); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v484, v473); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v487, v469); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v515, v504); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v518, v500); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v532, v497); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v478, v494); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v480, v496); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v544, v527); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v509, v524); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v511, v526); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v539, v551); + svfloat32_t v583 = svadd_f32_x(svptrue_b32(), v539, v551); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v536, v548); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v536, v548); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v485, v478); + svfloat32_t v530 = svsub_f32_x(svptrue_b32(), v488, v480); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v494, v485); + svfloat32_t v535 = svsub_f32_x(svptrue_b32(), v496, v488); + 
svfloat32_t v540 = svsub_f32_x(svptrue_b32(), v516, v509); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v519, v511); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v524, v516); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v526, v519); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v538, v550); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v538, v550); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v533, v545); + svfloat32_t v631 = svsub_f32_x(svptrue_b32(), v533, v545); + svfloat32_t v671 = svsub_f32_x(svptrue_b32(), v537, v549); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v537, v549); svst1_f64(pred_full, (double *)(v948), svreinterpret_f64_f32(v575)); svst1_f64(pred_full, (double *)(v957), svreinterpret_f64_f32(v583)); svst1_f64(pred_full, (double *)(v966), svreinterpret_f64_f32(v591)); svst1_f64(pred_full, (double *)(v975), svreinterpret_f64_f32(v599)); - svfloat32_t v529; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v528), "w"(v494)); - svfloat32_t v531; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v530), "w"(v496)); - svfloat32_t v541; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v541) : "w"(v540), "w"(v524)); - svfloat32_t v543; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v543) : "w"(v542), "w"(v526)); - svfloat32_t v639; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v535), "w"(v547)); - svfloat32_t v647; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v647) : "w"(v535), "w"(v547)); - svfloat32_t v655; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v655) : "w"(v534), "w"(v546)); - svfloat32_t v663; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v534), "w"(v546)); + svfloat32_t v529 = svadd_f32_x(svptrue_b32(), v528, v494); + svfloat32_t v531 = svadd_f32_x(svptrue_b32(), v530, v496); + svfloat32_t v541 = svadd_f32_x(svptrue_b32(), v540, v524); + svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v542, v526); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v647 = svsub_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v655 = 
svadd_f32_x(svptrue_b32(), v534, v546); + svfloat32_t v663 = svsub_f32_x(svptrue_b32(), v534, v546); svst1_f64(pred_full, (double *)(v984), svreinterpret_f64_f32(v607)); svst1_f64(pred_full, (double *)(v993), svreinterpret_f64_f32(v615)); svst1_f64(pred_full, (double *)(v1002), svreinterpret_f64_f32(v623)); svst1_f64(pred_full, (double *)(v1011), svreinterpret_f64_f32(v631)); svst1_f64(pred_full, (double *)(v1056), svreinterpret_f64_f32(v671)); svst1_f64(pred_full, (double *)(v1065), svreinterpret_f64_f32(v679)); - svfloat32_t v559; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v529), "w"(v541)); - svfloat32_t v567; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v529), "w"(v541)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v531), "w"(v543)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v531), "w"(v543)); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v529, v541); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v529, v541); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v531, v543); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v531, v543); svst1_f64(pred_full, (double *)(v1020), svreinterpret_f64_f32(v639)); svst1_f64(pred_full, (double *)(v1029), svreinterpret_f64_f32(v647)); svst1_f64(pred_full, (double *)(v1038), svreinterpret_f64_f32(v655)); @@ -11133,239 +10052,136 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu20(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v710)[0])); svfloat32_t v975 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v728)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v937), "w"(v939)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v937), "w"(v939)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v941), "w"(v943)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v941), "w"(v943)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v66) : "w"(v945), "w"(v947)); - svfloat32_t v67; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v945), "w"(v947)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v949), "w"(v951)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v949), "w"(v951)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v953), "w"(v955)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v953), "w"(v955)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v957), "w"(v959)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v957), "w"(v959)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v961), "w"(v963)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v961), "w"(v963)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v965), "w"(v967)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v965), "w"(v967)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v969), "w"(v971)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v969), "w"(v971)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v973), "w"(v975)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v973), "w"(v975)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v66), "w"(v82)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v66), "w"(v82)); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v100), "w"(v116)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v100), "w"(v116)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v134), "w"(v150)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v153) : "w"(v134), "w"(v150)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v168), "w"(v184)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v168), "w"(v184)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v67), "w"(v169)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v67), "w"(v169)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v135), "w"(v101)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v135), "w"(v101)); - svfloat32_t v347; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v83), "w"(v185)); - svfloat32_t v348; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v83), "w"(v185)); - svfloat32_t v349; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v151), "w"(v117)); - svfloat32_t v350; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v151), "w"(v117)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v84), "w"(v186)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v84), "w"(v186)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v152), "w"(v118)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v152), "w"(v118)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v85), "w"(v187)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v85), "w"(v187)); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v153), "w"(v119)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v153), "w"(v119)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v294), "w"(v296)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v294), "w"(v296)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v295), "w"(v297)); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), 
v937, v939); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v937, v939); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v941, v943); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v941, v943); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v945, v947); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v945, v947); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v949, v951); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v949, v951); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v953, v955); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v953, v955); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v957, v959); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v957, v959); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v961, v963); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v961, v963); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v965, v967); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v965, v967); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v969, v971); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v969, v971); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v973, v975); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v973, v975); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v67, v169); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v67, v169); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v135, v101); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v135, 
v101); + svfloat32_t v347 = svadd_f32_x(svptrue_b32(), v83, v185); + svfloat32_t v348 = svsub_f32_x(svptrue_b32(), v83, v185); + svfloat32_t v349 = svadd_f32_x(svptrue_b32(), v151, v117); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v151, v117); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v84, v186); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v84, v186); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v152, v118); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v152, v118); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v85, v187); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v85, v187); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v153, v119); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v153, v119); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v295, v297); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v746, v295, 90); - svfloat32_t v351; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v347), "w"(v349)); - svfloat32_t v352; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v347), "w"(v349)); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v348), "w"(v350)); - svfloat32_t v390; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v350), "w"(v754)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v188), "w"(v190)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v188), "w"(v190)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v189), "w"(v191)); - svfloat32_t zero217; - asm volatile("mov %0.s, #0" : "=w"(zero217)); + svfloat32_t v351 = svadd_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v352 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v390 = svmul_f32_x(svptrue_b32(), v350, v754); + svfloat32_t v192 = 
svadd_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v189, v191); + svfloat32_t zero217 = svdup_n_f32(0); svfloat32_t v217 = svcmla_f32_x(pred_full, zero217, v746, v189, 90); - svfloat32_t v245; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v241), "w"(v243)); - svfloat32_t v246; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v241), "w"(v243)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v242), "w"(v244)); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v241, v243); + svfloat32_t v246 = svsub_f32_x(svptrue_b32(), v241, v243); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v242, v244); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v746, v242, 90); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v298), "w"(v33)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v298, v33); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v747, v300, 90); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v351), "w"(v49)); - svfloat32_t zero375; - asm volatile("mov %0.s, #0" : "=w"(zero375)); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v351, v49); + svfloat32_t zero375 = svdup_n_f32(0); svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v751, v352, 90); - svfloat32_t v385; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v353), "w"(v753)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v192), "w"(v50)); - svfloat32_t zero224; - asm volatile("mov %0.s, #0" : "=w"(zero224)); + svfloat32_t v385 = svmul_f32_x(svptrue_b32(), v353, v753); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v192, v50); + svfloat32_t zero224 = svdup_n_f32(0); svfloat32_t v224 = 
svcmla_f32_x(pred_full, zero224, v747, v194, 90); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v245), "w"(v51)); - svfloat32_t zero277; - asm volatile("mov %0.s, #0" : "=w"(zero277)); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v245, v51); + svfloat32_t zero277 = svdup_n_f32(0); svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v747, v247, 90); svfloat32_t v338 = svmla_f32_x(pred_full, v301, v298, v744); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v323), "w"(v330)); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v323, v330); svfloat32_t v342 = svcmla_f32_x(pred_full, v330, v748, v297, 90); - svfloat32_t zero361; - asm volatile("mov %0.s, #0" : "=w"(zero361)); + svfloat32_t zero361 = svdup_n_f32(0); svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v749, v354, 90); svfloat32_t v394 = svnmls_f32_x(pred_full, v385, v348, v752); svfloat32_t v395 = svmla_f32_x(pred_full, v390, v353, v753); svfloat32_t v232 = svmla_f32_x(pred_full, v195, v192, v744); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v217), "w"(v224)); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v217, v224); svfloat32_t v236 = svcmla_f32_x(pred_full, v224, v748, v191, 90); svfloat32_t v285 = svmla_f32_x(pred_full, v248, v245, v744); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v270), "w"(v277)); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v270, v277); svfloat32_t v289 = svcmla_f32_x(pred_full, v277, v748, v244, 90); svfloat32_t v339 = svmla_f32_x(pred_full, v338, v299, v745); svfloat32_t v340 = svmls_f32_x(pred_full, v338, v299, v745); svfloat32_t v391 = svcmla_f32_x(pred_full, v361, v750, v351, 90); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v301), "w"(v361)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v301), "w"(v361)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v301, v361); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v301, v361); 
svst1_f64(pred_full, (double *)(v762), svreinterpret_f64_f32(v195)); svst1_f64(pred_full, (double *)(v780), svreinterpret_f64_f32(v248)); svfloat32_t v233 = svmla_f32_x(pred_full, v232, v193, v745); svfloat32_t v234 = svmls_f32_x(pred_full, v232, v193, v745); svfloat32_t v286 = svmla_f32_x(pred_full, v285, v246, v745); svfloat32_t v287 = svmls_f32_x(pred_full, v285, v246, v745); - svfloat32_t v343; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v339), "w"(v341)); - svfloat32_t v344; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v339), "w"(v341)); - svfloat32_t v345; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v340), "w"(v342)); - svfloat32_t v346; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v340), "w"(v342)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v375)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v391), "w"(v375)); + svfloat32_t v343 = svadd_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v344 = svsub_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v346 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v391, v375); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v391, v375); svst1_f64(pred_full, (double *)(v771), svreinterpret_f64_f32(v401)); svst1_f64(pred_full, (double *)(v789), svreinterpret_f64_f32(v400)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v233), "w"(v235)); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v233), "w"(v235)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v234), "w"(v236)); - svfloat32_t v240; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v234), "w"(v236)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v286), "w"(v288)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v286), "w"(v288)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v292) : "w"(v287), "w"(v289)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v287), "w"(v289)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v392), "w"(v394)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v392), "w"(v394)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v393), "w"(v395)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v393), "w"(v395)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v344), "w"(v397)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v344), "w"(v397)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v346), "w"(v399)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v346), "w"(v399)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v345), "w"(v398)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v345), "w"(v398)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v343), "w"(v396)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v343), "w"(v396)); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v392, v394); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v392, v394); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v393, v395); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v393, v395); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v344, v397); + svfloat32_t v431 = 
svsub_f32_x(svptrue_b32(), v344, v397); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v346, v399); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v346, v399); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v345, v398); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v345, v398); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v343, v396); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v343, v396); svst1_f64(pred_full, (double *)(v798), svreinterpret_f64_f32(v238)); svst1_f64(pred_full, (double *)(v816), svreinterpret_f64_f32(v291)); svst1_f64(pred_full, (double *)(v834), svreinterpret_f64_f32(v240)); @@ -12300,215 +11116,119 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v801)[0])); svfloat32_t v1071 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v810)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1031), "w"(v1033)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1031), "w"(v1033)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v1037), "w"(v1039)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v1037), "w"(v1039)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1043), "w"(v1045)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1043), "w"(v1045)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v1049), "w"(v1051)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v1049), "w"(v1051)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1055), "w"(v1057)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1055), "w"(v1057)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v1061), "w"(v1063)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v1061), "w"(v1063)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v176) : "w"(v1067), "w"(v1069)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v1067), "w"(v1069)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v1035)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v1041)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v1047)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v1053)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v1059)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v1065)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v176), "w"(v1071)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v56), "w"(v176)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v56), "w"(v176)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v128), "w"(v104)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v128), "w"(v104)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v80), "w"(v152)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v80), "w"(v152)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v57), "w"(v177)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v57), "w"(v177)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v129), "w"(v105)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v129), "w"(v105)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v81), "w"(v153)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v81), "w"(v153)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v65), "w"(v185)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v65), "w"(v185)); - svfloat32_t v188; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v137), "w"(v113)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v137), "w"(v113)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v89), "w"(v161)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v89), "w"(v161)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v275), "w"(v277)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v275), "w"(v277)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v277), "w"(v279)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v279), "w"(v275)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v276), "w"(v278)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v276), "w"(v278)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v278), "w"(v280)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v280), "w"(v276)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v364), "w"(v366)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v364), "w"(v366)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v366), "w"(v368)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v368), "w"(v364)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v365), "w"(v367)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v365), "w"(v367)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v367), "w"(v369)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v369), "w"(v365)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v186), "w"(v188)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v186), "w"(v188)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v188), "w"(v190)); 
- svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v190), "w"(v186)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v187), "w"(v189)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v187), "w"(v189)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v189), "w"(v191)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v191), "w"(v187)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v281), "w"(v279)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v280)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1031, v1033); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1031, v1033); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v1037, v1039); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v1037, v1039); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1043, v1045); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1043, v1045); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v1049, v1051); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v1049, v1051); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1055, v1057); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1055, v1057); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v1061, v1063); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v1061, v1063); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v1067, v1069); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v1067, v1069); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v1035); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v1041); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v1047); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v1053); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v1059); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v1065); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v176, v1071); + 
svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v56, v176); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v56, v176); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v128, v104); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v128, v104); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v80, v152); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v80, v152); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v57, v177); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v57, v177); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v129, v105); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v129, v105); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v81, v153); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v81, v153); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v65, v185); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v65, v185); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v137, v113); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v137, v113); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v279, v275); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v278, v280); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v280, v276); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v368, v364); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v367, v369); 
+ svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v190, v186); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v189, v191); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v191, v187); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v281, v279); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v280); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v828, v289, 90); - svfloat32_t zero337; - asm volatile("mov %0.s, #0" : "=w"(zero337)); + svfloat32_t zero337 = svdup_n_f32(0); svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v829, v290, 90); - svfloat32_t zero344; - asm volatile("mov %0.s, #0" : "=w"(zero344)); + svfloat32_t zero344 = svdup_n_f32(0); svfloat32_t v344 = svcmla_f32_x(pred_full, zero344, v830, v291, 90); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v368)); - svfloat32_t v377; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v376), "w"(v369)); - svfloat32_t zero401; - asm volatile("mov %0.s, #0" : "=w"(zero401)); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v368); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v376, v369); + svfloat32_t zero401 = svdup_n_f32(0); svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v833, v373, 90); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); + svfloat32_t zero408 = svdup_n_f32(0); svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v834, v374, 90); - svfloat32_t zero415; - asm volatile("mov %0.s, #0" : "=w"(zero415)); + svfloat32_t zero415 = svdup_n_f32(0); svfloat32_t v415 = svcmla_f32_x(pred_full, zero415, v835, v375, 90); - svfloat32_t v425; - 
asm("fmul %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v378), "w"(v837)); - svfloat32_t v430; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v379), "w"(v838)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v190)); - svfloat32_t v199; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v198), "w"(v191)); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t v425 = svmul_f32_x(svptrue_b32(), v378, v837); + svfloat32_t v430 = svmul_f32_x(svptrue_b32(), v379, v838); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v190); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v198, v191); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v819, v200, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, zero248, v820, v201, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v821, v202, 90); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v32)); - svfloat32_t v301; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v282), "w"(v823)); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v32); + svfloat32_t v301 = svmul_f32_x(svptrue_b32(), v282, v823); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v827, v288, 90); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v371), "w"(v33)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v193), "w"(v41)); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v371, v33); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v193, v41); + svfloat32_t zero234 = svdup_n_f32(0); 
svfloat32_t v234 = svcmla_f32_x(pred_full, zero234, v818, v199, 90); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v323), "w"(v330)); - svfloat32_t v354; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v323), "w"(v330)); - svfloat32_t v356; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v323), "w"(v337)); - svfloat32_t zero387; - asm volatile("mov %0.s, #0" : "=w"(zero387)); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v323, v330); + svfloat32_t v354 = svsub_f32_x(svptrue_b32(), v323, v330); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v323, v337); + svfloat32_t zero387 = svdup_n_f32(0); svfloat32_t v387 = svcmla_f32_x(pred_full, zero387, v831, v372, 90); svfloat32_t v443 = svmla_f32_x(pred_full, v425, v377, v836); svfloat32_t v445 = svnmls_f32_x(pred_full, v425, v377, v836); svfloat32_t v447 = svnmls_f32_x(pred_full, v430, v377, v836); svfloat32_t v256 = svmla_f32_x(pred_full, v194, v193, v814); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v234), "w"(v241)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v234), "w"(v241)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v234), "w"(v248)); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v234, v248); svfloat32_t v345 = svmla_f32_x(pred_full, v301, v283, v822); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v352), "w"(v337)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v354), "w"(v344)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v356), "w"(v344)); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v352, v337); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v354, v344); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v356, v344); svfloat32_t v436 = svcmla_f32_x(pred_full, v387, v832, v371, 90); svfloat32_t v444 = 
svmla_f32_x(pred_full, v443, v379, v838); svfloat32_t v446 = svmls_f32_x(pred_full, v445, v380, v839); @@ -12518,117 +11238,70 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v257 = svmla_f32_x(pred_full, v256, v195, v815); svfloat32_t v259 = svmls_f32_x(pred_full, v256, v195, v815); svfloat32_t v261 = svmls_f32_x(pred_full, v256, v196, v816); - svfloat32_t v264; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v263), "w"(v248)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v265), "w"(v255)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v267), "w"(v255)); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v263, v248); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v265, v255); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v267, v255); svfloat32_t v346 = svmla_f32_x(pred_full, v345, v284, v824); svfloat32_t v348 = svmls_f32_x(pred_full, v345, v284, v824); svfloat32_t v350 = svmls_f32_x(pred_full, v345, v285, v825); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v436), "w"(v401)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v436), "w"(v401)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v436), "w"(v408)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v455), "w"(v387)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v455), "w"(v387)); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v436, v401); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v436, v401); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v436, v408); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v455, v387); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v455, v387); svfloat32_t v258 = svmla_f32_x(pred_full, v257, v196, v816); svfloat32_t v260 = svmls_f32_x(pred_full, v259, v197, v817); svfloat32_t v262 = svmla_f32_x(pred_full, v261, v197, v817); svfloat32_t v347 = svmla_f32_x(pred_full, v346, 
v285, v825); svfloat32_t v349 = svmls_f32_x(pred_full, v348, v286, v826); svfloat32_t v351 = svmla_f32_x(pred_full, v350, v286, v826); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v437), "w"(v408)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v439), "w"(v415)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v441), "w"(v415)); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v437, v408); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v439, v415); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v441, v415); svst1_f64(pred_full, (double *)(v856), svreinterpret_f64_f32(v457)); svst1_f64(pred_full, (double *)(v865), svreinterpret_f64_f32(v456)); - svfloat32_t v269; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v258), "w"(v264)); - svfloat32_t v270; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v258), "w"(v264)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v260), "w"(v266)); - svfloat32_t v272; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v260), "w"(v266)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v262), "w"(v268)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v262), "w"(v268)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v347), "w"(v353)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v347), "w"(v353)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v349), "w"(v355)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v349), "w"(v355)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v351), "w"(v357)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v351), "w"(v357)); - svfloat32_t v449; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v438), "w"(v444)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v438), "w"(v444)); - svfloat32_t v451; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v451) : "w"(v440), "w"(v446)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v440), "w"(v446)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v442), "w"(v448)); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v442), "w"(v448)); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v270), "w"(v359)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v272), "w"(v361)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v273), "w"(v362)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v274), "w"(v363)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v271), "w"(v360)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v269), "w"(v358)); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v260, v266); + svfloat32_t v272 = svsub_f32_x(svptrue_b32(), v260, v266); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v262, v268); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v262, v268); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v351, v357); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v351, v357); + svfloat32_t v449 = svadd_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v440, v446); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v440, v446); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v442, v448); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v442, v448); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v270, v359); + 
svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v272, v361); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v273, v362); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v274, v363); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v271, v360); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v269, v358); svst1_f64(pred_full, (double *)(v874), svreinterpret_f64_f32(v270)); svst1_f64(pred_full, (double *)(v901), svreinterpret_f64_f32(v272)); svst1_f64(pred_full, (double *)(v928), svreinterpret_f64_f32(v273)); svst1_f64(pred_full, (double *)(v955), svreinterpret_f64_f32(v274)); svst1_f64(pred_full, (double *)(v982), svreinterpret_f64_f32(v271)); svst1_f64(pred_full, (double *)(v1009), svreinterpret_f64_f32(v269)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v479), "w"(v450)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v479), "w"(v450)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v452)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v503), "w"(v452)); - svfloat32_t v528; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v527), "w"(v453)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v527), "w"(v453)); - svfloat32_t v552; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v551), "w"(v454)); - svfloat32_t v553; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v551), "w"(v454)); - svfloat32_t v576; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v575), "w"(v451)); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v575), "w"(v451)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v599), "w"(v449)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v599), "w"(v449)); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v479, v450); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v479, v450); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v503, v452); + svfloat32_t v505 = 
svsub_f32_x(svptrue_b32(), v503, v452); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v527, v453); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v527, v453); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v551, v454); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v551, v454); + svfloat32_t v576 = svadd_f32_x(svptrue_b32(), v575, v451); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v575, v451); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v599, v449); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v599, v449); svst1_f64(pred_full, (double *)(v883), svreinterpret_f64_f32(v481)); svst1_f64(pred_full, (double *)(v892), svreinterpret_f64_f32(v480)); svst1_f64(pred_full, (double *)(v910), svreinterpret_f64_f32(v505)); @@ -13636,240 +12309,128 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v953)[0])); svfloat32_t v1249 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v962)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1207), "w"(v1209)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1207), "w"(v1209)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1211), "w"(v1213)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1211), "w"(v1213)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v1215), "w"(v1217)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v1215), "w"(v1217)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1219), "w"(v1221)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1219), "w"(v1221)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v1223), "w"(v1225)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v1223), "w"(v1225)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v1227), "w"(v1229)); - svfloat32_t v113; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v1227), "w"(v1229)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1231), "w"(v1233)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1231), "w"(v1233)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v1235), "w"(v1237)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v1235), "w"(v1237)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v1239), "w"(v1241)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v1239), "w"(v1241)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1243), "w"(v1245)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v1243), "w"(v1245)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v1247), "w"(v1249)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v1247), "w"(v1249)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v48), "w"(v192)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v64), "w"(v176)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v80), "w"(v160)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v96), "w"(v144)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v112), "w"(v128)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v48), "w"(v192)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v64), "w"(v176)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v80), "w"(v160)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v96), "w"(v144)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v112), "w"(v128)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v49), "w"(v193)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : 
"w"(v65), "w"(v177)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v81), "w"(v161)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v97), "w"(v145)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v113), "w"(v129)); - svfloat32_t v408; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v49), "w"(v193)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v65), "w"(v177)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v81), "w"(v161)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v97), "w"(v145)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v113), "w"(v129)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v194), "w"(v195)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v196), "w"(v198)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v200), "w"(v201)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v199), "w"(v203)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v195), "w"(v197)); - svfloat32_t v214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v194), "w"(v197)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v195), "w"(v194)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v198), "w"(v197)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v196), "w"(v197)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v198), "w"(v196)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v195), "w"(v198)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v194), "w"(v196)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v200), "w"(v202)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v199), "w"(v202)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v224) : "w"(v199), "w"(v200)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v202), "w"(v203)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v201), "w"(v202)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v201), "w"(v203)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v200), "w"(v203)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v199), "w"(v201)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v403), "w"(v404)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v405), "w"(v407)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v409), "w"(v410)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v408), "w"(v412)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v404), "w"(v406)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v403), "w"(v406)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v404), "w"(v403)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v407), "w"(v406)); - svfloat32_t v426; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v405), "w"(v406)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v407), "w"(v405)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v404), "w"(v407)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v403), "w"(v405)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v409), "w"(v411)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v408), "w"(v411)); - svfloat32_t v433; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v408), "w"(v409)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v411), "w"(v412)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v410), "w"(v411)); - svfloat32_t v436; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v410), "w"(v412)); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v409), "w"(v412)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v408), "w"(v410)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v197), "w"(v204)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v207), "w"(v208)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v205), "w"(v204)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v207), "w"(v208)); - svfloat32_t v257; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v214), "w"(v990)); - svfloat32_t v262; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v215), "w"(v991)); - svfloat32_t v272; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v217), "w"(v993)); - svfloat32_t v277; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v218), "w"(v994)); - svfloat32_t zero299; - asm volatile("mov %0.s, #0" : "=w"(zero299)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1207, v1209); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1207, v1209); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1211, v1213); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1211, v1213); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v1215, v1217); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v1215, v1217); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1219, v1221); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1219, v1221); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v1223, v1225); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v1223, v1225); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v1227, v1229); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v1227, v1229); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1231, v1233); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1231, v1233); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v1235, v1237); + svfloat32_t v145 = 
svsub_f32_x(svptrue_b32(), v1235, v1237); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v1239, v1241); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v1239, v1241); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v1243, v1245); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v1243, v1245); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v1247, v1249); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v1247, v1249); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v48, v192); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v64, v176); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v80, v160); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v112, v128); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v48, v192); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v64, v176); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v80, v160); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v112, v128); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v49, v193); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v65, v177); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v81, v161); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v49, v193); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v65, v177); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v81, v161); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v194, v195); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v200, v201); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v199, v203); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v194, v197); + 
svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v195, v194); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v198, v197); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v196, v197); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v198, v196); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v195, v198); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v194, v196); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v200, v202); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v199, v202); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v202, v203); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v201, v202); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v201, v203); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v200, v203); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v403, v404); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v409, v410); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v408, v412); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v403, v406); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v404, v403); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v407, v406); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v405, v406); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v407, v405); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v404, v407); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v403, v405); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v408, v411); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v408, v409); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v411, v412); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v410, v411); + svfloat32_t v436 = svsub_f32_x(svptrue_b32(), v410, v412); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), 
v409, v412); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v408, v410); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v197, v204); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v207, v208); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v205, v204); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v207, v208); + svfloat32_t v257 = svmul_f32_x(svptrue_b32(), v214, v990); + svfloat32_t v262 = svmul_f32_x(svptrue_b32(), v215, v991); + svfloat32_t v272 = svmul_f32_x(svptrue_b32(), v217, v993); + svfloat32_t v277 = svmul_f32_x(svptrue_b32(), v218, v994); + svfloat32_t zero299 = svdup_n_f32(0); svfloat32_t v299 = svcmla_f32_x(pred_full, zero299, v998, v222, 90); - svfloat32_t zero313; - asm volatile("mov %0.s, #0" : "=w"(zero313)); + svfloat32_t zero313 = svdup_n_f32(0); svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v1000, v224, 90); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v1001, v225, 90); - svfloat32_t zero334; - asm volatile("mov %0.s, #0" : "=w"(zero334)); + svfloat32_t zero334 = svdup_n_f32(0); svfloat32_t v334 = svcmla_f32_x(pred_full, zero334, v1003, v227, 90); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v1004, v228, 90); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v406), "w"(v413)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v416), "w"(v417)); - svfloat32_t v430; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v414), "w"(v413)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v416), "w"(v417)); - svfloat32_t v466; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v423), "w"(v990)); - svfloat32_t v471; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v424), "w"(v991)); - svfloat32_t v481; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v426), 
"w"(v993)); - svfloat32_t v486; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v427), "w"(v994)); - svfloat32_t zero508; - asm volatile("mov %0.s, #0" : "=w"(zero508)); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v406, v413); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v416, v417); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v414, v413); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v416, v417); + svfloat32_t v466 = svmul_f32_x(svptrue_b32(), v423, v990); + svfloat32_t v471 = svmul_f32_x(svptrue_b32(), v424, v991); + svfloat32_t v481 = svmul_f32_x(svptrue_b32(), v426, v993); + svfloat32_t v486 = svmul_f32_x(svptrue_b32(), v427, v994); + svfloat32_t zero508 = svdup_n_f32(0); svfloat32_t v508 = svcmla_f32_x(pred_full, zero508, v998, v431, 90); - svfloat32_t zero522; - asm volatile("mov %0.s, #0" : "=w"(zero522)); + svfloat32_t zero522 = svdup_n_f32(0); svfloat32_t v522 = svcmla_f32_x(pred_full, zero522, v1000, v433, 90); - svfloat32_t zero529; - asm volatile("mov %0.s, #0" : "=w"(zero529)); + svfloat32_t zero529 = svdup_n_f32(0); svfloat32_t v529 = svcmla_f32_x(pred_full, zero529, v1001, v434, 90); - svfloat32_t zero543; - asm volatile("mov %0.s, #0" : "=w"(zero543)); + svfloat32_t zero543 = svdup_n_f32(0); svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1003, v436, 90); - svfloat32_t zero550; - asm volatile("mov %0.s, #0" : "=w"(zero550)); + svfloat32_t zero550 = svdup_n_f32(0); svfloat32_t v550 = svcmla_f32_x(pred_full, zero550, v1004, v437, 90); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v206), "w"(v205)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v211), "w"(v202)); - svfloat32_t v292; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v221), "w"(v997)); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v206, v205); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v211, v202); + svfloat32_t v292 = svmul_f32_x(svptrue_b32(), 
v221, v997); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v1006, v230, 90); svfloat32_t v357 = svmla_f32_x(pred_full, v257, v213, v989); svfloat32_t v358 = svmla_f32_x(pred_full, v262, v214, v990); @@ -13878,19 +12439,13 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v361 = svmla_f32_x(pred_full, v277, v217, v993); svfloat32_t v362 = svnmls_f32_x(pred_full, v277, v216, v992); svfloat32_t v365 = svcmla_f32_x(pred_full, v313, v999, v223, 90); - svfloat32_t v366; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v299), "w"(v313)); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v299, v313); svfloat32_t v367 = svcmla_f32_x(pred_full, v334, v1002, v226, 90); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v320), "w"(v334)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v415), "w"(v414)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v411)); - svfloat32_t v501; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v430), "w"(v997)); - svfloat32_t zero564; - asm volatile("mov %0.s, #0" : "=w"(zero564)); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v320, v334); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v415, v414); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v420, v411); + svfloat32_t v501 = svmul_f32_x(svptrue_b32(), v430, v997); + svfloat32_t zero564 = svdup_n_f32(0); svfloat32_t v564 = svcmla_f32_x(pred_full, zero564, v1006, v439, 90); svfloat32_t v566 = svmla_f32_x(pred_full, v466, v422, v989); svfloat32_t v567 = svmla_f32_x(pred_full, v471, v423, v990); @@ -13899,163 +12454,91 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v570 = svmla_f32_x(pred_full, v486, v426, v993); svfloat32_t v571 = svnmls_f32_x(pred_full, v486, v425, v992); svfloat32_t v574 = svcmla_f32_x(pred_full, v522, v999, v432, 90); - svfloat32_t v575; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v575) : "w"(v508), "w"(v522)); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v508, v522); svfloat32_t v576 = svcmla_f32_x(pred_full, v543, v1002, v435, 90); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v529), "w"(v543)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v32), "w"(v209)); - svfloat32_t zero247; - asm volatile("mov %0.s, #0" : "=w"(zero247)); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v529, v543); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v32, v209); + svfloat32_t zero247 = svdup_n_f32(0); svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v988, v212, 90); svfloat32_t v363 = svmla_f32_x(pred_full, v292, v220, v996); svfloat32_t v364 = svmla_f32_x(pred_full, v292, v219, v995); svfloat32_t v369 = svcmla_f32_x(pred_full, v355, v1005, v229, 90); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v341), "w"(v355)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v365), "w"(v366)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v33), "w"(v418)); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v365, v366); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v33, v418); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v988, v421, 90); svfloat32_t v572 = svmla_f32_x(pred_full, v501, v429, v996); svfloat32_t v573 = svmla_f32_x(pred_full, v501, v428, v995); svfloat32_t v578 = svcmla_f32_x(pred_full, v564, v1005, v438, 90); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v550), "w"(v564)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v574), "w"(v575)); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v550, v564); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v574, v575); svfloat32_t v356 = svmls_f32_x(pred_full, v210, v209, 
v987); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v361), "w"(v363)); - svfloat32_t v381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v247), "w"(v367)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v369), "w"(v365)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v247), "w"(v370)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v370), "w"(v366)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v367)); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v247, v367); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v247, v370); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v370, v366); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v367); svfloat32_t v565 = svmls_f32_x(pred_full, v419, v418, v987); - svfloat32_t v580; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v570), "w"(v572)); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v456), "w"(v576)); - svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v578), "w"(v574)); - svfloat32_t v594; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v456), "w"(v579)); - svfloat32_t v596; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v579), "w"(v575)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v598), "w"(v576)); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v570, v572); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v456, v576); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v578, v574); + svfloat32_t v594 = svadd_f32_x(svptrue_b32(), v456, v579); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v579, v575); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v576); svst1_f64(pred_full, (double *)(v1014), svreinterpret_f64_f32(v210)); svst1_f64(pred_full, (double *)(v1023), svreinterpret_f64_f32(v419)); - 
svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v371), "w"(v356)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v356), "w"(v358)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v356), "w"(v362)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v356), "w"(v359)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v356), "w"(v357)); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v381), "w"(v369)); - svfloat32_t v384; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v383), "w"(v247)); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v385), "w"(v368)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v387), "w"(v247)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v390), "w"(v368)); - svfloat32_t v581; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v581) : "w"(v580), "w"(v565)); - svfloat32_t v582; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v582) : "w"(v565), "w"(v567)); - svfloat32_t v584; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v584) : "w"(v565), "w"(v571)); - svfloat32_t v586; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v565), "w"(v568)); - svfloat32_t v588; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v565), "w"(v566)); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v590), "w"(v578)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v592), "w"(v456)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v594), "w"(v577)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v596), "w"(v456)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v599), "w"(v577)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v373), "w"(v363)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v375), "w"(v364)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : 
"w"(v377), "w"(v364)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v379), "w"(v360)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v247)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v372), "w"(v382)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v372), "w"(v382)); - svfloat32_t v583; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v582), "w"(v572)); - svfloat32_t v585; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v584), "w"(v573)); - svfloat32_t v587; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v586), "w"(v573)); - svfloat32_t v589; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v589) : "w"(v588), "w"(v569)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v600), "w"(v456)); - svfloat32_t v603; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v581), "w"(v591)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v581), "w"(v591)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v380), "w"(v392)); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v374), "w"(v384)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v376), "w"(v386)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v378), "w"(v388)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v378), "w"(v388)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v376), "w"(v386)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v374), "w"(v384)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v380), "w"(v392)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v589), "w"(v601)); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v583), "w"(v593)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v585), "w"(v595)); - svfloat32_t v606; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v606) : "w"(v587), "w"(v597)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v587), "w"(v597)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v585), "w"(v595)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v583), "w"(v593)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v589), "w"(v601)); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v371, v356); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v356, v362); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v356, v359); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v356, v357); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v381, v369); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v383, v247); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v385, v368); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v387, v247); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v390, v368); + svfloat32_t v581 = svadd_f32_x(svptrue_b32(), v580, v565); + svfloat32_t v582 = svsub_f32_x(svptrue_b32(), v565, v567); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v565, v571); + svfloat32_t v586 = svsub_f32_x(svptrue_b32(), v565, v568); + svfloat32_t v588 = svadd_f32_x(svptrue_b32(), v565, v566); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v578); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v456); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v577); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v596, v456); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v599, v577); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v373, v363); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v375, v364); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v377, v364); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v379, v360); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, v247); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v372, v382); + 
svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v372, v382); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v582, v572); + svfloat32_t v585 = svadd_f32_x(svptrue_b32(), v584, v573); + svfloat32_t v587 = svsub_f32_x(svptrue_b32(), v586, v573); + svfloat32_t v589 = svsub_f32_x(svptrue_b32(), v588, v569); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v600, v456); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v380, v392); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v376, v386); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v378, v388); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v378, v388); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v376, v386); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v380, v392); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v589, v601); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v585, v595); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v587, v597); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v587, v597); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v585, v595); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v589, v601); svst1_f64(pred_full, (double *)(v1050), svreinterpret_f64_f32(v401)); svst1_f64(pred_full, (double *)(v1059), svreinterpret_f64_f32(v610)); svst1_f64(pred_full, (double *)(v1176), svreinterpret_f64_f32(v394)); @@ -14814,271 +13297,152 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v827)[0])); svfloat32_t v1127 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v836)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v32) : "w"(v1081), "w"(v1083)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1081), "w"(v1083)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v1087), "w"(v1089)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v1087), "w"(v1089)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1093), "w"(v1095)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1093), "w"(v1095)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v1099), "w"(v1101)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v1099), "w"(v1101)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1105), "w"(v1107)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1105), "w"(v1107)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v1111), "w"(v1113)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v1111), "w"(v1113)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1117), "w"(v1119)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v1117), "w"(v1119)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v1123), "w"(v1125)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v1123), "w"(v1125)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v1085)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v1091)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v1097)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v1103)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v1109)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v1115)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v176), "w"(v1121)); - 
svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v200), "w"(v1127)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v32), "w"(v128)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v32), "w"(v128)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v80), "w"(v176)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v80), "w"(v176)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v56), "w"(v152)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v56), "w"(v152)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v104), "w"(v200)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v104), "w"(v200)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v33), "w"(v129)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v33), "w"(v129)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v81), "w"(v177)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v81), "w"(v177)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v57), "w"(v153)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v57), "w"(v153)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v105), "w"(v201)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v105), "w"(v201)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v41), "w"(v137)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v41), "w"(v137)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v89), "w"(v185)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v89), "w"(v185)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v65), "w"(v161)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v65), 
"w"(v161)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v113), "w"(v209)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v113), "w"(v209)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v282), "w"(v284)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v282), "w"(v284)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v286), "w"(v288)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v286), "w"(v288)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v287), "w"(v289)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v287), "w"(v289)); - svfloat32_t zero331; - asm volatile("mov %0.s, #0" : "=w"(zero331)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1081, v1083); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1081, v1083); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v1087, v1089); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v1087, v1089); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1093, v1095); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1093, v1095); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v1099, v1101); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v1099, v1101); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1105, v1107); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1105, v1107); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v1117, v1119); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v1117, v1119); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v1123, v1125); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v1123, v1125); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v1085); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v1091); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v1097); + svfloat32_t v113 = 
svadd_f32_x(svptrue_b32(), v104, v1103); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v1109); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v1115); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v176, v1121); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v200, v1127); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v80, v176); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v80, v176); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v56, v152); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v56, v152); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v104, v200); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v104, v200); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v81, v177); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v81, v177); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v57, v153); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v57, v153); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v105, v201); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v105, v201); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v41, v137); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v41, v137); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v89, v185); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v89, v185); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v113, v209); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v113, v209); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v296 = 
svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t zero331 = svdup_n_f32(0); svfloat32_t v331 = svcmla_f32_x(pred_full, zero331, v852, v285, 90); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v354), "w"(v356)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v354), "w"(v356)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v358), "w"(v360)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v358), "w"(v360)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v359), "w"(v361)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v359), "w"(v361)); - svfloat32_t zero402; - asm volatile("mov %0.s, #0" : "=w"(zero402)); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t zero402 = svdup_n_f32(0); svfloat32_t v402 = svcmla_f32_x(pred_full, zero402, v859, v355, 90); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v210), "w"(v212)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v210), "w"(v212)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v214), "w"(v216)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v214), "w"(v216)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v215), "w"(v217)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v215), "w"(v217)); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v210, 
v212); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v215, v217); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v215, v217); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x(pred_full, zero259, v844, v213, 90); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v290), "w"(v292)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v290), "w"(v292)); - svfloat32_t zero319; - asm volatile("mov %0.s, #0" : "=w"(zero319)); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v290, v292); + svfloat32_t zero319 = svdup_n_f32(0); svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v852, v293, 90); - svfloat32_t zero338; - asm volatile("mov %0.s, #0" : "=w"(zero338)); + svfloat32_t zero338 = svdup_n_f32(0); svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v853, v296, 90); - svfloat32_t v343; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v297), "w"(v854)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v362), "w"(v364)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v362), "w"(v364)); - svfloat32_t zero390; - asm volatile("mov %0.s, #0" : "=w"(zero390)); + svfloat32_t v343 = svmul_f32_x(svptrue_b32(), v297, v854); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v362, v364); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v362, v364); + svfloat32_t zero390 = svdup_n_f32(0); svfloat32_t v390 = svcmla_f32_x(pred_full, zero390, v859, v363, 90); - svfloat32_t v412; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v368), "w"(v861)); - svfloat32_t zero419; - asm volatile("mov %0.s, #0" : "=w"(zero419)); + svfloat32_t v412 = svmul_f32_x(svptrue_b32(), v368, v861); + svfloat32_t zero419 = svdup_n_f32(0); svfloat32_t v419 = svcmla_f32_x(pred_full, zero419, v862, v369, 90); - svfloat32_t v222; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v218), "w"(v220)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v218), "w"(v220)); - svfloat32_t zero247; - asm volatile("mov %0.s, #0" : "=w"(zero247)); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v218, v220); + svfloat32_t zero247 = svdup_n_f32(0); svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v844, v221, 90); - svfloat32_t zero266; - asm volatile("mov %0.s, #0" : "=w"(zero266)); + svfloat32_t zero266 = svdup_n_f32(0); svfloat32_t v266 = svcmla_f32_x(pred_full, zero266, v845, v224, 90); svfloat32_t v344 = svmla_f32_x(pred_full, v319, v291, v851); svfloat32_t v345 = svnmls_f32_x(pred_full, v319, v291, v851); svfloat32_t v346 = svmla_f32_x(pred_full, v343, v283, v851); svfloat32_t v347 = svnmls_f32_x(pred_full, v343, v283, v851); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v331), "w"(v338)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v331), "w"(v338)); - svfloat32_t zero376; - asm volatile("mov %0.s, #0" : "=w"(zero376)); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v331, v338); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v331, v338); + svfloat32_t zero376 = svdup_n_f32(0); svfloat32_t v376 = svcmla_f32_x(pred_full, zero376, v859, v366, 90); - svfloat32_t zero383; - asm volatile("mov %0.s, #0" : "=w"(zero383)); + svfloat32_t zero383 = svdup_n_f32(0); svfloat32_t v383 = svcmla_f32_x(pred_full, zero383, v859, v367, 90); svfloat32_t v420 = svmla_f32_x(pred_full, v390, v365, v860); svfloat32_t v421 = svmls_f32_x(pred_full, v390, v365, v860); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v402), "w"(v419)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v402), "w"(v419)); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v402, v419); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v402, v419); svfloat32_t v424 = 
svmla_f32_x(pred_full, v412, v357, v860); svfloat32_t v425 = svnmls_f32_x(pred_full, v412, v357, v860); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v219), "w"(v247)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v219), "w"(v247)); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v219, v247); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v219, v247); svfloat32_t v274 = svmla_f32_x(pred_full, v211, v225, v846); svfloat32_t v275 = svmls_f32_x(pred_full, v211, v225, v846); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v259), "w"(v266)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v259), "w"(v266)); - svfloat32_t v350; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v346), "w"(v348)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v346), "w"(v348)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v347), "w"(v349)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v347), "w"(v349)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v422), "w"(v424)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v422), "w"(v424)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v423), "w"(v425)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v423), "w"(v425)); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v259, v266); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v259, v266); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v422, v424); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v422, v424); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v429 = 
svsub_f32_x(svptrue_b32(), v423, v425); svfloat32_t v430 = svmla_f32_x(pred_full, v222, v294, v851); svfloat32_t v526 = svmla_f32_x(pred_full, v223, v295, v851); svst1_f64(pred_full, (double *)(v870), svreinterpret_f64_f32(v222)); svst1_f64(pred_full, (double *)(v978), svreinterpret_f64_f32(v223)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v274), "w"(v276)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v274), "w"(v276)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v275), "w"(v277)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v275), "w"(v277)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v376)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v430), "w"(v376)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v273), "w"(v345)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v526), "w"(v383)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v526), "w"(v383)); - svfloat32_t v574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v272), "w"(v344)); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v376); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v430, v376); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v273, v345); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v526, v383); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v526, v383); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v272, v344); svst1_f64(pred_full, (double *)(v924), svreinterpret_f64_f32(v273)); svst1_f64(pred_full, (double *)(v1032), svreinterpret_f64_f32(v272)); - svfloat32_t v454; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v454) : 
"w"(v279), "w"(v351)); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v421)); - svfloat32_t v480; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v478), "w"(v421)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v280), "w"(v352)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v281), "w"(v353)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v574), "w"(v420)); - svfloat32_t v576; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v574), "w"(v420)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v278), "w"(v350)); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v279, v351); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v421); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v478, v421); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v280, v352); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v281, v353); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v574, v420); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v574, v420); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v278, v350); svst1_f64(pred_full, (double *)(v879), svreinterpret_f64_f32(v432)); svst1_f64(pred_full, (double *)(v888), svreinterpret_f64_f32(v431)); svst1_f64(pred_full, (double *)(v897), svreinterpret_f64_f32(v279)); @@ -15087,22 +13451,14 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu24(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v996), svreinterpret_f64_f32(v527)); svst1_f64(pred_full, (double *)(v1005), svreinterpret_f64_f32(v281)); svst1_f64(pred_full, (double *)(v1059), svreinterpret_f64_f32(v278)); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v454), "w"(v427)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v454), "w"(v427)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v428)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v502), "w"(v428)); 
- svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v550), "w"(v429)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v550), "w"(v429)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v598), "w"(v426)); - svfloat32_t v600; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v598), "w"(v426)); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v454, v427); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v454, v427); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v428); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v502, v428); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v550, v429); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v550, v429); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v426); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v598, v426); svst1_f64(pred_full, (double *)(v933), svreinterpret_f64_f32(v480)); svst1_f64(pred_full, (double *)(v942), svreinterpret_f64_f32(v479)); svst1_f64(pred_full, (double *)(v1041), svreinterpret_f64_f32(v576)); @@ -15132,7 +13488,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int64_t v12 = howmany - 1; int64_t v1701 = howmany / 2; for (int j = 0; j < v12; j += 2) { - float v916 = 0.0000000000000000e+00F; float v1030 = 9.6858316112863108e-01F; float v1034 = -2.4868988716485479e-01F; float v1035 = 2.4868988716485479e-01F; @@ -15166,7 +13521,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float v1689 = 2.0000000000000000e+00F; const float32x2_t *v3155 = &v5[istride]; float32x2_t *v3381 = &v6[ostride]; - float v919 = dir * v916; float32x2_t v1031 = (float32x2_t){v1030, v1030}; float32x2_t v1036 = (float32x2_t){v1034, v1035}; float32x2_t v1199 = (float32x2_t){v1198, v1198}; @@ -15191,8 +13545,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1690 = (float32x2_t){v1689, v1689}; const float32x2_t *v3110 = &v5[0]; float32x2_t *v3336 = 
&v6[0]; - float32x4_t v3566 = vld1q_f32((const float32_t *)v3155); - float32x2_t v917 = (float32x2_t){v916, v919}; + float32x4_t v3614 = vld1q_f32((const float32_t *)v3155); float32x4_t v1032 = vcombine_f32(v1031, v1031); float32x2_t v1038 = vmul_f32(v1662, v1036); float32x4_t v1200 = vcombine_f32(v1199, v1199); @@ -15261,8 +13614,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t *v3534 = &v6[ostride * 14]; float32x2_t *v3543 = &v6[ostride * 19]; float32x2_t *v3552 = &v6[ostride * 24]; - float32x4_t v3556 = vld1q_f32((const float32_t *)v3110); - float32x4_t v921 = vcombine_f32(v917, v917); + float32x4_t v3604 = vld1q_f32((const float32_t *)v3110); float32x4_t v1040 = vcombine_f32(v1038, v1038); float32x4_t v1208 = vcombine_f32(v1206, v1206); float32x4_t v1376 = vcombine_f32(v1374, v1374); @@ -15273,109 +13625,49 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1572 = vcombine_f32(v1570, v1570); float32x4_t v1593 = vcombine_f32(v1591, v1591); float32x4_t v1665 = vcombine_f32(v1663, v1663); - float32x4_t v3558 = vld1q_f32((const float32_t *)v3119); - float32x4_t v3560 = vld1q_f32((const float32_t *)v3128); - float32x4_t v3562 = vld1q_f32((const float32_t *)v3137); - float32x4_t v3564 = vld1q_f32((const float32_t *)v3146); - float32x4_t v3568 = vld1q_f32((const float32_t *)v3164); - float32x4_t v3570 = vld1q_f32((const float32_t *)v3173); - float32x4_t v3572 = vld1q_f32((const float32_t *)v3182); - float32x4_t v3574 = vld1q_f32((const float32_t *)v3191); - float32x4_t v3576 = vld1q_f32((const float32_t *)v3200); - float32x4_t v3578 = vld1q_f32((const float32_t *)v3209); - float32x4_t v3580 = vld1q_f32((const float32_t *)v3218); - float32x4_t v3582 = vld1q_f32((const float32_t *)v3227); - float32x4_t v3584 = vld1q_f32((const float32_t *)v3236); - float32x4_t v3586 = vld1q_f32((const float32_t *)v3245); - float32x4_t v3588 = vld1q_f32((const float32_t *)v3254); - float32x4_t 
v3590 = vld1q_f32((const float32_t *)v3263); - float32x4_t v3592 = vld1q_f32((const float32_t *)v3272); - float32x4_t v3594 = vld1q_f32((const float32_t *)v3281); - float32x4_t v3596 = vld1q_f32((const float32_t *)v3290); - float32x4_t v3598 = vld1q_f32((const float32_t *)v3299); - float32x4_t v3600 = vld1q_f32((const float32_t *)v3308); - float32x4_t v3602 = vld1q_f32((const float32_t *)v3317); - float32x4_t v3604 = vld1q_f32((const float32_t *)v3326); - float32x4_t v66 = vrev64q_f32(v3558); - float32x4_t v80 = vrev64q_f32(v3560); - float32x4_t v94 = vrev64q_f32(v3564); - float32x4_t v115 = vrev64q_f32(v3562); - float32x4_t v234 = vrev64q_f32(v3568); - float32x4_t v248 = vrev64q_f32(v3570); - float32x4_t v262 = vrev64q_f32(v3574); - float32x4_t v283 = vrev64q_f32(v3572); - float32x4_t v402 = vrev64q_f32(v3578); - float32x4_t v416 = vrev64q_f32(v3580); - float32x4_t v430 = vrev64q_f32(v3584); - float32x4_t v451 = vrev64q_f32(v3582); - float32x4_t v570 = vrev64q_f32(v3588); - float32x4_t v584 = vrev64q_f32(v3590); - float32x4_t v598 = vrev64q_f32(v3594); - float32x4_t v619 = vrev64q_f32(v3592); - float32x4_t v738 = vrev64q_f32(v3598); - float32x4_t v752 = vrev64q_f32(v3600); - float32x4_t v766 = vrev64q_f32(v3604); - float32x4_t v787 = vrev64q_f32(v3602); - float32x4_t v68 = vmulq_f32(v66, v921); - float32x4_t v82 = vmulq_f32(v80, v921); - float32x4_t v96 = vmulq_f32(v94, v921); - float32x4_t v117 = vmulq_f32(v115, v921); - float32x4_t v236 = vmulq_f32(v234, v921); - float32x4_t v250 = vmulq_f32(v248, v921); - float32x4_t v264 = vmulq_f32(v262, v921); - float32x4_t v285 = vmulq_f32(v283, v921); - float32x4_t v404 = vmulq_f32(v402, v921); - float32x4_t v418 = vmulq_f32(v416, v921); - float32x4_t v432 = vmulq_f32(v430, v921); - float32x4_t v453 = vmulq_f32(v451, v921); - float32x4_t v572 = vmulq_f32(v570, v921); - float32x4_t v586 = vmulq_f32(v584, v921); - float32x4_t v600 = vmulq_f32(v598, v921); - float32x4_t v621 = vmulq_f32(v619, v921); - float32x4_t v740 = 
vmulq_f32(v738, v921); - float32x4_t v754 = vmulq_f32(v752, v921); - float32x4_t v768 = vmulq_f32(v766, v921); - float32x4_t v789 = vmulq_f32(v787, v921); - float32x4_t v69 = vaddq_f32(v68, v3558); - float32x4_t v83 = vaddq_f32(v82, v3560); - float32x4_t v97 = vaddq_f32(v96, v3564); - float32x4_t v118 = vaddq_f32(v117, v3562); - float32x4_t v237 = vaddq_f32(v236, v3568); - float32x4_t v251 = vaddq_f32(v250, v3570); - float32x4_t v265 = vaddq_f32(v264, v3574); - float32x4_t v286 = vaddq_f32(v285, v3572); - float32x4_t v405 = vaddq_f32(v404, v3578); - float32x4_t v419 = vaddq_f32(v418, v3580); - float32x4_t v433 = vaddq_f32(v432, v3584); - float32x4_t v454 = vaddq_f32(v453, v3582); - float32x4_t v573 = vaddq_f32(v572, v3588); - float32x4_t v587 = vaddq_f32(v586, v3590); - float32x4_t v601 = vaddq_f32(v600, v3594); - float32x4_t v622 = vaddq_f32(v621, v3592); - float32x4_t v741 = vaddq_f32(v740, v3598); - float32x4_t v755 = vaddq_f32(v754, v3600); - float32x4_t v769 = vaddq_f32(v768, v3604); - float32x4_t v790 = vaddq_f32(v789, v3602); - float32x4_t v98 = vsubq_f32(v69, v97); - float32x4_t v103 = vmulq_f32(v69, v1691); - float32x4_t v119 = vsubq_f32(v83, v118); - float32x4_t v124 = vmulq_f32(v83, v1691); - float32x4_t v266 = vsubq_f32(v237, v265); - float32x4_t v271 = vmulq_f32(v237, v1691); - float32x4_t v287 = vsubq_f32(v251, v286); - float32x4_t v292 = vmulq_f32(v251, v1691); - float32x4_t v434 = vsubq_f32(v405, v433); - float32x4_t v439 = vmulq_f32(v405, v1691); - float32x4_t v455 = vsubq_f32(v419, v454); - float32x4_t v460 = vmulq_f32(v419, v1691); - float32x4_t v602 = vsubq_f32(v573, v601); - float32x4_t v607 = vmulq_f32(v573, v1691); - float32x4_t v623 = vsubq_f32(v587, v622); - float32x4_t v628 = vmulq_f32(v587, v1691); - float32x4_t v770 = vsubq_f32(v741, v769); - float32x4_t v775 = vmulq_f32(v741, v1691); - float32x4_t v791 = vsubq_f32(v755, v790); - float32x4_t v796 = vmulq_f32(v755, v1691); + float32x4_t v3606 = vld1q_f32((const float32_t *)v3119); + 
float32x4_t v3608 = vld1q_f32((const float32_t *)v3128); + float32x4_t v3610 = vld1q_f32((const float32_t *)v3137); + float32x4_t v3612 = vld1q_f32((const float32_t *)v3146); + float32x4_t v3616 = vld1q_f32((const float32_t *)v3164); + float32x4_t v3618 = vld1q_f32((const float32_t *)v3173); + float32x4_t v3620 = vld1q_f32((const float32_t *)v3182); + float32x4_t v3622 = vld1q_f32((const float32_t *)v3191); + float32x4_t v3624 = vld1q_f32((const float32_t *)v3200); + float32x4_t v3626 = vld1q_f32((const float32_t *)v3209); + float32x4_t v3628 = vld1q_f32((const float32_t *)v3218); + float32x4_t v3630 = vld1q_f32((const float32_t *)v3227); + float32x4_t v3632 = vld1q_f32((const float32_t *)v3236); + float32x4_t v3634 = vld1q_f32((const float32_t *)v3245); + float32x4_t v3636 = vld1q_f32((const float32_t *)v3254); + float32x4_t v3638 = vld1q_f32((const float32_t *)v3263); + float32x4_t v3640 = vld1q_f32((const float32_t *)v3272); + float32x4_t v3642 = vld1q_f32((const float32_t *)v3281); + float32x4_t v3644 = vld1q_f32((const float32_t *)v3290); + float32x4_t v3646 = vld1q_f32((const float32_t *)v3299); + float32x4_t v3648 = vld1q_f32((const float32_t *)v3308); + float32x4_t v3650 = vld1q_f32((const float32_t *)v3317); + float32x4_t v3652 = vld1q_f32((const float32_t *)v3326); + float32x4_t v98 = vsubq_f32(v3606, v3612); + float32x4_t v103 = vmulq_f32(v3606, v1691); + float32x4_t v119 = vsubq_f32(v3608, v3610); + float32x4_t v124 = vmulq_f32(v3608, v1691); + float32x4_t v266 = vsubq_f32(v3616, v3622); + float32x4_t v271 = vmulq_f32(v3616, v1691); + float32x4_t v287 = vsubq_f32(v3618, v3620); + float32x4_t v292 = vmulq_f32(v3618, v1691); + float32x4_t v434 = vsubq_f32(v3626, v3632); + float32x4_t v439 = vmulq_f32(v3626, v1691); + float32x4_t v455 = vsubq_f32(v3628, v3630); + float32x4_t v460 = vmulq_f32(v3628, v1691); + float32x4_t v602 = vsubq_f32(v3636, v3642); + float32x4_t v607 = vmulq_f32(v3636, v1691); + float32x4_t v623 = vsubq_f32(v3638, v3640); + float32x4_t 
v628 = vmulq_f32(v3638, v1691); + float32x4_t v770 = vsubq_f32(v3646, v3652); + float32x4_t v775 = vmulq_f32(v3646, v1691); + float32x4_t v791 = vsubq_f32(v3648, v3650); + float32x4_t v796 = vmulq_f32(v3648, v1691); float32x4_t v104 = vsubq_f32(v103, v98); float32x4_t v125 = vsubq_f32(v124, v119); float32x4_t v138 = vmulq_f32(v119, v1632); @@ -15418,48 +13710,48 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v829 = vsubq_f32(v828, v791); float32x4_t v132 = vmulq_f32(v126, v1608); float32x4_t v144 = vmulq_f32(v127, v1620); - float32x4_t v158 = vaddq_f32(v3556, v126); + float32x4_t v158 = vaddq_f32(v3604, v126); float32x4_t v164 = vrev64q_f32(v139); float32x4_t v173 = vrev64q_f32(v157); float32x4_t v300 = vmulq_f32(v294, v1608); float32x4_t v312 = vmulq_f32(v295, v1620); - float32x4_t v326 = vaddq_f32(v3566, v294); + float32x4_t v326 = vaddq_f32(v3614, v294); float32x4_t v332 = vrev64q_f32(v307); float32x4_t v341 = vrev64q_f32(v325); float32x4_t v468 = vmulq_f32(v462, v1608); float32x4_t v480 = vmulq_f32(v463, v1620); - float32x4_t v494 = vaddq_f32(v3576, v462); + float32x4_t v494 = vaddq_f32(v3624, v462); float32x4_t v500 = vrev64q_f32(v475); float32x4_t v509 = vrev64q_f32(v493); float32x4_t v636 = vmulq_f32(v630, v1608); float32x4_t v648 = vmulq_f32(v631, v1620); - float32x4_t v662 = vaddq_f32(v3586, v630); + float32x4_t v662 = vaddq_f32(v3634, v630); float32x4_t v668 = vrev64q_f32(v643); float32x4_t v677 = vrev64q_f32(v661); float32x4_t v804 = vmulq_f32(v798, v1608); float32x4_t v816 = vmulq_f32(v799, v1620); - float32x4_t v830 = vaddq_f32(v3596, v798); + float32x4_t v830 = vaddq_f32(v3644, v798); float32x4_t v836 = vrev64q_f32(v811); float32x4_t v845 = vrev64q_f32(v829); - float32x4_t v133 = vsubq_f32(v3556, v132); + float32x4_t v133 = vsubq_f32(v3604, v132); float32x4_t v166 = vmulq_f32(v164, v1665); float32x4_t v175 = vmulq_f32(v173, v1665); - float32x4_t v301 = vsubq_f32(v3566, v300); + float32x4_t v301 = 
vsubq_f32(v3614, v300); float32x4_t v334 = vmulq_f32(v332, v1665); float32x4_t v343 = vmulq_f32(v341, v1665); - float32x4_t v469 = vsubq_f32(v3576, v468); + float32x4_t v469 = vsubq_f32(v3624, v468); float32x4_t v502 = vmulq_f32(v500, v1665); float32x4_t v511 = vmulq_f32(v509, v1665); - float32x4_t v637 = vsubq_f32(v3586, v636); + float32x4_t v637 = vsubq_f32(v3634, v636); float32x4_t v670 = vmulq_f32(v668, v1665); float32x4_t v679 = vmulq_f32(v677, v1665); - float32x4_t v805 = vsubq_f32(v3596, v804); + float32x4_t v805 = vsubq_f32(v3644, v804); float32x4_t v838 = vmulq_f32(v836, v1665); float32x4_t v847 = vmulq_f32(v845, v1665); - float32x4_t v871 = vrev64q_f32(v326); - float32x4_t v885 = vrev64q_f32(v494); - float32x4_t v899 = vrev64q_f32(v830); - float32x4_t v920 = vrev64q_f32(v662); + float32x4_t v903 = vsubq_f32(v326, v830); + float32x4_t v908 = vmulq_f32(v326, v1691); + float32x4_t v924 = vsubq_f32(v494, v662); + float32x4_t v929 = vmulq_f32(v494, v1691); float32x4_t v145 = vsubq_f32(v133, v144); float32x4_t v150 = vmulq_f32(v133, v1691); float32x4_t v313 = vsubq_f32(v301, v312); @@ -15470,10 +13762,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v654 = vmulq_f32(v637, v1691); float32x4_t v817 = vsubq_f32(v805, v816); float32x4_t v822 = vmulq_f32(v805, v1691); - float32x4_t v873 = vmulq_f32(v871, v921); - float32x4_t v887 = vmulq_f32(v885, v921); - float32x4_t v901 = vmulq_f32(v899, v921); - float32x4_t v922 = vmulq_f32(v920, v921); + float32x4_t v909 = vsubq_f32(v908, v903); + float32x4_t v930 = vsubq_f32(v929, v924); + float32x4_t v943 = vmulq_f32(v924, v1632); + float32x4_t v961 = vmulq_f32(v903, v1632); float32x4_t v151 = vsubq_f32(v150, v145); float32x4_t v176 = vsubq_f32(v145, v175); float32x4_t v181 = vmulq_f32(v145, v1691); @@ -15489,10 +13781,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v823 = vsubq_f32(v822, v817); float32x4_t v848 = vsubq_f32(v817, 
v847); float32x4_t v853 = vmulq_f32(v817, v1691); - float32x4_t v874 = vaddq_f32(v873, v326); - float32x4_t v888 = vaddq_f32(v887, v494); - float32x4_t v902 = vaddq_f32(v901, v830); - float32x4_t v923 = vaddq_f32(v922, v662); + float32x4_t v931 = vaddq_f32(v909, v930); + float32x4_t v932 = vsubq_f32(v909, v930); + float32x4_t v944 = vaddq_f32(v903, v943); + float32x4_t v962 = vsubq_f32(v961, v924); float32x4_t v167 = vsubq_f32(v151, v166); float32x4_t v182 = vsubq_f32(v181, v176); float32x4_t v187 = vmulq_f32(v151, v1691); @@ -15508,10 +13800,11 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v839 = vsubq_f32(v823, v838); float32x4_t v854 = vsubq_f32(v853, v848); float32x4_t v859 = vmulq_f32(v823, v1691); - float32x4_t v903 = vsubq_f32(v874, v902); - float32x4_t v908 = vmulq_f32(v874, v1691); - float32x4_t v924 = vsubq_f32(v888, v923); - float32x4_t v929 = vmulq_f32(v888, v1691); + float32x4_t v937 = vmulq_f32(v931, v1608); + float32x4_t v949 = vmulq_f32(v932, v1620); + float32x4_t v963 = vaddq_f32(v158, v931); + float32x4_t v976 = vrev64q_f32(v944); + float32x4_t v992 = vrev64q_f32(v962); float32x4_t v1207 = vrev64q_f32(v344); float32x4_t v1221 = vrev64q_f32(v512); float32x4_t v1235 = vrev64q_f32(v848); @@ -15521,10 +13814,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v524 = vsubq_f32(v523, v503); float32x4_t v692 = vsubq_f32(v691, v671); float32x4_t v860 = vsubq_f32(v859, v839); - float32x4_t v909 = vsubq_f32(v908, v903); - float32x4_t v930 = vsubq_f32(v929, v924); - float32x4_t v943 = vmulq_f32(v924, v1632); - float32x4_t v961 = vmulq_f32(v903, v1632); + float32x4_t v938 = vsubq_f32(v158, v937); + float32x4_t v978 = vmulq_f32(v976, v1665); + float32x4_t v994 = vmulq_f32(v992, v1665); float32x4_t v1039 = vrev64q_f32(v335); float32x4_t v1053 = vrev64q_f32(v503); float32x4_t v1067 = vrev64q_f32(v839); @@ -15537,10 +13829,9 @@ void 
armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1389 = vrev64q_f32(v518); float32x4_t v1403 = vrev64q_f32(v854); float32x4_t v1424 = vrev64q_f32(v686); - float32x4_t v931 = vaddq_f32(v909, v930); - float32x4_t v932 = vsubq_f32(v909, v930); - float32x4_t v944 = vaddq_f32(v903, v943); - float32x4_t v962 = vsubq_f32(v961, v924); + vst1q_f32((float32_t *)v3336, v963); + float32x4_t v950 = vsubq_f32(v938, v949); + float32x4_t v955 = vmulq_f32(v938, v1691); float32x4_t v1041 = vmulq_f32(v1039, v1040); float32x4_t v1055 = vmulq_f32(v1053, v1208); float32x4_t v1069 = vmulq_f32(v1067, v1544); @@ -15557,11 +13848,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1557 = vrev64q_f32(v524); float32x4_t v1571 = vrev64q_f32(v860); float32x4_t v1592 = vrev64q_f32(v692); - float32x4_t v937 = vmulq_f32(v931, v1608); - float32x4_t v949 = vmulq_f32(v932, v1620); - float32x4_t v963 = vaddq_f32(v158, v931); - float32x4_t v976 = vrev64q_f32(v944); - float32x4_t v992 = vrev64q_f32(v962); + float32x4_t v956 = vsubq_f32(v955, v950); + float32x4_t v995 = vsubq_f32(v950, v994); + float32x4_t v1007 = vmulq_f32(v950, v1691); float32x4_t v1042 = vfmaq_f32(v1041, v335, v1032); float32x4_t v1056 = vfmaq_f32(v1055, v503, v1200); float32x4_t v1070 = vfmaq_f32(v1069, v839, v1536); @@ -15578,9 +13867,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1559 = vmulq_f32(v1557, v1558); float32x4_t v1573 = vmulq_f32(v1571, v1572); float32x4_t v1594 = vmulq_f32(v1592, v1593); - float32x4_t v938 = vsubq_f32(v158, v937); - float32x4_t v978 = vmulq_f32(v976, v1665); - float32x4_t v994 = vmulq_f32(v992, v1665); + float32x4_t v979 = vsubq_f32(v956, v978); + float32x4_t v1008 = vsubq_f32(v1007, v995); + float32x4_t v1020 = vmulq_f32(v956, v1691); float32x4_t v1071 = vsubq_f32(v1042, v1070); float32x4_t v1076 = vmulq_f32(v1042, v1691); float32x4_t v1092 = vsubq_f32(v1056, v1091); @@ 
-15597,9 +13886,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1560 = vfmaq_f32(v1559, v524, v1550); float32x4_t v1574 = vfmaq_f32(v1573, v860, v1564); float32x4_t v1595 = vfmaq_f32(v1594, v692, v1585); - vst1q_f32((float32_t *)v3336, v963); - float32x4_t v950 = vsubq_f32(v938, v949); - float32x4_t v955 = vmulq_f32(v938, v1691); + vst1q_f32((float32_t *)v3354, v995); + float32x4_t v1021 = vsubq_f32(v1020, v979); float32x4_t v1077 = vsubq_f32(v1076, v1071); float32x4_t v1098 = vsubq_f32(v1097, v1092); float32x4_t v1111 = vmulq_f32(v1092, v1632); @@ -15616,9 +13904,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1580 = vmulq_f32(v1546, v1691); float32x4_t v1596 = vsubq_f32(v1560, v1595); float32x4_t v1601 = vmulq_f32(v1560, v1691); - float32x4_t v956 = vsubq_f32(v955, v950); - float32x4_t v995 = vsubq_f32(v950, v994); - float32x4_t v1007 = vmulq_f32(v950, v1691); + vst1q_f32((float32_t *)v3345, v979); + vst1q_f32((float32_t *)v3363, v1008); float32x4_t v1099 = vaddq_f32(v1077, v1098); float32x4_t v1100 = vsubq_f32(v1077, v1098); float32x4_t v1112 = vaddq_f32(v1071, v1111); @@ -15636,9 +13923,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1602 = vsubq_f32(v1601, v1596); float32x4_t v1615 = vmulq_f32(v1596, v1632); float32x4_t v1633 = vmulq_f32(v1575, v1632); - float32x4_t v979 = vsubq_f32(v956, v978); - float32x4_t v1008 = vsubq_f32(v1007, v995); - float32x4_t v1020 = vmulq_f32(v956, v1691); + vst1q_f32((float32_t *)v3372, v1021); float32x4_t v1105 = vmulq_f32(v1099, v1608); float32x4_t v1117 = vmulq_f32(v1100, v1620); float32x4_t v1131 = vaddq_f32(v167, v1099); @@ -15656,9 +13941,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1604 = vsubq_f32(v1581, v1602); float32x4_t v1616 = vaddq_f32(v1575, v1615); float32x4_t v1634 = vsubq_f32(v1633, v1596); - vst1q_f32((float32_t 
*)v3354, v995); vst1q_f32((float32_t *)v3426, v1299); - float32x4_t v1021 = vsubq_f32(v1020, v979); float32x4_t v1106 = vsubq_f32(v167, v1105); float32x4_t v1146 = vmulq_f32(v1144, v1665); float32x4_t v1162 = vmulq_f32(v1160, v1665); @@ -15672,8 +13955,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1635 = vaddq_f32(v188, v1603); float32x4_t v1648 = vrev64q_f32(v1616); float32x4_t v1664 = vrev64q_f32(v1634); - vst1q_f32((float32_t *)v3345, v979); - vst1q_f32((float32_t *)v3363, v1008); vst1q_f32((float32_t *)v3381, v1131); vst1q_f32((float32_t *)v3471, v1467); float32x4_t v1118 = vsubq_f32(v1106, v1117); @@ -15686,7 +13967,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1610 = vsubq_f32(v188, v1609); float32x4_t v1650 = vmulq_f32(v1648, v1665); float32x4_t v1666 = vmulq_f32(v1664, v1665); - vst1q_f32((float32_t *)v3372, v1021); vst1q_f32((float32_t *)v3516, v1635); float32x4_t v1124 = vsubq_f32(v1123, v1118); float32x4_t v1163 = vsubq_f32(v1118, v1162); @@ -15736,7 +14016,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, } for (int j = v1701 * 2; j < howmany; j += 1) { float32x2_t v1852 = v5[istride]; - float v2451 = 0.0000000000000000e+00F; float v2544 = 9.6858316112863108e-01F; float v2547 = -2.4868988716485479e-01F; float v2548 = 2.4868988716485479e-01F; @@ -15769,7 +14048,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3069 = (float32x2_t){v4, v4}; float v3090 = 2.0000000000000000e+00F; float32x2_t v1713 = v5[0]; - float v2454 = dir * v2451; float32x2_t v2545 = (float32x2_t){v2544, v2544}; float32x2_t v2549 = (float32x2_t){v2547, v2548}; float32x2_t v2684 = (float32x2_t){v2683, v2683}; @@ -15815,7 +14093,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2279 = v5[istride * 14]; float32x2_t v2284 = v5[istride * 19]; float32x2_t v2289 = 
v5[istride * 24]; - float32x2_t v2452 = (float32x2_t){v2451, v2454}; float32x2_t v2551 = vmul_f32(v3069, v2549); float32x2_t v2690 = vmul_f32(v3069, v2688); float32x2_t v2829 = vmul_f32(v3069, v2827); @@ -15826,86 +14103,26 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2992 = vmul_f32(v3069, v2990); float32x2_t v3010 = vmul_f32(v3069, v3008); float32x2_t v3070 = vmul_f32(v3069, v3068); - float32x2_t v1743 = vrev64_f32(v1718); - float32x2_t v1755 = vrev64_f32(v1723); - float32x2_t v1767 = vrev64_f32(v1733); - float32x2_t v1785 = vrev64_f32(v1728); - float32x2_t v1882 = vrev64_f32(v1857); - float32x2_t v1894 = vrev64_f32(v1862); - float32x2_t v1906 = vrev64_f32(v1872); - float32x2_t v1924 = vrev64_f32(v1867); - float32x2_t v2021 = vrev64_f32(v1996); - float32x2_t v2033 = vrev64_f32(v2001); - float32x2_t v2045 = vrev64_f32(v2011); - float32x2_t v2063 = vrev64_f32(v2006); - float32x2_t v2160 = vrev64_f32(v2135); - float32x2_t v2172 = vrev64_f32(v2140); - float32x2_t v2184 = vrev64_f32(v2150); - float32x2_t v2202 = vrev64_f32(v2145); - float32x2_t v2299 = vrev64_f32(v2274); - float32x2_t v2311 = vrev64_f32(v2279); - float32x2_t v2323 = vrev64_f32(v2289); - float32x2_t v2341 = vrev64_f32(v2284); - float32x2_t v1744 = vmul_f32(v1743, v2452); - float32x2_t v1756 = vmul_f32(v1755, v2452); - float32x2_t v1768 = vmul_f32(v1767, v2452); - float32x2_t v1786 = vmul_f32(v1785, v2452); - float32x2_t v1883 = vmul_f32(v1882, v2452); - float32x2_t v1895 = vmul_f32(v1894, v2452); - float32x2_t v1907 = vmul_f32(v1906, v2452); - float32x2_t v1925 = vmul_f32(v1924, v2452); - float32x2_t v2022 = vmul_f32(v2021, v2452); - float32x2_t v2034 = vmul_f32(v2033, v2452); - float32x2_t v2046 = vmul_f32(v2045, v2452); - float32x2_t v2064 = vmul_f32(v2063, v2452); - float32x2_t v2161 = vmul_f32(v2160, v2452); - float32x2_t v2173 = vmul_f32(v2172, v2452); - float32x2_t v2185 = vmul_f32(v2184, v2452); - float32x2_t v2203 = vmul_f32(v2202, v2452); - 
float32x2_t v2300 = vmul_f32(v2299, v2452); - float32x2_t v2312 = vmul_f32(v2311, v2452); - float32x2_t v2324 = vmul_f32(v2323, v2452); - float32x2_t v2342 = vmul_f32(v2341, v2452); - float32x2_t v1745 = vadd_f32(v1744, v1718); - float32x2_t v1757 = vadd_f32(v1756, v1723); - float32x2_t v1769 = vadd_f32(v1768, v1733); - float32x2_t v1787 = vadd_f32(v1786, v1728); - float32x2_t v1884 = vadd_f32(v1883, v1857); - float32x2_t v1896 = vadd_f32(v1895, v1862); - float32x2_t v1908 = vadd_f32(v1907, v1872); - float32x2_t v1926 = vadd_f32(v1925, v1867); - float32x2_t v2023 = vadd_f32(v2022, v1996); - float32x2_t v2035 = vadd_f32(v2034, v2001); - float32x2_t v2047 = vadd_f32(v2046, v2011); - float32x2_t v2065 = vadd_f32(v2064, v2006); - float32x2_t v2162 = vadd_f32(v2161, v2135); - float32x2_t v2174 = vadd_f32(v2173, v2140); - float32x2_t v2186 = vadd_f32(v2185, v2150); - float32x2_t v2204 = vadd_f32(v2203, v2145); - float32x2_t v2301 = vadd_f32(v2300, v2274); - float32x2_t v2313 = vadd_f32(v2312, v2279); - float32x2_t v2325 = vadd_f32(v2324, v2289); - float32x2_t v2343 = vadd_f32(v2342, v2284); - float32x2_t v1770 = vsub_f32(v1745, v1769); - float32x2_t v1774 = vmul_f32(v1745, v3091); - float32x2_t v1788 = vsub_f32(v1757, v1787); - float32x2_t v1792 = vmul_f32(v1757, v3091); - float32x2_t v1909 = vsub_f32(v1884, v1908); - float32x2_t v1913 = vmul_f32(v1884, v3091); - float32x2_t v1927 = vsub_f32(v1896, v1926); - float32x2_t v1931 = vmul_f32(v1896, v3091); - float32x2_t v2048 = vsub_f32(v2023, v2047); - float32x2_t v2052 = vmul_f32(v2023, v3091); - float32x2_t v2066 = vsub_f32(v2035, v2065); - float32x2_t v2070 = vmul_f32(v2035, v3091); - float32x2_t v2187 = vsub_f32(v2162, v2186); - float32x2_t v2191 = vmul_f32(v2162, v3091); - float32x2_t v2205 = vsub_f32(v2174, v2204); - float32x2_t v2209 = vmul_f32(v2174, v3091); - float32x2_t v2326 = vsub_f32(v2301, v2325); - float32x2_t v2330 = vmul_f32(v2301, v3091); - float32x2_t v2344 = vsub_f32(v2313, v2343); - float32x2_t v2348 = 
vmul_f32(v2313, v3091); + float32x2_t v1770 = vsub_f32(v1718, v1733); + float32x2_t v1774 = vmul_f32(v1718, v3091); + float32x2_t v1788 = vsub_f32(v1723, v1728); + float32x2_t v1792 = vmul_f32(v1723, v3091); + float32x2_t v1909 = vsub_f32(v1857, v1872); + float32x2_t v1913 = vmul_f32(v1857, v3091); + float32x2_t v1927 = vsub_f32(v1862, v1867); + float32x2_t v1931 = vmul_f32(v1862, v3091); + float32x2_t v2048 = vsub_f32(v1996, v2011); + float32x2_t v2052 = vmul_f32(v1996, v3091); + float32x2_t v2066 = vsub_f32(v2001, v2006); + float32x2_t v2070 = vmul_f32(v2001, v3091); + float32x2_t v2187 = vsub_f32(v2135, v2150); + float32x2_t v2191 = vmul_f32(v2135, v3091); + float32x2_t v2205 = vsub_f32(v2140, v2145); + float32x2_t v2209 = vmul_f32(v2140, v3091); + float32x2_t v2326 = vsub_f32(v2274, v2289); + float32x2_t v2330 = vmul_f32(v2274, v3091); + float32x2_t v2344 = vsub_f32(v2279, v2284); + float32x2_t v2348 = vmul_f32(v2279, v3091); float32x2_t v1775 = vsub_f32(v1774, v1770); float32x2_t v1793 = vsub_f32(v1792, v1788); float32x2_t v1804 = vmul_f32(v1788, v3044); @@ -15986,10 +14203,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2356 = vsub_f32(v2269, v2355); float32x2_t v2384 = vmul_f32(v2383, v3070); float32x2_t v2392 = vmul_f32(v2391, v3070); - float32x2_t v2413 = vrev64_f32(v1960); - float32x2_t v2425 = vrev64_f32(v2099); - float32x2_t v2437 = vrev64_f32(v2377); - float32x2_t v2455 = vrev64_f32(v2238); + float32x2_t v2440 = vsub_f32(v1960, v2377); + float32x2_t v2444 = vmul_f32(v1960, v3091); + float32x2_t v2458 = vsub_f32(v2099, v2238); + float32x2_t v2462 = vmul_f32(v2099, v3091); float32x2_t v1810 = vsub_f32(v1800, v1809); float32x2_t v1814 = vmul_f32(v1800, v3091); float32x2_t v1949 = vsub_f32(v1939, v1948); @@ -16000,10 +14217,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2231 = vmul_f32(v2217, v3091); float32x2_t v2366 = vsub_f32(v2356, v2365); float32x2_t 
v2370 = vmul_f32(v2356, v3091); - float32x2_t v2414 = vmul_f32(v2413, v2452); - float32x2_t v2426 = vmul_f32(v2425, v2452); - float32x2_t v2438 = vmul_f32(v2437, v2452); - float32x2_t v2456 = vmul_f32(v2455, v2452); + float32x2_t v2445 = vsub_f32(v2444, v2440); + float32x2_t v2463 = vsub_f32(v2462, v2458); + float32x2_t v2474 = vmul_f32(v2458, v3044); + float32x2_t v2489 = vmul_f32(v2440, v3044); float32x2_t v1815 = vsub_f32(v1814, v1810); float32x2_t v1837 = vsub_f32(v1810, v1836); float32x2_t v1841 = vmul_f32(v1810, v3091); @@ -16019,10 +14236,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2371 = vsub_f32(v2370, v2366); float32x2_t v2393 = vsub_f32(v2366, v2392); float32x2_t v2397 = vmul_f32(v2366, v3091); - float32x2_t v2415 = vadd_f32(v2414, v1960); - float32x2_t v2427 = vadd_f32(v2426, v2099); - float32x2_t v2439 = vadd_f32(v2438, v2377); - float32x2_t v2457 = vadd_f32(v2456, v2238); + float32x2_t v2464 = vadd_f32(v2445, v2463); + float32x2_t v2465 = vsub_f32(v2445, v2463); + float32x2_t v2475 = vadd_f32(v2440, v2474); + float32x2_t v2490 = vsub_f32(v2489, v2458); float32x2_t v1829 = vsub_f32(v1815, v1828); float32x2_t v1842 = vsub_f32(v1841, v1837); float32x2_t v1846 = vmul_f32(v1815, v3091); @@ -16038,10 +14255,11 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2385 = vsub_f32(v2371, v2384); float32x2_t v2398 = vsub_f32(v2397, v2393); float32x2_t v2402 = vmul_f32(v2371, v3091); - float32x2_t v2440 = vsub_f32(v2415, v2439); - float32x2_t v2444 = vmul_f32(v2415, v3091); - float32x2_t v2458 = vsub_f32(v2427, v2457); - float32x2_t v2462 = vmul_f32(v2427, v3091); + float32x2_t v2469 = vmul_f32(v2464, v3024); + float32x2_t v2479 = vmul_f32(v2465, v3034); + float32x2_t v2491 = vadd_f32(v1821, v2464); + float32x2_t v2502 = vrev64_f32(v2475); + float32x2_t v2515 = vrev64_f32(v2490); float32x2_t v2691 = vrev64_f32(v1976); float32x2_t v2703 = vrev64_f32(v2115); float32x2_t 
v2715 = vrev64_f32(v2393); @@ -16051,10 +14269,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2125 = vsub_f32(v2124, v2107); float32x2_t v2264 = vsub_f32(v2263, v2246); float32x2_t v2403 = vsub_f32(v2402, v2385); - float32x2_t v2445 = vsub_f32(v2444, v2440); - float32x2_t v2463 = vsub_f32(v2462, v2458); - float32x2_t v2474 = vmul_f32(v2458, v3044); - float32x2_t v2489 = vmul_f32(v2440, v3044); + float32x2_t v2470 = vsub_f32(v1821, v2469); + v6[0] = v2491; + float32x2_t v2503 = vmul_f32(v2502, v3070); + float32x2_t v2516 = vmul_f32(v2515, v3070); float32x2_t v2552 = vrev64_f32(v1968); float32x2_t v2564 = vrev64_f32(v2107); float32x2_t v2576 = vrev64_f32(v2385); @@ -16067,10 +14285,8 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2842 = vrev64_f32(v2120); float32x2_t v2854 = vrev64_f32(v2398); float32x2_t v2872 = vrev64_f32(v2259); - float32x2_t v2464 = vadd_f32(v2445, v2463); - float32x2_t v2465 = vsub_f32(v2445, v2463); - float32x2_t v2475 = vadd_f32(v2440, v2474); - float32x2_t v2490 = vsub_f32(v2489, v2458); + float32x2_t v2480 = vsub_f32(v2470, v2479); + float32x2_t v2484 = vmul_f32(v2470, v3091); float32x2_t v2553 = vmul_f32(v2552, v2551); float32x2_t v2565 = vmul_f32(v2564, v2690); float32x2_t v2577 = vmul_f32(v2576, v2968); @@ -16087,11 +14303,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2981 = vrev64_f32(v2125); float32x2_t v2993 = vrev64_f32(v2403); float32x2_t v3011 = vrev64_f32(v2264); - float32x2_t v2469 = vmul_f32(v2464, v3024); - float32x2_t v2479 = vmul_f32(v2465, v3034); - float32x2_t v2491 = vadd_f32(v1821, v2464); - float32x2_t v2502 = vrev64_f32(v2475); - float32x2_t v2515 = vrev64_f32(v2490); + float32x2_t v2485 = vsub_f32(v2484, v2480); + float32x2_t v2517 = vsub_f32(v2480, v2516); + float32x2_t v2526 = vmul_f32(v2480, v3091); float32x2_t v2554 = vfma_f32(v2553, v1968, v2545); float32x2_t v2566 = 
vfma_f32(v2565, v2107, v2684); float32x2_t v2578 = vfma_f32(v2577, v2385, v2962); @@ -16108,10 +14322,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2982 = vmul_f32(v2981, v2980); float32x2_t v2994 = vmul_f32(v2993, v2992); float32x2_t v3012 = vmul_f32(v3011, v3010); - float32x2_t v2470 = vsub_f32(v1821, v2469); - v6[0] = v2491; - float32x2_t v2503 = vmul_f32(v2502, v3070); - float32x2_t v2516 = vmul_f32(v2515, v3070); + float32x2_t v2504 = vsub_f32(v2485, v2503); + v6[ostride * 10] = v2517; + float32x2_t v2527 = vsub_f32(v2526, v2517); + float32x2_t v2536 = vmul_f32(v2485, v3091); float32x2_t v2579 = vsub_f32(v2554, v2578); float32x2_t v2583 = vmul_f32(v2554, v3091); float32x2_t v2597 = vsub_f32(v2566, v2596); @@ -16128,8 +14342,9 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2983 = vfma_f32(v2982, v2125, v2974); float32x2_t v2995 = vfma_f32(v2994, v2403, v2986); float32x2_t v3013 = vfma_f32(v3012, v2264, v3004); - float32x2_t v2480 = vsub_f32(v2470, v2479); - float32x2_t v2484 = vmul_f32(v2470, v3091); + v6[ostride * 5] = v2504; + v6[ostride * 15] = v2527; + float32x2_t v2537 = vsub_f32(v2536, v2504); float32x2_t v2584 = vsub_f32(v2583, v2579); float32x2_t v2602 = vsub_f32(v2601, v2597); float32x2_t v2613 = vmul_f32(v2597, v3044); @@ -16146,9 +14361,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3000 = vmul_f32(v2971, v3091); float32x2_t v3014 = vsub_f32(v2983, v3013); float32x2_t v3018 = vmul_f32(v2983, v3091); - float32x2_t v2485 = vsub_f32(v2484, v2480); - float32x2_t v2517 = vsub_f32(v2480, v2516); - float32x2_t v2526 = vmul_f32(v2480, v3091); + v6[ostride * 20] = v2537; float32x2_t v2603 = vadd_f32(v2584, v2602); float32x2_t v2604 = vsub_f32(v2584, v2602); float32x2_t v2614 = vadd_f32(v2579, v2613); @@ -16166,10 +14379,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, 
float32x2_t v3019 = vsub_f32(v3018, v3014); float32x2_t v3030 = vmul_f32(v3014, v3044); float32x2_t v3045 = vmul_f32(v2996, v3044); - float32x2_t v2504 = vsub_f32(v2485, v2503); - v6[ostride * 10] = v2517; - float32x2_t v2527 = vsub_f32(v2526, v2517); - float32x2_t v2536 = vmul_f32(v2485, v3091); float32x2_t v2608 = vmul_f32(v2603, v3024); float32x2_t v2618 = vmul_f32(v2604, v3034); float32x2_t v2630 = vadd_f32(v1829, v2603); @@ -16188,9 +14397,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3021 = vsub_f32(v3001, v3019); float32x2_t v3031 = vadd_f32(v2996, v3030); float32x2_t v3046 = vsub_f32(v3045, v3014); - v6[ostride * 5] = v2504; - v6[ostride * 15] = v2527; - float32x2_t v2537 = vsub_f32(v2536, v2504); float32x2_t v2609 = vsub_f32(v1829, v2608); v6[ostride] = v2630; float32x2_t v2642 = vmul_f32(v2641, v3070); @@ -16206,7 +14412,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3047 = vadd_f32(v1847, v3020); float32x2_t v3058 = vrev64_f32(v3031); float32x2_t v3071 = vrev64_f32(v3046); - v6[ostride * 20] = v2537; float32x2_t v2619 = vsub_f32(v2609, v2618); float32x2_t v2623 = vmul_f32(v2609, v3091); float32x2_t v2763 = vsub_f32(v2762, v2758); @@ -16303,7 +14508,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float v1545 = 2.5000000000000000e-01F; float v1557 = 5.5901699437494745e-01F; float v1569 = 6.1803398874989490e-01F; - float v1597 = 0.0000000000000000e+00F; float v1598 = -9.5105651629515353e-01F; float v1626 = 2.0000000000000000e+00F; const float32x2_t *v1709 = &v5[v0]; @@ -16331,7 +14535,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int64_t v681 = v0 * 14; int64_t v688 = v0 * 19; int64_t v695 = v0 * 24; - float v883 = v4 * v1597; int64_t v943 = v2 * 5; int64_t v958 = v2 * 10; int64_t v971 = v2 * 15; @@ -16366,6 +14569,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t 
*restrict x, int64_t v1619 = v2 * 19; int64_t v1632 = v2 * 24; const float32x2_t *v1645 = &v5[0]; + svfloat32_t v1967 = svdup_n_f32(0); float32x2_t *v1981 = &v6[0]; svfloat32_t v2024 = svdup_n_f32(v991); svfloat32_t v2088 = svdup_n_f32(v1153); @@ -16379,7 +14583,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v2228 = svdup_n_f32(v1557); svfloat32_t v2230 = svdup_n_f32(v1569); svfloat32_t v2270 = svdup_n_f32(v1626); - svfloat32_t v2291 = svreinterpret_f32_f64( + svfloat32_t v2315 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1709)[0])); const float32x2_t *v1654 = &v5[v26]; const float32x2_t *v1663 = &v5[v33]; @@ -16404,7 +14608,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v1919 = &v5[v681]; const float32x2_t *v1928 = &v5[v688]; const float32x2_t *v1937 = &v5[v695]; - svfloat32_t v1967 = svdup_n_f32(v883); float32x2_t *v1991 = &v6[v943]; float32x2_t *v2001 = &v6[v958]; float32x2_t *v2011 = &v6[v971]; @@ -16438,94 +14641,84 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t *v2257 = &v6[v1606]; float32x2_t *v2267 = &v6[v1619]; float32x2_t *v2277 = &v6[v1632]; - svfloat32_t v2281 = svreinterpret_f32_f64( + svfloat32_t v2305 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1645)[0])); - svfloat32_t v2283 = svreinterpret_f32_f64( + svfloat32_t v2307 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1654)[0])); - svfloat32_t v2285 = svreinterpret_f32_f64( + svfloat32_t v2309 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1663)[0])); - svfloat32_t v2287 = svreinterpret_f32_f64( + svfloat32_t v2311 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1672)[0])); - svfloat32_t v2289 = svreinterpret_f32_f64( + svfloat32_t v2313 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1681)[0])); - svfloat32_t v2293 = svreinterpret_f32_f64( + 
svfloat32_t v2317 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1718)[0])); - svfloat32_t v2295 = svreinterpret_f32_f64( + svfloat32_t v2319 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1727)[0])); - svfloat32_t v2297 = svreinterpret_f32_f64( + svfloat32_t v2321 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1736)[0])); - svfloat32_t v2299 = svreinterpret_f32_f64( + svfloat32_t v2323 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1745)[0])); - svfloat32_t v2301 = svreinterpret_f32_f64( + svfloat32_t v2325 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1773)[0])); - svfloat32_t v2303 = svreinterpret_f32_f64( + svfloat32_t v2327 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1782)[0])); - svfloat32_t v2305 = svreinterpret_f32_f64( + svfloat32_t v2329 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1791)[0])); - svfloat32_t v2307 = svreinterpret_f32_f64( + svfloat32_t v2331 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1800)[0])); - svfloat32_t v2309 = svreinterpret_f32_f64( + svfloat32_t v2333 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1809)[0])); - svfloat32_t v2311 = svreinterpret_f32_f64( + svfloat32_t v2335 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1837)[0])); - svfloat32_t v2313 = svreinterpret_f32_f64( + svfloat32_t v2337 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1846)[0])); - svfloat32_t v2315 = svreinterpret_f32_f64( + svfloat32_t v2339 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1855)[0])); - svfloat32_t v2317 = svreinterpret_f32_f64( + svfloat32_t v2341 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1864)[0])); - svfloat32_t v2319 = svreinterpret_f32_f64( + svfloat32_t v2343 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1873)[0])); - svfloat32_t v2321 = svreinterpret_f32_f64( + svfloat32_t 
v2345 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1901)[0])); - svfloat32_t v2323 = svreinterpret_f32_f64( + svfloat32_t v2347 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1910)[0])); - svfloat32_t v2325 = svreinterpret_f32_f64( + svfloat32_t v2349 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1919)[0])); - svfloat32_t v2327 = svreinterpret_f32_f64( + svfloat32_t v2351 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1928)[0])); - svfloat32_t v2329 = svreinterpret_f32_f64( + svfloat32_t v2353 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1937)[0])); - svfloat32_t v65 = svcmla_f32_x(pred_full, v2283, v1967, v2283, 90); - svfloat32_t v78 = svcmla_f32_x(pred_full, v2285, v1967, v2285, 90); - svfloat32_t v91 = svcmla_f32_x(pred_full, v2289, v1967, v2289, 90); - svfloat32_t v111 = svcmla_f32_x(pred_full, v2287, v1967, v2287, 90); - svfloat32_t v227 = svcmla_f32_x(pred_full, v2293, v1967, v2293, 90); - svfloat32_t v240 = svcmla_f32_x(pred_full, v2295, v1967, v2295, 90); - svfloat32_t v253 = svcmla_f32_x(pred_full, v2299, v1967, v2299, 90); - svfloat32_t v273 = svcmla_f32_x(pred_full, v2297, v1967, v2297, 90); - svfloat32_t v389 = svcmla_f32_x(pred_full, v2303, v1967, v2303, 90); - svfloat32_t v402 = svcmla_f32_x(pred_full, v2305, v1967, v2305, 90); - svfloat32_t v415 = svcmla_f32_x(pred_full, v2309, v1967, v2309, 90); - svfloat32_t v435 = svcmla_f32_x(pred_full, v2307, v1967, v2307, 90); - svfloat32_t v551 = svcmla_f32_x(pred_full, v2313, v1967, v2313, 90); - svfloat32_t v564 = svcmla_f32_x(pred_full, v2315, v1967, v2315, 90); - svfloat32_t v577 = svcmla_f32_x(pred_full, v2319, v1967, v2319, 90); - svfloat32_t v597 = svcmla_f32_x(pred_full, v2317, v1967, v2317, 90); - svfloat32_t v713 = svcmla_f32_x(pred_full, v2323, v1967, v2323, 90); - svfloat32_t v726 = svcmla_f32_x(pred_full, v2325, v1967, v2325, 90); - svfloat32_t v739 = svcmla_f32_x(pred_full, v2329, v1967, v2329, 90); - 
svfloat32_t v759 = svcmla_f32_x(pred_full, v2327, v1967, v2327, 90); - svfloat32_t v92; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v65), "w"(v91)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v78), "w"(v111)); - svfloat32_t v254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v227), "w"(v253)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v240), "w"(v273)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v389), "w"(v415)); - svfloat32_t v436; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v402), "w"(v435)); - svfloat32_t v578; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v551), "w"(v577)); - svfloat32_t v598; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v564), "w"(v597)); - svfloat32_t v740; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v740) : "w"(v713), "w"(v739)); - svfloat32_t v760; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v726), "w"(v759)); + svfloat32_t v65 = svcmla_f32_x(pred_full, v2307, v1967, v2307, 90); + svfloat32_t v78 = svcmla_f32_x(pred_full, v2309, v1967, v2309, 90); + svfloat32_t v91 = svcmla_f32_x(pred_full, v2313, v1967, v2313, 90); + svfloat32_t v111 = svcmla_f32_x(pred_full, v2311, v1967, v2311, 90); + svfloat32_t v227 = svcmla_f32_x(pred_full, v2317, v1967, v2317, 90); + svfloat32_t v240 = svcmla_f32_x(pred_full, v2319, v1967, v2319, 90); + svfloat32_t v253 = svcmla_f32_x(pred_full, v2323, v1967, v2323, 90); + svfloat32_t v273 = svcmla_f32_x(pred_full, v2321, v1967, v2321, 90); + svfloat32_t v389 = svcmla_f32_x(pred_full, v2327, v1967, v2327, 90); + svfloat32_t v402 = svcmla_f32_x(pred_full, v2329, v1967, v2329, 90); + svfloat32_t v415 = svcmla_f32_x(pred_full, v2333, v1967, v2333, 90); + svfloat32_t v435 = svcmla_f32_x(pred_full, v2331, v1967, v2331, 90); + svfloat32_t v551 = svcmla_f32_x(pred_full, v2337, v1967, v2337, 90); + svfloat32_t v564 = svcmla_f32_x(pred_full, v2339, v1967, v2339, 90); + svfloat32_t v577 = svcmla_f32_x(pred_full, v2343, v1967, 
v2343, 90); + svfloat32_t v597 = svcmla_f32_x(pred_full, v2341, v1967, v2341, 90); + svfloat32_t v713 = svcmla_f32_x(pred_full, v2347, v1967, v2347, 90); + svfloat32_t v726 = svcmla_f32_x(pred_full, v2349, v1967, v2349, 90); + svfloat32_t v739 = svcmla_f32_x(pred_full, v2353, v1967, v2353, 90); + svfloat32_t v759 = svcmla_f32_x(pred_full, v2351, v1967, v2351, 90); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v65, v91); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v78, v111); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v227, v253); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v240, v273); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v389, v415); + svfloat32_t v436 = svsub_f32_x(svptrue_b32(), v402, v435); + svfloat32_t v578 = svsub_f32_x(svptrue_b32(), v551, v577); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v564, v597); + svfloat32_t v740 = svsub_f32_x(svptrue_b32(), v713, v739); + svfloat32_t v760 = svsub_f32_x(svptrue_b32(), v726, v759); svfloat32_t v98 = svnmls_f32_x(pred_full, v92, v65, v2270); svfloat32_t v118 = svnmls_f32_x(pred_full, v112, v78, v2270); svfloat32_t v260 = svnmls_f32_x(pred_full, v254, v227, v2270); @@ -16536,81 +14729,56 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v604 = svnmls_f32_x(pred_full, v598, v564, v2270); svfloat32_t v746 = svnmls_f32_x(pred_full, v740, v713, v2270); svfloat32_t v766 = svnmls_f32_x(pred_full, v760, v726, v2270); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v98), "w"(v118)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v98), "w"(v118)); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v98, v118); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v98, v118); svfloat32_t v132 = svmla_f32_x(pred_full, v92, v112, v2230); svfloat32_t v150 = svnmls_f32_x(pred_full, v112, v92, v2230); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v260), "w"(v280)); - svfloat32_t v282; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v282) : "w"(v260), "w"(v280)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v260, v280); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v260, v280); svfloat32_t v294 = svmla_f32_x(pred_full, v254, v274, v2230); svfloat32_t v312 = svnmls_f32_x(pred_full, v274, v254, v2230); - svfloat32_t v443; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v422), "w"(v442)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v422), "w"(v442)); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v422, v442); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v422, v442); svfloat32_t v456 = svmla_f32_x(pred_full, v416, v436, v2230); svfloat32_t v474 = svnmls_f32_x(pred_full, v436, v416, v2230); - svfloat32_t v605; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v584), "w"(v604)); - svfloat32_t v606; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v584), "w"(v604)); + svfloat32_t v605 = svadd_f32_x(svptrue_b32(), v584, v604); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v584, v604); svfloat32_t v618 = svmla_f32_x(pred_full, v578, v598, v2230); svfloat32_t v636 = svnmls_f32_x(pred_full, v598, v578, v2230); - svfloat32_t v767; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v746), "w"(v766)); - svfloat32_t v768; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v746), "w"(v766)); + svfloat32_t v767 = svadd_f32_x(svptrue_b32(), v746, v766); + svfloat32_t v768 = svsub_f32_x(svptrue_b32(), v746, v766); svfloat32_t v780 = svmla_f32_x(pred_full, v740, v760, v2230); svfloat32_t v798 = svnmls_f32_x(pred_full, v760, v740, v2230); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v2281), "w"(v119)); - svfloat32_t zero158; - asm volatile("mov %0.s, #0" : "=w"(zero158)); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v2305, v119); + svfloat32_t zero158 = svdup_n_f32(0); svfloat32_t v158 = svcmla_f32_x(pred_full, zero158, v2250, v132, 90); - svfloat32_t zero166; - asm volatile("mov %0.s, #0" : "=w"(zero166)); + svfloat32_t zero166 = 
svdup_n_f32(0); svfloat32_t v166 = svcmla_f32_x(pred_full, zero166, v2250, v150, 90); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v2291), "w"(v281)); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v2315, v281); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v2250, v294, 90); - svfloat32_t zero328; - asm volatile("mov %0.s, #0" : "=w"(zero328)); + svfloat32_t zero328 = svdup_n_f32(0); svfloat32_t v328 = svcmla_f32_x(pred_full, zero328, v2250, v312, 90); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v2301), "w"(v443)); - svfloat32_t zero482; - asm volatile("mov %0.s, #0" : "=w"(zero482)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v2325, v443); + svfloat32_t zero482 = svdup_n_f32(0); svfloat32_t v482 = svcmla_f32_x(pred_full, zero482, v2250, v456, 90); - svfloat32_t zero490; - asm volatile("mov %0.s, #0" : "=w"(zero490)); + svfloat32_t zero490 = svdup_n_f32(0); svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v2250, v474, 90); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v2311), "w"(v605)); - svfloat32_t zero644; - asm volatile("mov %0.s, #0" : "=w"(zero644)); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v2335, v605); + svfloat32_t zero644 = svdup_n_f32(0); svfloat32_t v644 = svcmla_f32_x(pred_full, zero644, v2250, v618, 90); - svfloat32_t zero652; - asm volatile("mov %0.s, #0" : "=w"(zero652)); + svfloat32_t zero652 = svdup_n_f32(0); svfloat32_t v652 = svcmla_f32_x(pred_full, zero652, v2250, v636, 90); - svfloat32_t v799; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v2321), "w"(v767)); - svfloat32_t zero806; - asm volatile("mov %0.s, #0" : "=w"(zero806)); + svfloat32_t v799 = svadd_f32_x(svptrue_b32(), v2345, v767); + svfloat32_t zero806 = svdup_n_f32(0); svfloat32_t v806 = svcmla_f32_x(pred_full, zero806, v2250, v780, 90); - svfloat32_t zero814; - asm volatile("mov 
%0.s, #0" : "=w"(zero814)); + svfloat32_t zero814 = svdup_n_f32(0); svfloat32_t v814 = svcmla_f32_x(pred_full, zero814, v2250, v798, 90); - svfloat32_t v126 = svmls_f32_x(pred_full, v2281, v119, v2226); - svfloat32_t v288 = svmls_f32_x(pred_full, v2291, v281, v2226); - svfloat32_t v450 = svmls_f32_x(pred_full, v2301, v443, v2226); - svfloat32_t v612 = svmls_f32_x(pred_full, v2311, v605, v2226); - svfloat32_t v774 = svmls_f32_x(pred_full, v2321, v767, v2226); + svfloat32_t v126 = svmls_f32_x(pred_full, v2305, v119, v2226); + svfloat32_t v288 = svmls_f32_x(pred_full, v2315, v281, v2226); + svfloat32_t v450 = svmls_f32_x(pred_full, v2325, v443, v2226); + svfloat32_t v612 = svmls_f32_x(pred_full, v2335, v605, v2226); + svfloat32_t v774 = svmls_f32_x(pred_full, v2345, v767, v2226); svfloat32_t v138 = svmls_f32_x(pred_full, v126, v120, v2228); svfloat32_t v300 = svmls_f32_x(pred_full, v288, v282, v2228); svfloat32_t v462 = svmls_f32_x(pred_full, v450, v444, v2228); @@ -16621,119 +14789,80 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v866 = svcmla_f32_x(pred_full, v799, v1967, v799, 90); svfloat32_t v886 = svcmla_f32_x(pred_full, v637, v1967, v637, 90); svfloat32_t v144 = svnmls_f32_x(pred_full, v138, v126, v2270); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v138), "w"(v166)); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v138, v166); svfloat32_t v306 = svnmls_f32_x(pred_full, v300, v288, v2270); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v300), "w"(v328)); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v300, v328); svfloat32_t v468 = svnmls_f32_x(pred_full, v462, v450, v2270); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v462), "w"(v490)); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v462, v490); svfloat32_t v630 = svnmls_f32_x(pred_full, v624, v612, v2270); - svfloat32_t v653; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v653) : "w"(v624), 
"w"(v652)); + svfloat32_t v653 = svsub_f32_x(svptrue_b32(), v624, v652); svfloat32_t v792 = svnmls_f32_x(pred_full, v786, v774, v2270); - svfloat32_t v815; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v815) : "w"(v786), "w"(v814)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v840), "w"(v866)); - svfloat32_t v887; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v887) : "w"(v853), "w"(v886)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v144), "w"(v158)); + svfloat32_t v815 = svsub_f32_x(svptrue_b32(), v786, v814); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v840, v866); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v853, v886); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v144, v158); svfloat32_t v173 = svnmls_f32_x(pred_full, v167, v138, v2270); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v306), "w"(v320)); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v306, v320); svfloat32_t v335 = svnmls_f32_x(pred_full, v329, v300, v2270); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v468), "w"(v482)); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v468, v482); svfloat32_t v497 = svnmls_f32_x(pred_full, v491, v462, v2270); - svfloat32_t v645; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v645) : "w"(v630), "w"(v644)); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v630, v644); svfloat32_t v659 = svnmls_f32_x(pred_full, v653, v624, v2270); - svfloat32_t v807; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v807) : "w"(v792), "w"(v806)); + svfloat32_t v807 = svsub_f32_x(svptrue_b32(), v792, v806); svfloat32_t v821 = svnmls_f32_x(pred_full, v815, v786, v2270); svfloat32_t v873 = svnmls_f32_x(pred_full, v867, v840, v2270); svfloat32_t v893 = svnmls_f32_x(pred_full, v887, v853, v2270); - svfloat32_t v1156; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1156) : "w"(v329), "w"(v2088)); - svfloat32_t v1169; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1169) : "w"(v491), "w"(v2216)); - svfloat32_t v1182; - asm("fmul %0.s, %1.s, 
%2.s" : "=w"(v1182) : "w"(v815), "w"(v2218)); - svfloat32_t v1202; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1202) : "w"(v653), "w"(v2154)); + svfloat32_t v1156 = svmul_f32_x(svptrue_b32(), v329, v2088); + svfloat32_t v1169 = svmul_f32_x(svptrue_b32(), v491, v2216); + svfloat32_t v1182 = svmul_f32_x(svptrue_b32(), v815, v2218); + svfloat32_t v1202 = svmul_f32_x(svptrue_b32(), v653, v2154); svfloat32_t v179 = svnmls_f32_x(pred_full, v159, v144, v2270); svfloat32_t v341 = svnmls_f32_x(pred_full, v321, v306, v2270); svfloat32_t v503 = svnmls_f32_x(pred_full, v483, v468, v2270); svfloat32_t v665 = svnmls_f32_x(pred_full, v645, v630, v2270); svfloat32_t v827 = svnmls_f32_x(pred_full, v807, v792, v2270); - svfloat32_t v894; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v873), "w"(v893)); - svfloat32_t v895; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v873), "w"(v893)); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v873, v893); + svfloat32_t v895 = svsub_f32_x(svptrue_b32(), v873, v893); svfloat32_t v907 = svmla_f32_x(pred_full, v867, v887, v2230); svfloat32_t v925 = svnmls_f32_x(pred_full, v887, v867, v2230); - svfloat32_t v994; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v994) : "w"(v321), "w"(v2024)); - svfloat32_t v1007; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1007) : "w"(v483), "w"(v2088)); - svfloat32_t v1020; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1020) : "w"(v807), "w"(v2216)); - svfloat32_t v1040; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1040) : "w"(v645), "w"(v2152)); + svfloat32_t v994 = svmul_f32_x(svptrue_b32(), v321, v2024); + svfloat32_t v1007 = svmul_f32_x(svptrue_b32(), v483, v2088); + svfloat32_t v1020 = svmul_f32_x(svptrue_b32(), v807, v2216); + svfloat32_t v1040 = svmul_f32_x(svptrue_b32(), v645, v2152); svfloat32_t v1164 = svcmla_f32_x(pred_full, v1156, v2089, v329, 90); svfloat32_t v1177 = svcmla_f32_x(pred_full, v1169, v2217, v491, 90); svfloat32_t v1190 = svcmla_f32_x(pred_full, v1182, v2219, v815, 90); svfloat32_t v1210 = svcmla_f32_x(pred_full, v1202, 
v2155, v653, 90); - svfloat32_t v1318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1318) : "w"(v335), "w"(v2152)); - svfloat32_t v1331; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1331) : "w"(v497), "w"(v2154)); - svfloat32_t v1344; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1344) : "w"(v821), "w"(v2223)); - svfloat32_t v1364; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1364) : "w"(v659), "w"(v2220)); - svfloat32_t v926; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v926) : "w"(v151), "w"(v894)); - svfloat32_t zero940; - asm volatile("mov %0.s, #0" : "=w"(zero940)); + svfloat32_t v1318 = svmul_f32_x(svptrue_b32(), v335, v2152); + svfloat32_t v1331 = svmul_f32_x(svptrue_b32(), v497, v2154); + svfloat32_t v1344 = svmul_f32_x(svptrue_b32(), v821, v2223); + svfloat32_t v1364 = svmul_f32_x(svptrue_b32(), v659, v2220); + svfloat32_t v926 = svadd_f32_x(svptrue_b32(), v151, v894); + svfloat32_t zero940 = svdup_n_f32(0); svfloat32_t v940 = svcmla_f32_x(pred_full, zero940, v2250, v907, 90); - svfloat32_t zero955; - asm volatile("mov %0.s, #0" : "=w"(zero955)); + svfloat32_t zero955 = svdup_n_f32(0); svfloat32_t v955 = svcmla_f32_x(pred_full, zero955, v2250, v925, 90); svfloat32_t v1002 = svcmla_f32_x(pred_full, v994, v2025, v321, 90); svfloat32_t v1015 = svcmla_f32_x(pred_full, v1007, v2089, v483, 90); svfloat32_t v1028 = svcmla_f32_x(pred_full, v1020, v2217, v807, 90); svfloat32_t v1048 = svcmla_f32_x(pred_full, v1040, v2153, v645, 90); - svfloat32_t v1191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1191) : "w"(v1164), "w"(v1190)); - svfloat32_t v1211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1211) : "w"(v1177), "w"(v1210)); + svfloat32_t v1191 = svsub_f32_x(svptrue_b32(), v1164, v1190); + svfloat32_t v1211 = svsub_f32_x(svptrue_b32(), v1177, v1210); svfloat32_t v1326 = svcmla_f32_x(pred_full, v1318, v2153, v335, 90); svfloat32_t v1339 = svcmla_f32_x(pred_full, v1331, v2155, v497, 90); svfloat32_t v1352 = svcmla_f32_x(pred_full, v1344, v2224, v821, 90); svfloat32_t v1372 = svcmla_f32_x(pred_full, v1364, v2160, 
v659, 90); - svfloat32_t v1480; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1480) : "w"(v341), "w"(v2216)); - svfloat32_t v1493; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1493) : "w"(v503), "w"(v2218)); - svfloat32_t v1506; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1506) : "w"(v827), "w"(v2220)); - svfloat32_t v1526; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1526) : "w"(v665), "w"(v2223)); + svfloat32_t v1480 = svmul_f32_x(svptrue_b32(), v341, v2216); + svfloat32_t v1493 = svmul_f32_x(svptrue_b32(), v503, v2218); + svfloat32_t v1506 = svmul_f32_x(svptrue_b32(), v827, v2220); + svfloat32_t v1526 = svmul_f32_x(svptrue_b32(), v665, v2223); svfloat32_t v901 = svmls_f32_x(pred_full, v151, v894, v2226); - svfloat32_t v1029; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1029) : "w"(v1002), "w"(v1028)); - svfloat32_t v1049; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1049) : "w"(v1015), "w"(v1048)); + svfloat32_t v1029 = svsub_f32_x(svptrue_b32(), v1002, v1028); + svfloat32_t v1049 = svsub_f32_x(svptrue_b32(), v1015, v1048); svfloat32_t v1197 = svnmls_f32_x(pred_full, v1191, v1164, v2270); svfloat32_t v1217 = svnmls_f32_x(pred_full, v1211, v1177, v2270); - svfloat32_t v1353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1353) : "w"(v1326), "w"(v1352)); - svfloat32_t v1373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1373) : "w"(v1339), "w"(v1372)); + svfloat32_t v1353 = svsub_f32_x(svptrue_b32(), v1326, v1352); + svfloat32_t v1373 = svsub_f32_x(svptrue_b32(), v1339, v1372); svfloat32_t v1488 = svcmla_f32_x(pred_full, v1480, v2217, v341, 90); svfloat32_t v1501 = svcmla_f32_x(pred_full, v1493, v2219, v503, 90); svfloat32_t v1514 = svcmla_f32_x(pred_full, v1506, v2221, v827, 90); @@ -16742,67 +14871,46 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v913 = svmls_f32_x(pred_full, v901, v895, v2228); svfloat32_t v1035 = svnmls_f32_x(pred_full, v1029, v1002, v2270); svfloat32_t v1055 = svnmls_f32_x(pred_full, v1049, v1015, v2270); - svfloat32_t v1218; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v1218) : "w"(v1197), "w"(v1217)); - svfloat32_t v1219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1219) : "w"(v1197), "w"(v1217)); + svfloat32_t v1218 = svadd_f32_x(svptrue_b32(), v1197, v1217); + svfloat32_t v1219 = svsub_f32_x(svptrue_b32(), v1197, v1217); svfloat32_t v1231 = svmla_f32_x(pred_full, v1191, v1211, v2230); svfloat32_t v1249 = svnmls_f32_x(pred_full, v1211, v1191, v2230); svfloat32_t v1359 = svnmls_f32_x(pred_full, v1353, v1326, v2270); svfloat32_t v1379 = svnmls_f32_x(pred_full, v1373, v1339, v2270); - svfloat32_t v1515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1515) : "w"(v1488), "w"(v1514)); - svfloat32_t v1535; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1535) : "w"(v1501), "w"(v1534)); + svfloat32_t v1515 = svsub_f32_x(svptrue_b32(), v1488, v1514); + svfloat32_t v1535 = svsub_f32_x(svptrue_b32(), v1501, v1534); svfloat32_t v919 = svnmls_f32_x(pred_full, v913, v901, v2270); - svfloat32_t v956; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v956) : "w"(v913), "w"(v955)); - svfloat32_t v1056; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1056) : "w"(v1035), "w"(v1055)); - svfloat32_t v1057; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1057) : "w"(v1035), "w"(v1055)); + svfloat32_t v956 = svsub_f32_x(svptrue_b32(), v913, v955); + svfloat32_t v1056 = svadd_f32_x(svptrue_b32(), v1035, v1055); + svfloat32_t v1057 = svsub_f32_x(svptrue_b32(), v1035, v1055); svfloat32_t v1069 = svmla_f32_x(pred_full, v1029, v1049, v2230); svfloat32_t v1087 = svnmls_f32_x(pred_full, v1049, v1029, v2230); - svfloat32_t v1250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1250) : "w"(v167), "w"(v1218)); - svfloat32_t zero1264; - asm volatile("mov %0.s, #0" : "=w"(zero1264)); + svfloat32_t v1250 = svadd_f32_x(svptrue_b32(), v167, v1218); + svfloat32_t zero1264 = svdup_n_f32(0); svfloat32_t v1264 = svcmla_f32_x(pred_full, zero1264, v2250, v1231, 90); - svfloat32_t zero1279; - asm volatile("mov %0.s, #0" : "=w"(zero1279)); + svfloat32_t zero1279 = svdup_n_f32(0); svfloat32_t v1279 = svcmla_f32_x(pred_full, 
zero1279, v2250, v1249, 90); - svfloat32_t v1380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1380) : "w"(v1359), "w"(v1379)); - svfloat32_t v1381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1381) : "w"(v1359), "w"(v1379)); + svfloat32_t v1380 = svadd_f32_x(svptrue_b32(), v1359, v1379); + svfloat32_t v1381 = svsub_f32_x(svptrue_b32(), v1359, v1379); svfloat32_t v1393 = svmla_f32_x(pred_full, v1353, v1373, v2230); svfloat32_t v1411 = svnmls_f32_x(pred_full, v1373, v1353, v2230); svfloat32_t v1521 = svnmls_f32_x(pred_full, v1515, v1488, v2270); svfloat32_t v1541 = svnmls_f32_x(pred_full, v1535, v1501, v2270); - svfloat32_t v941; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v941) : "w"(v919), "w"(v940)); + svfloat32_t v941 = svsub_f32_x(svptrue_b32(), v919, v940); svfloat32_t v969 = svnmls_f32_x(pred_full, v956, v913, v2270); - svfloat32_t v1088; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1088) : "w"(v159), "w"(v1056)); - svfloat32_t zero1102; - asm volatile("mov %0.s, #0" : "=w"(zero1102)); + svfloat32_t v1088 = svadd_f32_x(svptrue_b32(), v159, v1056); + svfloat32_t zero1102 = svdup_n_f32(0); svfloat32_t v1102 = svcmla_f32_x(pred_full, zero1102, v2250, v1069, 90); - svfloat32_t zero1117; - asm volatile("mov %0.s, #0" : "=w"(zero1117)); + svfloat32_t zero1117 = svdup_n_f32(0); svfloat32_t v1117 = svcmla_f32_x(pred_full, zero1117, v2250, v1087, 90); svfloat32_t v1225 = svmls_f32_x(pred_full, v167, v1218, v2226); - svfloat32_t v1412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1412) : "w"(v173), "w"(v1380)); - svfloat32_t zero1426; - asm volatile("mov %0.s, #0" : "=w"(zero1426)); + svfloat32_t v1412 = svadd_f32_x(svptrue_b32(), v173, v1380); + svfloat32_t zero1426 = svdup_n_f32(0); svfloat32_t v1426 = svcmla_f32_x(pred_full, zero1426, v2250, v1393, 90); - svfloat32_t zero1441; - asm volatile("mov %0.s, #0" : "=w"(zero1441)); + svfloat32_t zero1441 = svdup_n_f32(0); svfloat32_t v1441 = svcmla_f32_x(pred_full, zero1441, v2250, v1411, 90); - svfloat32_t v1542; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v1542) : "w"(v1521), "w"(v1541)); - svfloat32_t v1543; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1543) : "w"(v1521), "w"(v1541)); + svfloat32_t v1542 = svadd_f32_x(svptrue_b32(), v1521, v1541); + svfloat32_t v1543 = svsub_f32_x(svptrue_b32(), v1521, v1541); svfloat32_t v1555 = svmla_f32_x(pred_full, v1515, v1535, v2230); svfloat32_t v1573 = svnmls_f32_x(pred_full, v1535, v1515, v2230); svst1_f64(pred_full, (double *)(v2001), svreinterpret_f64_f32(v956)); @@ -16811,13 +14919,10 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1063 = svmls_f32_x(pred_full, v159, v1056, v2226); svfloat32_t v1237 = svmls_f32_x(pred_full, v1225, v1219, v2228); svfloat32_t v1387 = svmls_f32_x(pred_full, v173, v1380, v2226); - svfloat32_t v1574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1574) : "w"(v179), "w"(v1542)); - svfloat32_t zero1588; - asm volatile("mov %0.s, #0" : "=w"(zero1588)); + svfloat32_t v1574 = svadd_f32_x(svptrue_b32(), v179, v1542); + svfloat32_t zero1588 = svdup_n_f32(0); svfloat32_t v1588 = svcmla_f32_x(pred_full, zero1588, v2250, v1555, 90); - svfloat32_t zero1603; - asm volatile("mov %0.s, #0" : "=w"(zero1603)); + svfloat32_t zero1603 = svdup_n_f32(0); svfloat32_t v1603 = svcmla_f32_x(pred_full, zero1603, v2250, v1573, 90); svst1_f64(pred_full, (double *)(v1991), svreinterpret_f64_f32(v941)); svst1_f64(pred_full, (double *)(v2011), svreinterpret_f64_f32(v969)); @@ -16825,41 +14930,33 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v2173), svreinterpret_f64_f32(v1412)); svfloat32_t v1075 = svmls_f32_x(pred_full, v1063, v1057, v2228); svfloat32_t v1243 = svnmls_f32_x(pred_full, v1237, v1225, v2270); - svfloat32_t v1280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1280) : "w"(v1237), "w"(v1279)); + svfloat32_t v1280 = svsub_f32_x(svptrue_b32(), v1237, v1279); svfloat32_t v1399 = svmls_f32_x(pred_full, v1387, v1381, v2228); svfloat32_t v1549 = svmls_f32_x(pred_full, 
v179, v1542, v2226); svst1_f64(pred_full, (double *)(v2021), svreinterpret_f64_f32(v982)); svst1_f64(pred_full, (double *)(v2237), svreinterpret_f64_f32(v1574)); svfloat32_t v1081 = svnmls_f32_x(pred_full, v1075, v1063, v2270); - svfloat32_t v1118; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1118) : "w"(v1075), "w"(v1117)); - svfloat32_t v1265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1265) : "w"(v1243), "w"(v1264)); + svfloat32_t v1118 = svsub_f32_x(svptrue_b32(), v1075, v1117); + svfloat32_t v1265 = svsub_f32_x(svptrue_b32(), v1243, v1264); svfloat32_t v1293 = svnmls_f32_x(pred_full, v1280, v1237, v2270); svfloat32_t v1405 = svnmls_f32_x(pred_full, v1399, v1387, v2270); - svfloat32_t v1442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1442) : "w"(v1399), "w"(v1441)); + svfloat32_t v1442 = svsub_f32_x(svptrue_b32(), v1399, v1441); svfloat32_t v1561 = svmls_f32_x(pred_full, v1549, v1543, v2228); svst1_f64(pred_full, (double *)(v2129), svreinterpret_f64_f32(v1280)); - svfloat32_t v1103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1103) : "w"(v1081), "w"(v1102)); + svfloat32_t v1103 = svsub_f32_x(svptrue_b32(), v1081, v1102); svfloat32_t v1131 = svnmls_f32_x(pred_full, v1118, v1075, v2270); svfloat32_t v1306 = svnmls_f32_x(pred_full, v1265, v1243, v2270); - svfloat32_t v1427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1427) : "w"(v1405), "w"(v1426)); + svfloat32_t v1427 = svsub_f32_x(svptrue_b32(), v1405, v1426); svfloat32_t v1455 = svnmls_f32_x(pred_full, v1442, v1399, v2270); svfloat32_t v1567 = svnmls_f32_x(pred_full, v1561, v1549, v2270); - svfloat32_t v1604; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1604) : "w"(v1561), "w"(v1603)); + svfloat32_t v1604 = svsub_f32_x(svptrue_b32(), v1561, v1603); svst1_f64(pred_full, (double *)(v2065), svreinterpret_f64_f32(v1118)); svst1_f64(pred_full, (double *)(v2119), svreinterpret_f64_f32(v1265)); svst1_f64(pred_full, (double *)(v2139), svreinterpret_f64_f32(v1293)); svst1_f64(pred_full, (double *)(v2193), svreinterpret_f64_f32(v1442)); svfloat32_t v1144 = 
svnmls_f32_x(pred_full, v1103, v1081, v2270); svfloat32_t v1468 = svnmls_f32_x(pred_full, v1427, v1405, v2270); - svfloat32_t v1589; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1589) : "w"(v1567), "w"(v1588)); + svfloat32_t v1589 = svsub_f32_x(svptrue_b32(), v1567, v1588); svfloat32_t v1617 = svnmls_f32_x(pred_full, v1604, v1561, v2270); svst1_f64(pred_full, (double *)(v2055), svreinterpret_f64_f32(v1103)); svst1_f64(pred_full, (double *)(v2075), svreinterpret_f64_f32(v1131)); @@ -16879,6 +14976,2594 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu25(const armral_cmplx_f32_t *restrict x, } #endif +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu28(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v12 = howmany - 1; + int64_t v844 = howmany / 2; + for (int j = 0; j < v12; j += 2) { + float v487 = 4.4095855184409838e-01F; + float v495 = 3.4087293062393137e-01F; + float v503 = -5.3396936033772524e-01F; + float v511 = 8.7484229096165667e-01F; + float v555 = 1.0000000000000000e+00F; + float v556 = -1.0000000000000000e+00F; + float v563 = -1.1666666666666665e+00F; + float v564 = 1.1666666666666665e+00F; + float v571 = 7.9015646852540022e-01F; + float v572 = -7.9015646852540022e-01F; + float v579 = 5.5854267289647742e-02F; + float v580 = -5.5854267289647742e-02F; + float v587 = 7.3430220123575241e-01F; + float v588 = -7.3430220123575241e-01F; + float32x2_t v590 = (float32x2_t){v4, v4}; + float v596 = -4.4095855184409838e-01F; + float v601 = -3.4087293062393137e-01F; + float v606 = 5.3396936033772524e-01F; + float v611 = -8.7484229096165667e-01F; + const float32x2_t *v1637 = &v5[istride]; + float32x2_t *v1836 = &v6[ostride]; + float32x2_t v469 = (float32x2_t){v563, v563}; + float32x2_t v474 = (float32x2_t){v571, v571}; + float32x2_t v479 = (float32x2_t){v579, v579}; + float32x2_t 
v484 = (float32x2_t){v587, v587}; + float32x2_t v489 = (float32x2_t){v487, v596}; + float32x2_t v497 = (float32x2_t){v495, v601}; + float32x2_t v505 = (float32x2_t){v503, v606}; + float32x2_t v513 = (float32x2_t){v511, v611}; + float32x2_t v557 = (float32x2_t){v555, v556}; + float32x2_t v565 = (float32x2_t){v563, v564}; + float32x2_t v573 = (float32x2_t){v571, v572}; + float32x2_t v581 = (float32x2_t){v579, v580}; + float32x2_t v589 = (float32x2_t){v587, v588}; + float32x2_t v597 = (float32x2_t){v596, v596}; + float32x2_t v602 = (float32x2_t){v601, v601}; + float32x2_t v607 = (float32x2_t){v606, v606}; + float32x2_t v612 = (float32x2_t){v611, v611}; + const float32x2_t *v1538 = &v5[0]; + float32x2_t *v1791 = &v6[0]; + float32x4_t v2060 = vld1q_f32((const float32_t *)v1637); + float32x4_t v470 = vcombine_f32(v469, v469); + float32x4_t v475 = vcombine_f32(v474, v474); + float32x4_t v480 = vcombine_f32(v479, v479); + float32x4_t v485 = vcombine_f32(v484, v484); + float32x2_t v491 = vmul_f32(v590, v489); + float32x2_t v499 = vmul_f32(v590, v497); + float32x2_t v507 = vmul_f32(v590, v505); + float32x2_t v515 = vmul_f32(v590, v513); + float32x2_t v559 = vmul_f32(v590, v557); + float32x2_t v567 = vmul_f32(v590, v565); + float32x2_t v575 = vmul_f32(v590, v573); + float32x2_t v583 = vmul_f32(v590, v581); + float32x2_t v591 = vmul_f32(v590, v589); + float32x4_t v598 = vcombine_f32(v597, v597); + float32x4_t v603 = vcombine_f32(v602, v602); + float32x4_t v608 = vcombine_f32(v607, v607); + float32x4_t v613 = vcombine_f32(v612, v612); + const float32x2_t *v1547 = &v5[istride * 14]; + const float32x2_t *v1556 = &v5[istride * 7]; + const float32x2_t *v1565 = &v5[istride * 21]; + const float32x2_t *v1574 = &v5[istride * 4]; + const float32x2_t *v1583 = &v5[istride * 18]; + const float32x2_t *v1592 = &v5[istride * 11]; + const float32x2_t *v1601 = &v5[istride * 25]; + const float32x2_t *v1610 = &v5[istride * 8]; + const float32x2_t *v1619 = &v5[istride * 22]; + const float32x2_t 
*v1628 = &v5[istride * 15]; + const float32x2_t *v1646 = &v5[istride * 12]; + const float32x2_t *v1655 = &v5[istride * 26]; + const float32x2_t *v1664 = &v5[istride * 19]; + const float32x2_t *v1673 = &v5[istride * 5]; + const float32x2_t *v1682 = &v5[istride * 16]; + const float32x2_t *v1691 = &v5[istride * 2]; + const float32x2_t *v1700 = &v5[istride * 23]; + const float32x2_t *v1709 = &v5[istride * 9]; + const float32x2_t *v1718 = &v5[istride * 20]; + const float32x2_t *v1727 = &v5[istride * 6]; + const float32x2_t *v1736 = &v5[istride * 27]; + const float32x2_t *v1745 = &v5[istride * 13]; + const float32x2_t *v1754 = &v5[istride * 24]; + const float32x2_t *v1763 = &v5[istride * 10]; + const float32x2_t *v1772 = &v5[istride * 3]; + const float32x2_t *v1781 = &v5[istride * 17]; + float32x2_t *v1800 = &v6[ostride * 21]; + float32x2_t *v1809 = &v6[ostride * 14]; + float32x2_t *v1818 = &v6[ostride * 7]; + float32x2_t *v1827 = &v6[ostride * 8]; + float32x2_t *v1845 = &v6[ostride * 22]; + float32x2_t *v1854 = &v6[ostride * 15]; + float32x2_t *v1863 = &v6[ostride * 16]; + float32x2_t *v1872 = &v6[ostride * 9]; + float32x2_t *v1881 = &v6[ostride * 2]; + float32x2_t *v1890 = &v6[ostride * 23]; + float32x2_t *v1899 = &v6[ostride * 24]; + float32x2_t *v1908 = &v6[ostride * 17]; + float32x2_t *v1917 = &v6[ostride * 10]; + float32x2_t *v1926 = &v6[ostride * 3]; + float32x2_t *v1935 = &v6[ostride * 4]; + float32x2_t *v1944 = &v6[ostride * 25]; + float32x2_t *v1953 = &v6[ostride * 18]; + float32x2_t *v1962 = &v6[ostride * 11]; + float32x2_t *v1971 = &v6[ostride * 12]; + float32x2_t *v1980 = &v6[ostride * 5]; + float32x2_t *v1989 = &v6[ostride * 26]; + float32x2_t *v1998 = &v6[ostride * 19]; + float32x2_t *v2007 = &v6[ostride * 20]; + float32x2_t *v2016 = &v6[ostride * 13]; + float32x2_t *v2025 = &v6[ostride * 6]; + float32x2_t *v2034 = &v6[ostride * 27]; + float32x4_t v2038 = vld1q_f32((const float32_t *)v1538); + float32x4_t v493 = vcombine_f32(v491, v491); + float32x4_t v501 
= vcombine_f32(v499, v499); + float32x4_t v509 = vcombine_f32(v507, v507); + float32x4_t v517 = vcombine_f32(v515, v515); + float32x4_t v561 = vcombine_f32(v559, v559); + float32x4_t v569 = vcombine_f32(v567, v567); + float32x4_t v577 = vcombine_f32(v575, v575); + float32x4_t v585 = vcombine_f32(v583, v583); + float32x4_t v593 = vcombine_f32(v591, v591); + float32x4_t v2040 = vld1q_f32((const float32_t *)v1547); + float32x4_t v2042 = vld1q_f32((const float32_t *)v1556); + float32x4_t v2044 = vld1q_f32((const float32_t *)v1565); + float32x4_t v2046 = vld1q_f32((const float32_t *)v1574); + float32x4_t v2048 = vld1q_f32((const float32_t *)v1583); + float32x4_t v2050 = vld1q_f32((const float32_t *)v1592); + float32x4_t v2052 = vld1q_f32((const float32_t *)v1601); + float32x4_t v2054 = vld1q_f32((const float32_t *)v1610); + float32x4_t v2056 = vld1q_f32((const float32_t *)v1619); + float32x4_t v2058 = vld1q_f32((const float32_t *)v1628); + float32x4_t v2062 = vld1q_f32((const float32_t *)v1646); + float32x4_t v2064 = vld1q_f32((const float32_t *)v1655); + float32x4_t v2066 = vld1q_f32((const float32_t *)v1664); + float32x4_t v2068 = vld1q_f32((const float32_t *)v1673); + float32x4_t v2070 = vld1q_f32((const float32_t *)v1682); + float32x4_t v2072 = vld1q_f32((const float32_t *)v1691); + float32x4_t v2074 = vld1q_f32((const float32_t *)v1700); + float32x4_t v2076 = vld1q_f32((const float32_t *)v1709); + float32x4_t v2078 = vld1q_f32((const float32_t *)v1718); + float32x4_t v2080 = vld1q_f32((const float32_t *)v1727); + float32x4_t v2082 = vld1q_f32((const float32_t *)v1736); + float32x4_t v2084 = vld1q_f32((const float32_t *)v1745); + float32x4_t v2086 = vld1q_f32((const float32_t *)v1754); + float32x4_t v2088 = vld1q_f32((const float32_t *)v1763); + float32x4_t v2090 = vld1q_f32((const float32_t *)v1772); + float32x4_t v2092 = vld1q_f32((const float32_t *)v1781); + float32x4_t v35 = vaddq_f32(v2038, v2040); + float32x4_t v36 = vsubq_f32(v2038, v2040); + float32x4_t v51 
= vaddq_f32(v2042, v2044); + float32x4_t v52 = vsubq_f32(v2042, v2044); + float32x4_t v69 = vaddq_f32(v2046, v2048); + float32x4_t v70 = vsubq_f32(v2046, v2048); + float32x4_t v85 = vaddq_f32(v2050, v2052); + float32x4_t v86 = vsubq_f32(v2050, v2052); + float32x4_t v103 = vaddq_f32(v2054, v2056); + float32x4_t v104 = vsubq_f32(v2054, v2056); + float32x4_t v119 = vaddq_f32(v2058, v2060); + float32x4_t v120 = vsubq_f32(v2058, v2060); + float32x4_t v137 = vaddq_f32(v2062, v2064); + float32x4_t v138 = vsubq_f32(v2062, v2064); + float32x4_t v153 = vaddq_f32(v2066, v2068); + float32x4_t v154 = vsubq_f32(v2066, v2068); + float32x4_t v171 = vaddq_f32(v2070, v2072); + float32x4_t v172 = vsubq_f32(v2070, v2072); + float32x4_t v187 = vaddq_f32(v2074, v2076); + float32x4_t v188 = vsubq_f32(v2074, v2076); + float32x4_t v205 = vaddq_f32(v2078, v2080); + float32x4_t v206 = vsubq_f32(v2078, v2080); + float32x4_t v221 = vaddq_f32(v2082, v2084); + float32x4_t v222 = vsubq_f32(v2082, v2084); + float32x4_t v239 = vaddq_f32(v2086, v2088); + float32x4_t v240 = vsubq_f32(v2086, v2088); + float32x4_t v255 = vaddq_f32(v2090, v2092); + float32x4_t v256 = vsubq_f32(v2090, v2092); + float32x4_t v53 = vaddq_f32(v35, v51); + float32x4_t v54 = vsubq_f32(v35, v51); + float32x4_t v87 = vaddq_f32(v69, v85); + float32x4_t v88 = vsubq_f32(v69, v85); + float32x4_t v121 = vaddq_f32(v103, v119); + float32x4_t v122 = vsubq_f32(v103, v119); + float32x4_t v155 = vaddq_f32(v137, v153); + float32x4_t v156 = vsubq_f32(v137, v153); + float32x4_t v189 = vaddq_f32(v171, v187); + float32x4_t v190 = vsubq_f32(v171, v187); + float32x4_t v223 = vaddq_f32(v205, v221); + float32x4_t v224 = vsubq_f32(v205, v221); + float32x4_t v257 = vaddq_f32(v239, v255); + float32x4_t v258 = vsubq_f32(v239, v255); + float32x4_t v445 = vaddq_f32(v70, v240); + float32x4_t v446 = vsubq_f32(v70, v240); + float32x4_t v447 = vaddq_f32(v172, v138); + float32x4_t v448 = vsubq_f32(v172, v138); + float32x4_t v449 = vaddq_f32(v104, v206); + 
float32x4_t v450 = vsubq_f32(v104, v206); + float32x4_t v538 = vaddq_f32(v86, v256); + float32x4_t v539 = vsubq_f32(v86, v256); + float32x4_t v540 = vaddq_f32(v188, v154); + float32x4_t v541 = vsubq_f32(v188, v154); + float32x4_t v542 = vaddq_f32(v120, v222); + float32x4_t v543 = vsubq_f32(v120, v222); + float32x4_t v259 = vaddq_f32(v87, v257); + float32x4_t v260 = vsubq_f32(v87, v257); + float32x4_t v261 = vaddq_f32(v189, v155); + float32x4_t v262 = vsubq_f32(v189, v155); + float32x4_t v263 = vaddq_f32(v121, v223); + float32x4_t v264 = vsubq_f32(v121, v223); + float32x4_t v352 = vaddq_f32(v88, v258); + float32x4_t v353 = vsubq_f32(v88, v258); + float32x4_t v354 = vaddq_f32(v190, v156); + float32x4_t v355 = vsubq_f32(v190, v156); + float32x4_t v356 = vaddq_f32(v122, v224); + float32x4_t v357 = vsubq_f32(v122, v224); + float32x4_t v451 = vaddq_f32(v445, v447); + float32x4_t v454 = vsubq_f32(v445, v447); + float32x4_t v455 = vsubq_f32(v447, v449); + float32x4_t v456 = vsubq_f32(v449, v445); + float32x4_t v457 = vaddq_f32(v446, v448); + float32x4_t v459 = vsubq_f32(v446, v448); + float32x4_t v460 = vsubq_f32(v448, v450); + float32x4_t v461 = vsubq_f32(v450, v446); + float32x4_t v544 = vaddq_f32(v538, v540); + float32x4_t v547 = vsubq_f32(v538, v540); + float32x4_t v548 = vsubq_f32(v540, v542); + float32x4_t v549 = vsubq_f32(v542, v538); + float32x4_t v550 = vaddq_f32(v539, v541); + float32x4_t v552 = vsubq_f32(v539, v541); + float32x4_t v553 = vsubq_f32(v541, v543); + float32x4_t v554 = vsubq_f32(v543, v539); + float32x4_t v265 = vaddq_f32(v259, v261); + float32x4_t v268 = vsubq_f32(v259, v261); + float32x4_t v269 = vsubq_f32(v261, v263); + float32x4_t v270 = vsubq_f32(v263, v259); + float32x4_t v271 = vaddq_f32(v260, v262); + float32x4_t v273 = vsubq_f32(v260, v262); + float32x4_t v274 = vsubq_f32(v262, v264); + float32x4_t v275 = vsubq_f32(v264, v260); + float32x4_t v358 = vaddq_f32(v352, v354); + float32x4_t v361 = vsubq_f32(v352, v354); + float32x4_t v362 = 
vsubq_f32(v354, v356); + float32x4_t v363 = vsubq_f32(v356, v352); + float32x4_t v364 = vaddq_f32(v353, v355); + float32x4_t v366 = vsubq_f32(v353, v355); + float32x4_t v367 = vsubq_f32(v355, v357); + float32x4_t v368 = vsubq_f32(v357, v353); + float32x4_t v452 = vaddq_f32(v451, v449); + float32x4_t v458 = vaddq_f32(v457, v450); + float32x4_t v476 = vmulq_f32(v454, v475); + float32x4_t v481 = vmulq_f32(v455, v480); + float32x4_t v486 = vmulq_f32(v456, v485); + float32x4_t v500 = vrev64q_f32(v459); + float32x4_t v508 = vrev64q_f32(v460); + float32x4_t v516 = vrev64q_f32(v461); + float32x4_t v545 = vaddq_f32(v544, v542); + float32x4_t v551 = vaddq_f32(v550, v543); + float32x4_t v576 = vrev64q_f32(v547); + float32x4_t v584 = vrev64q_f32(v548); + float32x4_t v592 = vrev64q_f32(v549); + float32x4_t v604 = vmulq_f32(v552, v603); + float32x4_t v609 = vmulq_f32(v553, v608); + float32x4_t v614 = vmulq_f32(v554, v613); + float32x4_t v266 = vaddq_f32(v265, v263); + float32x4_t v272 = vaddq_f32(v271, v264); + float32x4_t v290 = vmulq_f32(v268, v475); + float32x4_t v295 = vmulq_f32(v269, v480); + float32x4_t v300 = vmulq_f32(v270, v485); + float32x4_t v314 = vrev64q_f32(v273); + float32x4_t v322 = vrev64q_f32(v274); + float32x4_t v330 = vrev64q_f32(v275); + float32x4_t v359 = vaddq_f32(v358, v356); + float32x4_t v365 = vaddq_f32(v364, v357); + float32x4_t v383 = vmulq_f32(v361, v475); + float32x4_t v388 = vmulq_f32(v362, v480); + float32x4_t v393 = vmulq_f32(v363, v485); + float32x4_t v407 = vrev64q_f32(v366); + float32x4_t v415 = vrev64q_f32(v367); + float32x4_t v423 = vrev64q_f32(v368); + float32x4_t v453 = vaddq_f32(v452, v36); + float32x4_t v471 = vmulq_f32(v452, v470); + float32x4_t v492 = vrev64q_f32(v458); + float32x4_t v502 = vmulq_f32(v500, v501); + float32x4_t v510 = vmulq_f32(v508, v509); + float32x4_t v518 = vmulq_f32(v516, v517); + float32x4_t v546 = vaddq_f32(v545, v52); + float32x4_t v568 = vrev64q_f32(v545); + float32x4_t v578 = vmulq_f32(v576, v577); + 
float32x4_t v586 = vmulq_f32(v584, v585); + float32x4_t v594 = vmulq_f32(v592, v593); + float32x4_t v599 = vmulq_f32(v551, v598); + float32x4_t v267 = vaddq_f32(v266, v53); + float32x4_t v285 = vmulq_f32(v266, v470); + float32x4_t v306 = vrev64q_f32(v272); + float32x4_t v316 = vmulq_f32(v314, v501); + float32x4_t v324 = vmulq_f32(v322, v509); + float32x4_t v332 = vmulq_f32(v330, v517); + float32x4_t v360 = vaddq_f32(v359, v54); + float32x4_t v378 = vmulq_f32(v359, v470); + float32x4_t v399 = vrev64q_f32(v365); + float32x4_t v409 = vmulq_f32(v407, v501); + float32x4_t v417 = vmulq_f32(v415, v509); + float32x4_t v425 = vmulq_f32(v423, v517); + float32x4_t v494 = vmulq_f32(v492, v493); + float32x4_t v519 = vaddq_f32(v453, v471); + float32x4_t v560 = vrev64q_f32(v546); + float32x4_t v570 = vmulq_f32(v568, v569); + float32x4_t v622 = vaddq_f32(v599, v604); + float32x4_t v624 = vsubq_f32(v599, v604); + float32x4_t v626 = vsubq_f32(v599, v609); + float32x4_t v308 = vmulq_f32(v306, v493); + float32x4_t v333 = vaddq_f32(v267, v285); + float32x4_t v401 = vmulq_f32(v399, v493); + float32x4_t v426 = vaddq_f32(v360, v378); + float32x4_t v520 = vaddq_f32(v519, v476); + float32x4_t v522 = vsubq_f32(v519, v476); + float32x4_t v524 = vsubq_f32(v519, v481); + float32x4_t v526 = vaddq_f32(v494, v502); + float32x4_t v528 = vsubq_f32(v494, v502); + float32x4_t v530 = vsubq_f32(v494, v510); + float32x4_t v562 = vmulq_f32(v560, v561); + float32x4_t v623 = vaddq_f32(v622, v609); + float32x4_t v625 = vsubq_f32(v624, v614); + float32x4_t v627 = vaddq_f32(v626, v614); + vst1q_f32((float32_t *)v1791, v267); + vst1q_f32((float32_t *)v1809, v360); + float32x4_t v334 = vaddq_f32(v333, v290); + float32x4_t v336 = vsubq_f32(v333, v290); + float32x4_t v338 = vsubq_f32(v333, v295); + float32x4_t v340 = vaddq_f32(v308, v316); + float32x4_t v342 = vsubq_f32(v308, v316); + float32x4_t v344 = vsubq_f32(v308, v324); + float32x4_t v427 = vaddq_f32(v426, v383); + float32x4_t v429 = vsubq_f32(v426, v383); + 
float32x4_t v431 = vsubq_f32(v426, v388); + float32x4_t v433 = vaddq_f32(v401, v409); + float32x4_t v435 = vsubq_f32(v401, v409); + float32x4_t v437 = vsubq_f32(v401, v417); + float32x4_t v521 = vaddq_f32(v520, v481); + float32x4_t v523 = vsubq_f32(v522, v486); + float32x4_t v525 = vaddq_f32(v524, v486); + float32x4_t v527 = vaddq_f32(v526, v510); + float32x4_t v529 = vsubq_f32(v528, v518); + float32x4_t v531 = vaddq_f32(v530, v518); + float32x4_t v615 = vaddq_f32(v562, v570); + float32x4_t v634 = vaddq_f32(v453, v562); + float32x4_t v635 = vsubq_f32(v453, v562); + float32x4_t v335 = vaddq_f32(v334, v295); + float32x4_t v337 = vsubq_f32(v336, v300); + float32x4_t v339 = vaddq_f32(v338, v300); + float32x4_t v341 = vaddq_f32(v340, v324); + float32x4_t v343 = vsubq_f32(v342, v332); + float32x4_t v345 = vaddq_f32(v344, v332); + float32x4_t v428 = vaddq_f32(v427, v388); + float32x4_t v430 = vsubq_f32(v429, v393); + float32x4_t v432 = vaddq_f32(v431, v393); + float32x4_t v434 = vaddq_f32(v433, v417); + float32x4_t v436 = vsubq_f32(v435, v425); + float32x4_t v438 = vaddq_f32(v437, v425); + float32x4_t v532 = vaddq_f32(v521, v527); + float32x4_t v533 = vsubq_f32(v521, v527); + float32x4_t v534 = vaddq_f32(v523, v529); + float32x4_t v535 = vsubq_f32(v523, v529); + float32x4_t v536 = vaddq_f32(v525, v531); + float32x4_t v537 = vsubq_f32(v525, v531); + float32x4_t v616 = vaddq_f32(v615, v578); + float32x4_t v618 = vsubq_f32(v615, v578); + float32x4_t v620 = vsubq_f32(v615, v586); + vst1q_f32((float32_t *)v1800, v635); + vst1q_f32((float32_t *)v1818, v634); + float32x4_t v346 = vaddq_f32(v335, v341); + float32x4_t v347 = vsubq_f32(v335, v341); + float32x4_t v348 = vaddq_f32(v337, v343); + float32x4_t v349 = vsubq_f32(v337, v343); + float32x4_t v350 = vaddq_f32(v339, v345); + float32x4_t v351 = vsubq_f32(v339, v345); + float32x4_t v439 = vaddq_f32(v428, v434); + float32x4_t v440 = vsubq_f32(v428, v434); + float32x4_t v441 = vaddq_f32(v430, v436); + float32x4_t v442 = 
vsubq_f32(v430, v436); + float32x4_t v443 = vaddq_f32(v432, v438); + float32x4_t v444 = vsubq_f32(v432, v438); + float32x4_t v617 = vaddq_f32(v616, v586); + float32x4_t v619 = vsubq_f32(v618, v594); + float32x4_t v621 = vaddq_f32(v620, v594); + float32x4_t v628 = vaddq_f32(v617, v623); + float32x4_t v629 = vsubq_f32(v617, v623); + float32x4_t v630 = vaddq_f32(v619, v625); + float32x4_t v631 = vsubq_f32(v619, v625); + float32x4_t v632 = vaddq_f32(v621, v627); + float32x4_t v633 = vsubq_f32(v621, v627); + vst1q_f32((float32_t *)v1827, v347); + vst1q_f32((float32_t *)v1845, v440); + vst1q_f32((float32_t *)v1863, v349); + vst1q_f32((float32_t *)v1881, v442); + vst1q_f32((float32_t *)v1899, v350); + vst1q_f32((float32_t *)v1917, v443); + vst1q_f32((float32_t *)v1935, v351); + vst1q_f32((float32_t *)v1953, v444); + vst1q_f32((float32_t *)v1971, v348); + vst1q_f32((float32_t *)v1989, v441); + vst1q_f32((float32_t *)v2007, v346); + vst1q_f32((float32_t *)v2025, v439); + float32x4_t v664 = vaddq_f32(v533, v629); + float32x4_t v665 = vsubq_f32(v533, v629); + float32x4_t v694 = vaddq_f32(v535, v631); + float32x4_t v695 = vsubq_f32(v535, v631); + float32x4_t v724 = vaddq_f32(v536, v632); + float32x4_t v725 = vsubq_f32(v536, v632); + float32x4_t v754 = vaddq_f32(v537, v633); + float32x4_t v755 = vsubq_f32(v537, v633); + float32x4_t v784 = vaddq_f32(v534, v630); + float32x4_t v785 = vsubq_f32(v534, v630); + float32x4_t v814 = vaddq_f32(v532, v628); + float32x4_t v815 = vsubq_f32(v532, v628); + vst1q_f32((float32_t *)v1836, v665); + vst1q_f32((float32_t *)v1854, v664); + vst1q_f32((float32_t *)v1872, v695); + vst1q_f32((float32_t *)v1890, v694); + vst1q_f32((float32_t *)v1908, v725); + vst1q_f32((float32_t *)v1926, v724); + vst1q_f32((float32_t *)v1944, v755); + vst1q_f32((float32_t *)v1962, v754); + vst1q_f32((float32_t *)v1980, v785); + vst1q_f32((float32_t *)v1998, v784); + vst1q_f32((float32_t *)v2016, v815); + vst1q_f32((float32_t *)v2034, v814); + v5 += 2 * 1; + v6 += 2 * 
1; + } + for (int j = v844 * 2; j < howmany; j += 1) { + float32x2_t v925 = v5[istride]; + float v1239 = 4.4095855184409838e-01F; + float v1246 = 3.4087293062393137e-01F; + float v1253 = -5.3396936033772524e-01F; + float v1260 = 8.7484229096165667e-01F; + float v1303 = 1.0000000000000000e+00F; + float v1304 = -1.0000000000000000e+00F; + float v1310 = -1.1666666666666665e+00F; + float v1311 = 1.1666666666666665e+00F; + float v1317 = 7.9015646852540022e-01F; + float v1318 = -7.9015646852540022e-01F; + float v1324 = 5.5854267289647742e-02F; + float v1325 = -5.5854267289647742e-02F; + float v1331 = 7.3430220123575241e-01F; + float v1332 = -7.3430220123575241e-01F; + float32x2_t v1334 = (float32x2_t){v4, v4}; + float v1339 = -4.4095855184409838e-01F; + float v1343 = -3.4087293062393137e-01F; + float v1347 = 5.3396936033772524e-01F; + float v1351 = -8.7484229096165667e-01F; + float32x2_t v856 = v5[0]; + float32x2_t v1225 = (float32x2_t){v1310, v1310}; + float32x2_t v1229 = (float32x2_t){v1317, v1317}; + float32x2_t v1233 = (float32x2_t){v1324, v1324}; + float32x2_t v1237 = (float32x2_t){v1331, v1331}; + float32x2_t v1241 = (float32x2_t){v1239, v1339}; + float32x2_t v1248 = (float32x2_t){v1246, v1343}; + float32x2_t v1255 = (float32x2_t){v1253, v1347}; + float32x2_t v1262 = (float32x2_t){v1260, v1351}; + float32x2_t v1305 = (float32x2_t){v1303, v1304}; + float32x2_t v1312 = (float32x2_t){v1310, v1311}; + float32x2_t v1319 = (float32x2_t){v1317, v1318}; + float32x2_t v1326 = (float32x2_t){v1324, v1325}; + float32x2_t v1333 = (float32x2_t){v1331, v1332}; + float32x2_t v1340 = (float32x2_t){v1339, v1339}; + float32x2_t v1344 = (float32x2_t){v1343, v1343}; + float32x2_t v1348 = (float32x2_t){v1347, v1347}; + float32x2_t v1352 = (float32x2_t){v1351, v1351}; + float32x2_t v861 = v5[istride * 14]; + float32x2_t v868 = v5[istride * 7]; + float32x2_t v873 = v5[istride * 21]; + float32x2_t v882 = v5[istride * 4]; + float32x2_t v887 = v5[istride * 18]; + float32x2_t v894 = 
v5[istride * 11]; + float32x2_t v899 = v5[istride * 25]; + float32x2_t v908 = v5[istride * 8]; + float32x2_t v913 = v5[istride * 22]; + float32x2_t v920 = v5[istride * 15]; + float32x2_t v934 = v5[istride * 12]; + float32x2_t v939 = v5[istride * 26]; + float32x2_t v946 = v5[istride * 19]; + float32x2_t v951 = v5[istride * 5]; + float32x2_t v960 = v5[istride * 16]; + float32x2_t v965 = v5[istride * 2]; + float32x2_t v972 = v5[istride * 23]; + float32x2_t v977 = v5[istride * 9]; + float32x2_t v986 = v5[istride * 20]; + float32x2_t v991 = v5[istride * 6]; + float32x2_t v998 = v5[istride * 27]; + float32x2_t v1003 = v5[istride * 13]; + float32x2_t v1012 = v5[istride * 24]; + float32x2_t v1017 = v5[istride * 10]; + float32x2_t v1024 = v5[istride * 3]; + float32x2_t v1029 = v5[istride * 17]; + float32x2_t v1243 = vmul_f32(v1334, v1241); + float32x2_t v1250 = vmul_f32(v1334, v1248); + float32x2_t v1257 = vmul_f32(v1334, v1255); + float32x2_t v1264 = vmul_f32(v1334, v1262); + float32x2_t v1307 = vmul_f32(v1334, v1305); + float32x2_t v1314 = vmul_f32(v1334, v1312); + float32x2_t v1321 = vmul_f32(v1334, v1319); + float32x2_t v1328 = vmul_f32(v1334, v1326); + float32x2_t v1335 = vmul_f32(v1334, v1333); + float32x2_t v862 = vadd_f32(v856, v861); + float32x2_t v863 = vsub_f32(v856, v861); + float32x2_t v874 = vadd_f32(v868, v873); + float32x2_t v875 = vsub_f32(v868, v873); + float32x2_t v888 = vadd_f32(v882, v887); + float32x2_t v889 = vsub_f32(v882, v887); + float32x2_t v900 = vadd_f32(v894, v899); + float32x2_t v901 = vsub_f32(v894, v899); + float32x2_t v914 = vadd_f32(v908, v913); + float32x2_t v915 = vsub_f32(v908, v913); + float32x2_t v926 = vadd_f32(v920, v925); + float32x2_t v927 = vsub_f32(v920, v925); + float32x2_t v940 = vadd_f32(v934, v939); + float32x2_t v941 = vsub_f32(v934, v939); + float32x2_t v952 = vadd_f32(v946, v951); + float32x2_t v953 = vsub_f32(v946, v951); + float32x2_t v966 = vadd_f32(v960, v965); + float32x2_t v967 = vsub_f32(v960, v965); + float32x2_t 
v978 = vadd_f32(v972, v977); + float32x2_t v979 = vsub_f32(v972, v977); + float32x2_t v992 = vadd_f32(v986, v991); + float32x2_t v993 = vsub_f32(v986, v991); + float32x2_t v1004 = vadd_f32(v998, v1003); + float32x2_t v1005 = vsub_f32(v998, v1003); + float32x2_t v1018 = vadd_f32(v1012, v1017); + float32x2_t v1019 = vsub_f32(v1012, v1017); + float32x2_t v1030 = vadd_f32(v1024, v1029); + float32x2_t v1031 = vsub_f32(v1024, v1029); + float32x2_t v876 = vadd_f32(v862, v874); + float32x2_t v877 = vsub_f32(v862, v874); + float32x2_t v902 = vadd_f32(v888, v900); + float32x2_t v903 = vsub_f32(v888, v900); + float32x2_t v928 = vadd_f32(v914, v926); + float32x2_t v929 = vsub_f32(v914, v926); + float32x2_t v954 = vadd_f32(v940, v952); + float32x2_t v955 = vsub_f32(v940, v952); + float32x2_t v980 = vadd_f32(v966, v978); + float32x2_t v981 = vsub_f32(v966, v978); + float32x2_t v1006 = vadd_f32(v992, v1004); + float32x2_t v1007 = vsub_f32(v992, v1004); + float32x2_t v1032 = vadd_f32(v1018, v1030); + float32x2_t v1033 = vsub_f32(v1018, v1030); + float32x2_t v1202 = vadd_f32(v889, v1019); + float32x2_t v1203 = vsub_f32(v889, v1019); + float32x2_t v1204 = vadd_f32(v967, v941); + float32x2_t v1205 = vsub_f32(v967, v941); + float32x2_t v1206 = vadd_f32(v915, v993); + float32x2_t v1207 = vsub_f32(v915, v993); + float32x2_t v1286 = vadd_f32(v901, v1031); + float32x2_t v1287 = vsub_f32(v901, v1031); + float32x2_t v1288 = vadd_f32(v979, v953); + float32x2_t v1289 = vsub_f32(v979, v953); + float32x2_t v1290 = vadd_f32(v927, v1005); + float32x2_t v1291 = vsub_f32(v927, v1005); + float32x2_t v1034 = vadd_f32(v902, v1032); + float32x2_t v1035 = vsub_f32(v902, v1032); + float32x2_t v1036 = vadd_f32(v980, v954); + float32x2_t v1037 = vsub_f32(v980, v954); + float32x2_t v1038 = vadd_f32(v928, v1006); + float32x2_t v1039 = vsub_f32(v928, v1006); + float32x2_t v1118 = vadd_f32(v903, v1033); + float32x2_t v1119 = vsub_f32(v903, v1033); + float32x2_t v1120 = vadd_f32(v981, v955); + float32x2_t v1121 
= vsub_f32(v981, v955); + float32x2_t v1122 = vadd_f32(v929, v1007); + float32x2_t v1123 = vsub_f32(v929, v1007); + float32x2_t v1208 = vadd_f32(v1202, v1204); + float32x2_t v1211 = vsub_f32(v1202, v1204); + float32x2_t v1212 = vsub_f32(v1204, v1206); + float32x2_t v1213 = vsub_f32(v1206, v1202); + float32x2_t v1214 = vadd_f32(v1203, v1205); + float32x2_t v1216 = vsub_f32(v1203, v1205); + float32x2_t v1217 = vsub_f32(v1205, v1207); + float32x2_t v1218 = vsub_f32(v1207, v1203); + float32x2_t v1292 = vadd_f32(v1286, v1288); + float32x2_t v1295 = vsub_f32(v1286, v1288); + float32x2_t v1296 = vsub_f32(v1288, v1290); + float32x2_t v1297 = vsub_f32(v1290, v1286); + float32x2_t v1298 = vadd_f32(v1287, v1289); + float32x2_t v1300 = vsub_f32(v1287, v1289); + float32x2_t v1301 = vsub_f32(v1289, v1291); + float32x2_t v1302 = vsub_f32(v1291, v1287); + float32x2_t v1040 = vadd_f32(v1034, v1036); + float32x2_t v1043 = vsub_f32(v1034, v1036); + float32x2_t v1044 = vsub_f32(v1036, v1038); + float32x2_t v1045 = vsub_f32(v1038, v1034); + float32x2_t v1046 = vadd_f32(v1035, v1037); + float32x2_t v1048 = vsub_f32(v1035, v1037); + float32x2_t v1049 = vsub_f32(v1037, v1039); + float32x2_t v1050 = vsub_f32(v1039, v1035); + float32x2_t v1124 = vadd_f32(v1118, v1120); + float32x2_t v1127 = vsub_f32(v1118, v1120); + float32x2_t v1128 = vsub_f32(v1120, v1122); + float32x2_t v1129 = vsub_f32(v1122, v1118); + float32x2_t v1130 = vadd_f32(v1119, v1121); + float32x2_t v1132 = vsub_f32(v1119, v1121); + float32x2_t v1133 = vsub_f32(v1121, v1123); + float32x2_t v1134 = vsub_f32(v1123, v1119); + float32x2_t v1209 = vadd_f32(v1208, v1206); + float32x2_t v1215 = vadd_f32(v1214, v1207); + float32x2_t v1230 = vmul_f32(v1211, v1229); + float32x2_t v1234 = vmul_f32(v1212, v1233); + float32x2_t v1238 = vmul_f32(v1213, v1237); + float32x2_t v1251 = vrev64_f32(v1216); + float32x2_t v1258 = vrev64_f32(v1217); + float32x2_t v1265 = vrev64_f32(v1218); + float32x2_t v1293 = vadd_f32(v1292, v1290); + float32x2_t 
v1299 = vadd_f32(v1298, v1291); + float32x2_t v1322 = vrev64_f32(v1295); + float32x2_t v1329 = vrev64_f32(v1296); + float32x2_t v1336 = vrev64_f32(v1297); + float32x2_t v1345 = vmul_f32(v1300, v1344); + float32x2_t v1349 = vmul_f32(v1301, v1348); + float32x2_t v1353 = vmul_f32(v1302, v1352); + float32x2_t v1041 = vadd_f32(v1040, v1038); + float32x2_t v1047 = vadd_f32(v1046, v1039); + float32x2_t v1062 = vmul_f32(v1043, v1229); + float32x2_t v1066 = vmul_f32(v1044, v1233); + float32x2_t v1070 = vmul_f32(v1045, v1237); + float32x2_t v1083 = vrev64_f32(v1048); + float32x2_t v1090 = vrev64_f32(v1049); + float32x2_t v1097 = vrev64_f32(v1050); + float32x2_t v1125 = vadd_f32(v1124, v1122); + float32x2_t v1131 = vadd_f32(v1130, v1123); + float32x2_t v1146 = vmul_f32(v1127, v1229); + float32x2_t v1150 = vmul_f32(v1128, v1233); + float32x2_t v1154 = vmul_f32(v1129, v1237); + float32x2_t v1167 = vrev64_f32(v1132); + float32x2_t v1174 = vrev64_f32(v1133); + float32x2_t v1181 = vrev64_f32(v1134); + float32x2_t v1210 = vadd_f32(v1209, v863); + float32x2_t v1226 = vmul_f32(v1209, v1225); + float32x2_t v1244 = vrev64_f32(v1215); + float32x2_t v1252 = vmul_f32(v1251, v1250); + float32x2_t v1259 = vmul_f32(v1258, v1257); + float32x2_t v1266 = vmul_f32(v1265, v1264); + float32x2_t v1294 = vadd_f32(v1293, v875); + float32x2_t v1315 = vrev64_f32(v1293); + float32x2_t v1323 = vmul_f32(v1322, v1321); + float32x2_t v1330 = vmul_f32(v1329, v1328); + float32x2_t v1337 = vmul_f32(v1336, v1335); + float32x2_t v1341 = vmul_f32(v1299, v1340); + float32x2_t v1042 = vadd_f32(v1041, v876); + float32x2_t v1058 = vmul_f32(v1041, v1225); + float32x2_t v1076 = vrev64_f32(v1047); + float32x2_t v1084 = vmul_f32(v1083, v1250); + float32x2_t v1091 = vmul_f32(v1090, v1257); + float32x2_t v1098 = vmul_f32(v1097, v1264); + float32x2_t v1126 = vadd_f32(v1125, v877); + float32x2_t v1142 = vmul_f32(v1125, v1225); + float32x2_t v1160 = vrev64_f32(v1131); + float32x2_t v1168 = vmul_f32(v1167, v1250); + 
float32x2_t v1175 = vmul_f32(v1174, v1257); + float32x2_t v1182 = vmul_f32(v1181, v1264); + float32x2_t v1245 = vmul_f32(v1244, v1243); + float32x2_t v1267 = vadd_f32(v1210, v1226); + float32x2_t v1308 = vrev64_f32(v1294); + float32x2_t v1316 = vmul_f32(v1315, v1314); + float32x2_t v1361 = vadd_f32(v1341, v1345); + float32x2_t v1363 = vsub_f32(v1341, v1345); + float32x2_t v1365 = vsub_f32(v1341, v1349); + float32x2_t v1077 = vmul_f32(v1076, v1243); + float32x2_t v1099 = vadd_f32(v1042, v1058); + float32x2_t v1161 = vmul_f32(v1160, v1243); + float32x2_t v1183 = vadd_f32(v1126, v1142); + float32x2_t v1268 = vadd_f32(v1267, v1230); + float32x2_t v1270 = vsub_f32(v1267, v1230); + float32x2_t v1272 = vsub_f32(v1267, v1234); + float32x2_t v1274 = vadd_f32(v1245, v1252); + float32x2_t v1276 = vsub_f32(v1245, v1252); + float32x2_t v1278 = vsub_f32(v1245, v1259); + float32x2_t v1309 = vmul_f32(v1308, v1307); + float32x2_t v1362 = vadd_f32(v1361, v1349); + float32x2_t v1364 = vsub_f32(v1363, v1353); + float32x2_t v1366 = vadd_f32(v1365, v1353); + v6[0] = v1042; + v6[ostride * 14] = v1126; + float32x2_t v1100 = vadd_f32(v1099, v1062); + float32x2_t v1102 = vsub_f32(v1099, v1062); + float32x2_t v1104 = vsub_f32(v1099, v1066); + float32x2_t v1106 = vadd_f32(v1077, v1084); + float32x2_t v1108 = vsub_f32(v1077, v1084); + float32x2_t v1110 = vsub_f32(v1077, v1091); + float32x2_t v1184 = vadd_f32(v1183, v1146); + float32x2_t v1186 = vsub_f32(v1183, v1146); + float32x2_t v1188 = vsub_f32(v1183, v1150); + float32x2_t v1190 = vadd_f32(v1161, v1168); + float32x2_t v1192 = vsub_f32(v1161, v1168); + float32x2_t v1194 = vsub_f32(v1161, v1175); + float32x2_t v1269 = vadd_f32(v1268, v1234); + float32x2_t v1271 = vsub_f32(v1270, v1238); + float32x2_t v1273 = vadd_f32(v1272, v1238); + float32x2_t v1275 = vadd_f32(v1274, v1259); + float32x2_t v1277 = vsub_f32(v1276, v1266); + float32x2_t v1279 = vadd_f32(v1278, v1266); + float32x2_t v1354 = vadd_f32(v1309, v1316); + float32x2_t v1373 = 
vadd_f32(v1210, v1309); + float32x2_t v1374 = vsub_f32(v1210, v1309); + float32x2_t v1101 = vadd_f32(v1100, v1066); + float32x2_t v1103 = vsub_f32(v1102, v1070); + float32x2_t v1105 = vadd_f32(v1104, v1070); + float32x2_t v1107 = vadd_f32(v1106, v1091); + float32x2_t v1109 = vsub_f32(v1108, v1098); + float32x2_t v1111 = vadd_f32(v1110, v1098); + float32x2_t v1185 = vadd_f32(v1184, v1150); + float32x2_t v1187 = vsub_f32(v1186, v1154); + float32x2_t v1189 = vadd_f32(v1188, v1154); + float32x2_t v1191 = vadd_f32(v1190, v1175); + float32x2_t v1193 = vsub_f32(v1192, v1182); + float32x2_t v1195 = vadd_f32(v1194, v1182); + float32x2_t v1280 = vadd_f32(v1269, v1275); + float32x2_t v1281 = vsub_f32(v1269, v1275); + float32x2_t v1282 = vadd_f32(v1271, v1277); + float32x2_t v1283 = vsub_f32(v1271, v1277); + float32x2_t v1284 = vadd_f32(v1273, v1279); + float32x2_t v1285 = vsub_f32(v1273, v1279); + float32x2_t v1355 = vadd_f32(v1354, v1323); + float32x2_t v1357 = vsub_f32(v1354, v1323); + float32x2_t v1359 = vsub_f32(v1354, v1330); + v6[ostride * 21] = v1374; + v6[ostride * 7] = v1373; + float32x2_t v1112 = vadd_f32(v1101, v1107); + float32x2_t v1113 = vsub_f32(v1101, v1107); + float32x2_t v1114 = vadd_f32(v1103, v1109); + float32x2_t v1115 = vsub_f32(v1103, v1109); + float32x2_t v1116 = vadd_f32(v1105, v1111); + float32x2_t v1117 = vsub_f32(v1105, v1111); + float32x2_t v1196 = vadd_f32(v1185, v1191); + float32x2_t v1197 = vsub_f32(v1185, v1191); + float32x2_t v1198 = vadd_f32(v1187, v1193); + float32x2_t v1199 = vsub_f32(v1187, v1193); + float32x2_t v1200 = vadd_f32(v1189, v1195); + float32x2_t v1201 = vsub_f32(v1189, v1195); + float32x2_t v1356 = vadd_f32(v1355, v1330); + float32x2_t v1358 = vsub_f32(v1357, v1337); + float32x2_t v1360 = vadd_f32(v1359, v1337); + float32x2_t v1367 = vadd_f32(v1356, v1362); + float32x2_t v1368 = vsub_f32(v1356, v1362); + float32x2_t v1369 = vadd_f32(v1358, v1364); + float32x2_t v1370 = vsub_f32(v1358, v1364); + float32x2_t v1371 = 
vadd_f32(v1360, v1366); + float32x2_t v1372 = vsub_f32(v1360, v1366); + v6[ostride * 8] = v1113; + v6[ostride * 22] = v1197; + v6[ostride * 16] = v1115; + v6[ostride * 2] = v1199; + v6[ostride * 24] = v1116; + v6[ostride * 10] = v1200; + v6[ostride * 4] = v1117; + v6[ostride * 18] = v1201; + v6[ostride * 12] = v1114; + v6[ostride * 26] = v1198; + v6[ostride * 20] = v1112; + v6[ostride * 6] = v1196; + float32x2_t v1395 = vadd_f32(v1281, v1368); + float32x2_t v1396 = vsub_f32(v1281, v1368); + float32x2_t v1417 = vadd_f32(v1283, v1370); + float32x2_t v1418 = vsub_f32(v1283, v1370); + float32x2_t v1439 = vadd_f32(v1284, v1371); + float32x2_t v1440 = vsub_f32(v1284, v1371); + float32x2_t v1461 = vadd_f32(v1285, v1372); + float32x2_t v1462 = vsub_f32(v1285, v1372); + float32x2_t v1483 = vadd_f32(v1282, v1369); + float32x2_t v1484 = vsub_f32(v1282, v1369); + float32x2_t v1505 = vadd_f32(v1280, v1367); + float32x2_t v1506 = vsub_f32(v1280, v1367); + v6[ostride] = v1396; + v6[ostride * 15] = v1395; + v6[ostride * 9] = v1418; + v6[ostride * 23] = v1417; + v6[ostride * 17] = v1440; + v6[ostride * 3] = v1439; + v6[ostride * 25] = v1462; + v6[ostride * 11] = v1461; + v6[ostride * 5] = v1484; + v6[ostride * 19] = v1483; + v6[ostride * 13] = v1506; + v6[ostride * 27] = v1505; + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu28(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v457 = -1.1666666666666665e+00F; + float v462 = 7.9015646852540022e-01F; + float v467 = 5.5854267289647742e-02F; + 
float v472 = 7.3430220123575241e-01F; + float v541 = -1.0000000000000000e+00F; + float v548 = 1.1666666666666665e+00F; + float v555 = -7.9015646852540022e-01F; + float v562 = -5.5854267289647742e-02F; + float v569 = -7.3430220123575241e-01F; + float v576 = -4.4095855184409838e-01F; + float v581 = -3.4087293062393137e-01F; + float v586 = 5.3396936033772524e-01F; + float v591 = -8.7484229096165667e-01F; + const float32x2_t *v930 = &v5[v0]; + float32x2_t *v1165 = &v6[v2]; + int64_t v26 = v0 * 14; + int64_t v35 = v0 * 7; + int64_t v42 = v0 * 21; + int64_t v53 = v0 * 4; + int64_t v60 = v0 * 18; + int64_t v69 = v0 * 11; + int64_t v76 = v0 * 25; + int64_t v87 = v0 * 8; + int64_t v94 = v0 * 22; + int64_t v103 = v0 * 15; + int64_t v121 = v0 * 12; + int64_t v128 = v0 * 26; + int64_t v137 = v0 * 19; + int64_t v144 = v0 * 5; + int64_t v155 = v0 * 16; + int64_t v162 = v0 * 2; + int64_t v171 = v0 * 23; + int64_t v178 = v0 * 9; + int64_t v189 = v0 * 20; + int64_t v196 = v0 * 6; + int64_t v205 = v0 * 27; + int64_t v212 = v0 * 13; + int64_t v223 = v0 * 24; + int64_t v230 = v0 * 10; + int64_t v239 = v0 * 3; + int64_t v246 = v0 * 17; + float v480 = v4 * v576; + float v487 = v4 * v581; + float v494 = v4 * v586; + float v501 = v4 * v591; + float v544 = v4 * v541; + float v551 = v4 * v548; + float v558 = v4 * v555; + float v565 = v4 * v562; + float v572 = v4 * v569; + int64_t v624 = v2 * 21; + int64_t v631 = v2 * 14; + int64_t v638 = v2 * 7; + int64_t v647 = v2 * 8; + int64_t v661 = v2 * 22; + int64_t v668 = v2 * 15; + int64_t v677 = v2 * 16; + int64_t v684 = v2 * 9; + int64_t v691 = v2 * 2; + int64_t v698 = v2 * 23; + int64_t v707 = v2 * 24; + int64_t v714 = v2 * 17; + int64_t v721 = v2 * 10; + int64_t v728 = v2 * 3; + int64_t v737 = v2 * 4; + int64_t v744 = v2 * 25; + int64_t v751 = v2 * 18; + int64_t v758 = v2 * 11; + int64_t v767 = v2 * 12; + int64_t v774 = v2 * 5; + int64_t v781 = v2 * 26; + int64_t v788 = v2 * 19; + int64_t v797 = v2 * 20; + int64_t v804 = v2 * 13; + int64_t v811 
= v2 * 6; + int64_t v818 = v2 * 27; + const float32x2_t *v831 = &v5[0]; + svfloat32_t v1096 = svdup_n_f32(v457); + svfloat32_t v1097 = svdup_n_f32(v462); + svfloat32_t v1098 = svdup_n_f32(v467); + svfloat32_t v1099 = svdup_n_f32(v472); + svfloat32_t v1109 = svdup_n_f32(v576); + svfloat32_t v1110 = svdup_n_f32(v581); + svfloat32_t v1111 = svdup_n_f32(v586); + svfloat32_t v1112 = svdup_n_f32(v591); + float32x2_t *v1120 = &v6[0]; + svfloat32_t v1389 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v930)[0])); + const float32x2_t *v840 = &v5[v26]; + const float32x2_t *v849 = &v5[v35]; + const float32x2_t *v858 = &v5[v42]; + const float32x2_t *v867 = &v5[v53]; + const float32x2_t *v876 = &v5[v60]; + const float32x2_t *v885 = &v5[v69]; + const float32x2_t *v894 = &v5[v76]; + const float32x2_t *v903 = &v5[v87]; + const float32x2_t *v912 = &v5[v94]; + const float32x2_t *v921 = &v5[v103]; + const float32x2_t *v939 = &v5[v121]; + const float32x2_t *v948 = &v5[v128]; + const float32x2_t *v957 = &v5[v137]; + const float32x2_t *v966 = &v5[v144]; + const float32x2_t *v975 = &v5[v155]; + const float32x2_t *v984 = &v5[v162]; + const float32x2_t *v993 = &v5[v171]; + const float32x2_t *v1002 = &v5[v178]; + const float32x2_t *v1011 = &v5[v189]; + const float32x2_t *v1020 = &v5[v196]; + const float32x2_t *v1029 = &v5[v205]; + const float32x2_t *v1038 = &v5[v212]; + const float32x2_t *v1047 = &v5[v223]; + const float32x2_t *v1056 = &v5[v230]; + const float32x2_t *v1065 = &v5[v239]; + const float32x2_t *v1074 = &v5[v246]; + svfloat32_t v1100 = svdup_n_f32(v480); + svfloat32_t v1101 = svdup_n_f32(v487); + svfloat32_t v1102 = svdup_n_f32(v494); + svfloat32_t v1103 = svdup_n_f32(v501); + svfloat32_t v1104 = svdup_n_f32(v544); + svfloat32_t v1105 = svdup_n_f32(v551); + svfloat32_t v1106 = svdup_n_f32(v558); + svfloat32_t v1107 = svdup_n_f32(v565); + svfloat32_t v1108 = svdup_n_f32(v572); + float32x2_t *v1129 = &v6[v624]; + float32x2_t *v1138 = &v6[v631]; + float32x2_t *v1147 
= &v6[v638]; + float32x2_t *v1156 = &v6[v647]; + float32x2_t *v1174 = &v6[v661]; + float32x2_t *v1183 = &v6[v668]; + float32x2_t *v1192 = &v6[v677]; + float32x2_t *v1201 = &v6[v684]; + float32x2_t *v1210 = &v6[v691]; + float32x2_t *v1219 = &v6[v698]; + float32x2_t *v1228 = &v6[v707]; + float32x2_t *v1237 = &v6[v714]; + float32x2_t *v1246 = &v6[v721]; + float32x2_t *v1255 = &v6[v728]; + float32x2_t *v1264 = &v6[v737]; + float32x2_t *v1273 = &v6[v744]; + float32x2_t *v1282 = &v6[v751]; + float32x2_t *v1291 = &v6[v758]; + float32x2_t *v1300 = &v6[v767]; + float32x2_t *v1309 = &v6[v774]; + float32x2_t *v1318 = &v6[v781]; + float32x2_t *v1327 = &v6[v788]; + float32x2_t *v1336 = &v6[v797]; + float32x2_t *v1345 = &v6[v804]; + float32x2_t *v1354 = &v6[v811]; + float32x2_t *v1363 = &v6[v818]; + svfloat32_t v1367 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v831)[0])); + svfloat32_t v1369 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v840)[0])); + svfloat32_t v1371 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v849)[0])); + svfloat32_t v1373 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v858)[0])); + svfloat32_t v1375 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v867)[0])); + svfloat32_t v1377 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v876)[0])); + svfloat32_t v1379 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v885)[0])); + svfloat32_t v1381 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v894)[0])); + svfloat32_t v1383 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v903)[0])); + svfloat32_t v1385 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v912)[0])); + svfloat32_t v1387 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v921)[0])); + svfloat32_t v1391 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v939)[0])); + svfloat32_t v1393 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v948)[0])); + svfloat32_t v1395 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v957)[0])); + svfloat32_t v1397 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v966)[0])); + svfloat32_t v1399 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v975)[0])); + svfloat32_t v1401 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v984)[0])); + svfloat32_t v1403 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v993)[0])); + svfloat32_t v1405 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1002)[0])); + svfloat32_t v1407 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1011)[0])); + svfloat32_t v1409 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1020)[0])); + svfloat32_t v1411 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1029)[0])); + svfloat32_t v1413 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1038)[0])); + svfloat32_t v1415 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1047)[0])); + svfloat32_t v1417 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1056)[0])); + svfloat32_t v1419 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1065)[0])); + svfloat32_t v1421 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1074)[0])); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1367, v1369); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1367, v1369); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1371, v1373); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1371, v1373); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v1375, v1377); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v1375, v1377); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v1379, v1381); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v1379, v1381); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v1383, 
v1385); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v1383, v1385); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v1387, v1389); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v1387, v1389); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v1391, v1393); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v1391, v1393); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v1395, v1397); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v1395, v1397); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v1399, v1401); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v1399, v1401); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v1403, v1405); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v1403, v1405); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v1407, v1409); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v1407, v1409); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v1411, v1413); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v1411, v1413); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v1415, v1417); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v1415, v1417); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v1419, v1421); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v1419, v1421); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v202, v218); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v202, v218); + svfloat32_t v254 = svadd_f32_x(svptrue_b32(), v236, v252); + svfloat32_t 
v255 = svsub_f32_x(svptrue_b32(), v236, v252); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v67, v237); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v67, v237); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v169, v135); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v169, v135); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v101, v203); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v101, v203); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v83, v253); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v83, v253); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v185, v151); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v185, v151); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v117, v219); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v117, v219); + svfloat32_t v256 = svadd_f32_x(svptrue_b32(), v84, v254); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v84, v254); + svfloat32_t v258 = svadd_f32_x(svptrue_b32(), v186, v152); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v186, v152); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v118, v220); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v118, v220); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v85, v255); + svfloat32_t v346 = svsub_f32_x(svptrue_b32(), v85, v255); + svfloat32_t v347 = svadd_f32_x(svptrue_b32(), v187, v153); + svfloat32_t v348 = svsub_f32_x(svptrue_b32(), v187, v153); + svfloat32_t v349 = svadd_f32_x(svptrue_b32(), v119, v221); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v119, v221); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v438, v434); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v435, v437); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v435, v437); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v437, v439); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v439, v435); + 
svfloat32_t v529 = svadd_f32_x(svptrue_b32(), v523, v525); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v523, v525); + svfloat32_t v533 = svsub_f32_x(svptrue_b32(), v525, v527); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v527, v523); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v524, v526); + svfloat32_t v537 = svsub_f32_x(svptrue_b32(), v524, v526); + svfloat32_t v538 = svsub_f32_x(svptrue_b32(), v526, v528); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v528, v524); + svfloat32_t v262 = svadd_f32_x(svptrue_b32(), v256, v258); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v256, v258); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v258, v260); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v260, v256); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v257, v259); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v257, v259); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v259, v261); + svfloat32_t v272 = svsub_f32_x(svptrue_b32(), v261, v257); + svfloat32_t v351 = svadd_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v354 = svsub_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v349, v345); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v360 = svsub_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v350, v346); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v440, v438); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v446, v439); + svfloat32_t zero489 = svdup_n_f32(0); + svfloat32_t v489 = svcmla_f32_x(pred_full, zero489, v1101, v448, 90); + svfloat32_t zero496 = svdup_n_f32(0); + svfloat32_t v496 = svcmla_f32_x(pred_full, zero496, v1102, v449, 90); + svfloat32_t zero503 = svdup_n_f32(0); + svfloat32_t v503 = svcmla_f32_x(pred_full, zero503, v1103, v450, 90); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v529, v527); + svfloat32_t v536 
= svadd_f32_x(svptrue_b32(), v535, v528); + svfloat32_t zero560 = svdup_n_f32(0); + svfloat32_t v560 = svcmla_f32_x(pred_full, zero560, v1106, v532, 90); + svfloat32_t zero567 = svdup_n_f32(0); + svfloat32_t v567 = svcmla_f32_x(pred_full, zero567, v1107, v533, 90); + svfloat32_t zero574 = svdup_n_f32(0); + svfloat32_t v574 = svcmla_f32_x(pred_full, zero574, v1108, v534, 90); + svfloat32_t v584 = svmul_f32_x(svptrue_b32(), v537, v1110); + svfloat32_t v589 = svmul_f32_x(svptrue_b32(), v538, v1111); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v262, v260); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v268, v261); + svfloat32_t zero311 = svdup_n_f32(0); + svfloat32_t v311 = svcmla_f32_x(pred_full, zero311, v1101, v270, 90); + svfloat32_t zero318 = svdup_n_f32(0); + svfloat32_t v318 = svcmla_f32_x(pred_full, zero318, v1102, v271, 90); + svfloat32_t zero325 = svdup_n_f32(0); + svfloat32_t v325 = svcmla_f32_x(pred_full, zero325, v1103, v272, 90); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v351, v349); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v357, v350); + svfloat32_t zero400 = svdup_n_f32(0); + svfloat32_t v400 = svcmla_f32_x(pred_full, zero400, v1101, v359, 90); + svfloat32_t zero407 = svdup_n_f32(0); + svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v1102, v360, 90); + svfloat32_t zero414 = svdup_n_f32(0); + svfloat32_t v414 = svcmla_f32_x(pred_full, zero414, v1103, v361, 90); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v441, v33); + svfloat32_t zero482 = svdup_n_f32(0); + svfloat32_t v482 = svcmla_f32_x(pred_full, zero482, v1100, v447, 90); + svfloat32_t v531 = svadd_f32_x(svptrue_b32(), v530, v49); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v263, v50); + svfloat32_t zero304 = svdup_n_f32(0); + svfloat32_t v304 = svcmla_f32_x(pred_full, zero304, v1100, v269, 90); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v352, v51); + svfloat32_t zero393 = svdup_n_f32(0); + svfloat32_t v393 = svcmla_f32_x(pred_full, zero393, v1100, v358, 90); + 
svfloat32_t v504 = svmla_f32_x(pred_full, v442, v441, v1096); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v482, v489); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v482, v489); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v482, v496); + svfloat32_t zero546 = svdup_n_f32(0); + svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v1104, v531, 90); + svfloat32_t v602 = svmla_f32_x(pred_full, v584, v536, v1109); + svfloat32_t v604 = svnmls_f32_x(pred_full, v584, v536, v1109); + svfloat32_t v606 = svnmls_f32_x(pred_full, v589, v536, v1109); + svfloat32_t v326 = svmla_f32_x(pred_full, v264, v263, v1096); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v304, v318); + svfloat32_t v415 = svmla_f32_x(pred_full, v353, v352, v1096); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v393, v400); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v393, v400); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v393, v407); + svfloat32_t v505 = svmla_f32_x(pred_full, v504, v443, v1097); + svfloat32_t v507 = svmls_f32_x(pred_full, v504, v443, v1097); + svfloat32_t v509 = svmls_f32_x(pred_full, v504, v444, v1098); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v511, v496); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v513, v503); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v515, v503); + svfloat32_t v595 = svcmla_f32_x(pred_full, v546, v1105, v530, 90); + svfloat32_t v603 = svmla_f32_x(pred_full, v602, v538, v1111); + svfloat32_t v605 = svmls_f32_x(pred_full, v604, v539, v1112); + svfloat32_t v607 = svmla_f32_x(pred_full, v606, v539, v1112); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v442, v546); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v442, v546); + svst1_f64(pred_full, (double *)(v1120), svreinterpret_f64_f32(v264)); + svst1_f64(pred_full, (double *)(v1138), svreinterpret_f64_f32(v353)); + svfloat32_t v327 = svmla_f32_x(pred_full, v326, 
v265, v1097); + svfloat32_t v329 = svmls_f32_x(pred_full, v326, v265, v1097); + svfloat32_t v331 = svmls_f32_x(pred_full, v326, v266, v1098); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v333, v318); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v335, v325); + svfloat32_t v338 = svadd_f32_x(svptrue_b32(), v337, v325); + svfloat32_t v416 = svmla_f32_x(pred_full, v415, v354, v1097); + svfloat32_t v418 = svmls_f32_x(pred_full, v415, v354, v1097); + svfloat32_t v420 = svmls_f32_x(pred_full, v415, v355, v1098); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v407); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v424, v414); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v426, v414); + svfloat32_t v506 = svmla_f32_x(pred_full, v505, v444, v1098); + svfloat32_t v508 = svmls_f32_x(pred_full, v507, v445, v1099); + svfloat32_t v510 = svmla_f32_x(pred_full, v509, v445, v1099); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v595, v560); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v595, v560); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v595, v567); + svst1_f64(pred_full, (double *)(v1129), svreinterpret_f64_f32(v615)); + svst1_f64(pred_full, (double *)(v1147), svreinterpret_f64_f32(v614)); + svfloat32_t v328 = svmla_f32_x(pred_full, v327, v266, v1098); + svfloat32_t v330 = svmls_f32_x(pred_full, v329, v267, v1099); + svfloat32_t v332 = svmla_f32_x(pred_full, v331, v267, v1099); + svfloat32_t v417 = svmla_f32_x(pred_full, v416, v355, v1098); + svfloat32_t v419 = svmls_f32_x(pred_full, v418, v356, v1099); + svfloat32_t v421 = svmla_f32_x(pred_full, v420, v356, v1099); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v506, v512); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v506, v512); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v508, v514); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v508, v514); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v510, v516); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v510, v516); + svfloat32_t v597 = 
svadd_f32_x(svptrue_b32(), v596, v567); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v598, v574); + svfloat32_t v601 = svadd_f32_x(svptrue_b32(), v600, v574); + svfloat32_t v339 = svadd_f32_x(svptrue_b32(), v328, v334); + svfloat32_t v340 = svsub_f32_x(svptrue_b32(), v328, v334); + svfloat32_t v341 = svadd_f32_x(svptrue_b32(), v330, v336); + svfloat32_t v342 = svsub_f32_x(svptrue_b32(), v330, v336); + svfloat32_t v343 = svadd_f32_x(svptrue_b32(), v332, v338); + svfloat32_t v344 = svsub_f32_x(svptrue_b32(), v332, v338); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v417, v423); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v417, v423); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v419, v425); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v419, v425); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v421, v427); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v421, v427); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v597, v603); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v597, v603); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v599, v605); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v599, v605); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v601, v607); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v601, v607); + svfloat32_t v644 = svadd_f32_x(svptrue_b32(), v518, v609); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v518, v609); + svfloat32_t v674 = svadd_f32_x(svptrue_b32(), v520, v611); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v520, v611); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v521, v612); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v521, v612); + svfloat32_t v734 = svadd_f32_x(svptrue_b32(), v522, v613); + svfloat32_t v735 = svsub_f32_x(svptrue_b32(), v522, v613); + svfloat32_t v764 = svadd_f32_x(svptrue_b32(), v519, v610); + svfloat32_t v765 = svsub_f32_x(svptrue_b32(), v519, v610); + svfloat32_t v794 = svadd_f32_x(svptrue_b32(), v517, v608); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v517, v608); + 
svst1_f64(pred_full, (double *)(v1156), svreinterpret_f64_f32(v340)); + svst1_f64(pred_full, (double *)(v1174), svreinterpret_f64_f32(v429)); + svst1_f64(pred_full, (double *)(v1192), svreinterpret_f64_f32(v342)); + svst1_f64(pred_full, (double *)(v1210), svreinterpret_f64_f32(v431)); + svst1_f64(pred_full, (double *)(v1228), svreinterpret_f64_f32(v343)); + svst1_f64(pred_full, (double *)(v1246), svreinterpret_f64_f32(v432)); + svst1_f64(pred_full, (double *)(v1264), svreinterpret_f64_f32(v344)); + svst1_f64(pred_full, (double *)(v1282), svreinterpret_f64_f32(v433)); + svst1_f64(pred_full, (double *)(v1300), svreinterpret_f64_f32(v341)); + svst1_f64(pred_full, (double *)(v1318), svreinterpret_f64_f32(v430)); + svst1_f64(pred_full, (double *)(v1336), svreinterpret_f64_f32(v339)); + svst1_f64(pred_full, (double *)(v1354), svreinterpret_f64_f32(v428)); + svst1_f64(pred_full, (double *)(v1165), svreinterpret_f64_f32(v645)); + svst1_f64(pred_full, (double *)(v1183), svreinterpret_f64_f32(v644)); + svst1_f64(pred_full, (double *)(v1201), svreinterpret_f64_f32(v675)); + svst1_f64(pred_full, (double *)(v1219), svreinterpret_f64_f32(v674)); + svst1_f64(pred_full, (double *)(v1237), svreinterpret_f64_f32(v705)); + svst1_f64(pred_full, (double *)(v1255), svreinterpret_f64_f32(v704)); + svst1_f64(pred_full, (double *)(v1273), svreinterpret_f64_f32(v735)); + svst1_f64(pred_full, (double *)(v1291), svreinterpret_f64_f32(v734)); + svst1_f64(pred_full, (double *)(v1309), svreinterpret_f64_f32(v765)); + svst1_f64(pred_full, (double *)(v1327), svreinterpret_f64_f32(v764)); + svst1_f64(pred_full, (double *)(v1345), svreinterpret_f64_f32(v795)); + svst1_f64(pred_full, (double *)(v1363), svreinterpret_f64_f32(v794)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu30(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const 
float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v12 = howmany - 1; + int64_t v867 = howmany / 2; + for (int j = 0; j < v12; j += 2) { + float v488 = -1.2500000000000000e+00F; + float v493 = 5.5901699437494745e-01F; + float v497 = 1.5388417685876268e+00F; + float v498 = -1.5388417685876268e+00F; + float v505 = 5.8778525229247325e-01F; + float v506 = -5.8778525229247325e-01F; + float v513 = 3.6327126400268028e-01F; + float v514 = -3.6327126400268028e-01F; + float v539 = -1.4999999999999998e+00F; + float v544 = 1.8749999999999998e+00F; + float v549 = -8.3852549156242107e-01F; + float v553 = -2.3082626528814396e+00F; + float v554 = 2.3082626528814396e+00F; + float v561 = -8.8167787843870971e-01F; + float v562 = 8.8167787843870971e-01F; + float v569 = -5.4490689600402031e-01F; + float v570 = 5.4490689600402031e-01F; + float v594 = 8.6602540378443871e-01F; + float v595 = -8.6602540378443871e-01F; + float v602 = -1.0825317547305484e+00F; + float v603 = 1.0825317547305484e+00F; + float v610 = 4.8412291827592718e-01F; + float v611 = -4.8412291827592718e-01F; + float32x2_t v613 = (float32x2_t){v4, v4}; + float v619 = -1.3326760640014592e+00F; + float v624 = -5.0903696045512736e-01F; + float v629 = -3.1460214309120460e-01F; + const float32x2_t *v1693 = &v5[istride]; + float32x2_t *v1964 = &v6[ostride]; + float32x2_t v489 = (float32x2_t){v488, v488}; + float32x2_t v494 = (float32x2_t){v493, v493}; + float32x2_t v499 = (float32x2_t){v497, v498}; + float32x2_t v507 = (float32x2_t){v505, v506}; + float32x2_t v515 = (float32x2_t){v513, v514}; + float32x2_t v540 = (float32x2_t){v539, v539}; + float32x2_t v545 = (float32x2_t){v544, v544}; + float32x2_t v550 = (float32x2_t){v549, v549}; + float32x2_t v555 = (float32x2_t){v553, v554}; + float32x2_t v563 = (float32x2_t){v561, v562}; + float32x2_t v571 = (float32x2_t){v569, v570}; + float32x2_t v596 = (float32x2_t){v594, v595}; + float32x2_t v604 = (float32x2_t){v602, v603}; + float32x2_t v612 = 
(float32x2_t){v610, v611}; + float32x2_t v620 = (float32x2_t){v619, v619}; + float32x2_t v625 = (float32x2_t){v624, v624}; + float32x2_t v630 = (float32x2_t){v629, v629}; + const float32x2_t *v1576 = &v5[0]; + float32x2_t *v1847 = &v6[0]; + float32x4_t v2138 = vld1q_f32((const float32_t *)v1693); + float32x4_t v490 = vcombine_f32(v489, v489); + float32x4_t v495 = vcombine_f32(v494, v494); + float32x2_t v501 = vmul_f32(v613, v499); + float32x2_t v509 = vmul_f32(v613, v507); + float32x2_t v517 = vmul_f32(v613, v515); + float32x4_t v541 = vcombine_f32(v540, v540); + float32x4_t v546 = vcombine_f32(v545, v545); + float32x4_t v551 = vcombine_f32(v550, v550); + float32x2_t v557 = vmul_f32(v613, v555); + float32x2_t v565 = vmul_f32(v613, v563); + float32x2_t v573 = vmul_f32(v613, v571); + float32x2_t v598 = vmul_f32(v613, v596); + float32x2_t v606 = vmul_f32(v613, v604); + float32x2_t v614 = vmul_f32(v613, v612); + float32x4_t v621 = vcombine_f32(v620, v620); + float32x4_t v626 = vcombine_f32(v625, v625); + float32x4_t v631 = vcombine_f32(v630, v630); + const float32x2_t *v1585 = &v5[istride * 15]; + const float32x2_t *v1594 = &v5[istride * 6]; + const float32x2_t *v1603 = &v5[istride * 21]; + const float32x2_t *v1612 = &v5[istride * 12]; + const float32x2_t *v1621 = &v5[istride * 27]; + const float32x2_t *v1630 = &v5[istride * 18]; + const float32x2_t *v1639 = &v5[istride * 3]; + const float32x2_t *v1648 = &v5[istride * 24]; + const float32x2_t *v1657 = &v5[istride * 9]; + const float32x2_t *v1666 = &v5[istride * 10]; + const float32x2_t *v1675 = &v5[istride * 25]; + const float32x2_t *v1684 = &v5[istride * 16]; + const float32x2_t *v1702 = &v5[istride * 22]; + const float32x2_t *v1711 = &v5[istride * 7]; + const float32x2_t *v1720 = &v5[istride * 28]; + const float32x2_t *v1729 = &v5[istride * 13]; + const float32x2_t *v1738 = &v5[istride * 4]; + const float32x2_t *v1747 = &v5[istride * 19]; + const float32x2_t *v1756 = &v5[istride * 20]; + const float32x2_t *v1765 = 
&v5[istride * 5]; + const float32x2_t *v1774 = &v5[istride * 26]; + const float32x2_t *v1783 = &v5[istride * 11]; + const float32x2_t *v1792 = &v5[istride * 2]; + const float32x2_t *v1801 = &v5[istride * 17]; + const float32x2_t *v1810 = &v5[istride * 8]; + const float32x2_t *v1819 = &v5[istride * 23]; + const float32x2_t *v1828 = &v5[istride * 14]; + const float32x2_t *v1837 = &v5[istride * 29]; + float32x2_t *v1856 = &v6[ostride * 15]; + float32x2_t *v1865 = &v6[ostride * 6]; + float32x2_t *v1874 = &v6[ostride * 21]; + float32x2_t *v1883 = &v6[ostride * 12]; + float32x2_t *v1892 = &v6[ostride * 27]; + float32x2_t *v1901 = &v6[ostride * 18]; + float32x2_t *v1910 = &v6[ostride * 3]; + float32x2_t *v1919 = &v6[ostride * 24]; + float32x2_t *v1928 = &v6[ostride * 9]; + float32x2_t *v1937 = &v6[ostride * 10]; + float32x2_t *v1946 = &v6[ostride * 25]; + float32x2_t *v1955 = &v6[ostride * 16]; + float32x2_t *v1973 = &v6[ostride * 22]; + float32x2_t *v1982 = &v6[ostride * 7]; + float32x2_t *v1991 = &v6[ostride * 28]; + float32x2_t *v2000 = &v6[ostride * 13]; + float32x2_t *v2009 = &v6[ostride * 4]; + float32x2_t *v2018 = &v6[ostride * 19]; + float32x2_t *v2027 = &v6[ostride * 20]; + float32x2_t *v2036 = &v6[ostride * 5]; + float32x2_t *v2045 = &v6[ostride * 26]; + float32x2_t *v2054 = &v6[ostride * 11]; + float32x2_t *v2063 = &v6[ostride * 2]; + float32x2_t *v2072 = &v6[ostride * 17]; + float32x2_t *v2081 = &v6[ostride * 8]; + float32x2_t *v2090 = &v6[ostride * 23]; + float32x2_t *v2099 = &v6[ostride * 14]; + float32x2_t *v2108 = &v6[ostride * 29]; + float32x4_t v2112 = vld1q_f32((const float32_t *)v1576); + float32x4_t v503 = vcombine_f32(v501, v501); + float32x4_t v511 = vcombine_f32(v509, v509); + float32x4_t v519 = vcombine_f32(v517, v517); + float32x4_t v559 = vcombine_f32(v557, v557); + float32x4_t v567 = vcombine_f32(v565, v565); + float32x4_t v575 = vcombine_f32(v573, v573); + float32x4_t v600 = vcombine_f32(v598, v598); + float32x4_t v608 = vcombine_f32(v606, 
v606); + float32x4_t v616 = vcombine_f32(v614, v614); + float32x4_t v2114 = vld1q_f32((const float32_t *)v1585); + float32x4_t v2116 = vld1q_f32((const float32_t *)v1594); + float32x4_t v2118 = vld1q_f32((const float32_t *)v1603); + float32x4_t v2120 = vld1q_f32((const float32_t *)v1612); + float32x4_t v2122 = vld1q_f32((const float32_t *)v1621); + float32x4_t v2124 = vld1q_f32((const float32_t *)v1630); + float32x4_t v2126 = vld1q_f32((const float32_t *)v1639); + float32x4_t v2128 = vld1q_f32((const float32_t *)v1648); + float32x4_t v2130 = vld1q_f32((const float32_t *)v1657); + float32x4_t v2132 = vld1q_f32((const float32_t *)v1666); + float32x4_t v2134 = vld1q_f32((const float32_t *)v1675); + float32x4_t v2136 = vld1q_f32((const float32_t *)v1684); + float32x4_t v2140 = vld1q_f32((const float32_t *)v1702); + float32x4_t v2142 = vld1q_f32((const float32_t *)v1711); + float32x4_t v2144 = vld1q_f32((const float32_t *)v1720); + float32x4_t v2146 = vld1q_f32((const float32_t *)v1729); + float32x4_t v2148 = vld1q_f32((const float32_t *)v1738); + float32x4_t v2150 = vld1q_f32((const float32_t *)v1747); + float32x4_t v2152 = vld1q_f32((const float32_t *)v1756); + float32x4_t v2154 = vld1q_f32((const float32_t *)v1765); + float32x4_t v2156 = vld1q_f32((const float32_t *)v1774); + float32x4_t v2158 = vld1q_f32((const float32_t *)v1783); + float32x4_t v2160 = vld1q_f32((const float32_t *)v1792); + float32x4_t v2162 = vld1q_f32((const float32_t *)v1801); + float32x4_t v2164 = vld1q_f32((const float32_t *)v1810); + float32x4_t v2166 = vld1q_f32((const float32_t *)v1819); + float32x4_t v2168 = vld1q_f32((const float32_t *)v1828); + float32x4_t v2170 = vld1q_f32((const float32_t *)v1837); + float32x4_t v35 = vaddq_f32(v2112, v2114); + float32x4_t v36 = vsubq_f32(v2112, v2114); + float32x4_t v51 = vaddq_f32(v2116, v2118); + float32x4_t v52 = vsubq_f32(v2116, v2118); + float32x4_t v67 = vaddq_f32(v2120, v2122); + float32x4_t v68 = vsubq_f32(v2120, v2122); + float32x4_t v83 = 
vaddq_f32(v2124, v2126); + float32x4_t v84 = vsubq_f32(v2124, v2126); + float32x4_t v99 = vaddq_f32(v2128, v2130); + float32x4_t v100 = vsubq_f32(v2128, v2130); + float32x4_t v115 = vaddq_f32(v2132, v2134); + float32x4_t v116 = vsubq_f32(v2132, v2134); + float32x4_t v131 = vaddq_f32(v2136, v2138); + float32x4_t v132 = vsubq_f32(v2136, v2138); + float32x4_t v147 = vaddq_f32(v2140, v2142); + float32x4_t v148 = vsubq_f32(v2140, v2142); + float32x4_t v163 = vaddq_f32(v2144, v2146); + float32x4_t v164 = vsubq_f32(v2144, v2146); + float32x4_t v179 = vaddq_f32(v2148, v2150); + float32x4_t v180 = vsubq_f32(v2148, v2150); + float32x4_t v195 = vaddq_f32(v2152, v2154); + float32x4_t v196 = vsubq_f32(v2152, v2154); + float32x4_t v211 = vaddq_f32(v2156, v2158); + float32x4_t v212 = vsubq_f32(v2156, v2158); + float32x4_t v227 = vaddq_f32(v2160, v2162); + float32x4_t v228 = vsubq_f32(v2160, v2162); + float32x4_t v243 = vaddq_f32(v2164, v2166); + float32x4_t v244 = vsubq_f32(v2164, v2166); + float32x4_t v259 = vaddq_f32(v2168, v2170); + float32x4_t v260 = vsubq_f32(v2168, v2170); + float32x4_t v261 = vaddq_f32(v115, v195); + float32x4_t v262 = vsubq_f32(v115, v195); + float32x4_t v264 = vaddq_f32(v131, v211); + float32x4_t v265 = vsubq_f32(v131, v211); + float32x4_t v267 = vaddq_f32(v147, v227); + float32x4_t v268 = vsubq_f32(v147, v227); + float32x4_t v270 = vaddq_f32(v163, v243); + float32x4_t v271 = vsubq_f32(v163, v243); + float32x4_t v273 = vaddq_f32(v179, v259); + float32x4_t v274 = vsubq_f32(v179, v259); + float32x4_t v459 = vaddq_f32(v116, v196); + float32x4_t v460 = vsubq_f32(v116, v196); + float32x4_t v462 = vaddq_f32(v132, v212); + float32x4_t v463 = vsubq_f32(v132, v212); + float32x4_t v465 = vaddq_f32(v148, v228); + float32x4_t v466 = vsubq_f32(v148, v228); + float32x4_t v468 = vaddq_f32(v164, v244); + float32x4_t v469 = vsubq_f32(v164, v244); + float32x4_t v471 = vaddq_f32(v180, v260); + float32x4_t v472 = vsubq_f32(v180, v260); + float32x4_t v263 = vaddq_f32(v261, 
v35); + float32x4_t v266 = vaddq_f32(v264, v51); + float32x4_t v269 = vaddq_f32(v267, v67); + float32x4_t v272 = vaddq_f32(v270, v83); + float32x4_t v275 = vaddq_f32(v273, v99); + float32x4_t v332 = vaddq_f32(v264, v273); + float32x4_t v333 = vsubq_f32(v264, v273); + float32x4_t v334 = vaddq_f32(v270, v267); + float32x4_t v335 = vsubq_f32(v270, v267); + float32x4_t v388 = vaddq_f32(v265, v274); + float32x4_t v389 = vsubq_f32(v265, v274); + float32x4_t v390 = vaddq_f32(v271, v268); + float32x4_t v391 = vsubq_f32(v271, v268); + float32x4_t v461 = vaddq_f32(v459, v36); + float32x4_t v464 = vaddq_f32(v462, v52); + float32x4_t v467 = vaddq_f32(v465, v68); + float32x4_t v470 = vaddq_f32(v468, v84); + float32x4_t v473 = vaddq_f32(v471, v100); + float32x4_t v530 = vaddq_f32(v462, v471); + float32x4_t v531 = vsubq_f32(v462, v471); + float32x4_t v532 = vaddq_f32(v468, v465); + float32x4_t v533 = vsubq_f32(v468, v465); + float32x4_t v586 = vaddq_f32(v463, v472); + float32x4_t v587 = vsubq_f32(v463, v472); + float32x4_t v588 = vaddq_f32(v469, v466); + float32x4_t v589 = vsubq_f32(v469, v466); + float32x4_t v276 = vaddq_f32(v266, v275); + float32x4_t v277 = vsubq_f32(v266, v275); + float32x4_t v278 = vaddq_f32(v272, v269); + float32x4_t v279 = vsubq_f32(v272, v269); + float32x4_t v336 = vaddq_f32(v332, v334); + float32x4_t v337 = vsubq_f32(v332, v334); + float32x4_t v338 = vaddq_f32(v333, v335); + float32x4_t v360 = vrev64q_f32(v333); + float32x4_t v376 = vrev64q_f32(v335); + float32x4_t v392 = vaddq_f32(v388, v390); + float32x4_t v393 = vsubq_f32(v388, v390); + float32x4_t v394 = vaddq_f32(v389, v391); + float32x4_t v424 = vmulq_f32(v389, v621); + float32x4_t v434 = vmulq_f32(v391, v631); + float32x4_t v474 = vaddq_f32(v464, v473); + float32x4_t v475 = vsubq_f32(v464, v473); + float32x4_t v476 = vaddq_f32(v470, v467); + float32x4_t v477 = vsubq_f32(v470, v467); + float32x4_t v534 = vaddq_f32(v530, v532); + float32x4_t v535 = vsubq_f32(v530, v532); + float32x4_t v536 = 
vaddq_f32(v531, v533); + float32x4_t v558 = vrev64q_f32(v531); + float32x4_t v574 = vrev64q_f32(v533); + float32x4_t v590 = vaddq_f32(v586, v588); + float32x4_t v591 = vsubq_f32(v586, v588); + float32x4_t v592 = vaddq_f32(v587, v589); + float32x4_t v622 = vmulq_f32(v587, v621); + float32x4_t v632 = vmulq_f32(v589, v631); + float32x4_t v280 = vaddq_f32(v276, v278); + float32x4_t v281 = vsubq_f32(v276, v278); + float32x4_t v282 = vaddq_f32(v277, v279); + float32x4_t v304 = vrev64q_f32(v277); + float32x4_t v320 = vrev64q_f32(v279); + float32x4_t v339 = vaddq_f32(v336, v261); + float32x4_t v349 = vmulq_f32(v336, v546); + float32x4_t v354 = vmulq_f32(v337, v551); + float32x4_t v362 = vmulq_f32(v360, v559); + float32x4_t v368 = vrev64q_f32(v338); + float32x4_t v378 = vmulq_f32(v376, v575); + float32x4_t v395 = vaddq_f32(v392, v262); + float32x4_t v409 = vrev64q_f32(v392); + float32x4_t v417 = vrev64q_f32(v393); + float32x4_t v429 = vmulq_f32(v394, v626); + float32x4_t v478 = vaddq_f32(v474, v476); + float32x4_t v479 = vsubq_f32(v474, v476); + float32x4_t v480 = vaddq_f32(v475, v477); + float32x4_t v502 = vrev64q_f32(v475); + float32x4_t v518 = vrev64q_f32(v477); + float32x4_t v537 = vaddq_f32(v534, v459); + float32x4_t v547 = vmulq_f32(v534, v546); + float32x4_t v552 = vmulq_f32(v535, v551); + float32x4_t v560 = vmulq_f32(v558, v559); + float32x4_t v566 = vrev64q_f32(v536); + float32x4_t v576 = vmulq_f32(v574, v575); + float32x4_t v593 = vaddq_f32(v590, v460); + float32x4_t v607 = vrev64q_f32(v590); + float32x4_t v615 = vrev64q_f32(v591); + float32x4_t v627 = vmulq_f32(v592, v626); + float32x4_t v283 = vaddq_f32(v280, v263); + float32x4_t v293 = vmulq_f32(v280, v490); + float32x4_t v298 = vmulq_f32(v281, v495); + float32x4_t v306 = vmulq_f32(v304, v503); + float32x4_t v312 = vrev64q_f32(v282); + float32x4_t v322 = vmulq_f32(v320, v519); + float32x4_t v344 = vmulq_f32(v339, v541); + float32x4_t v370 = vmulq_f32(v368, v567); + float32x4_t v401 = vrev64q_f32(v395); + 
float32x4_t v411 = vmulq_f32(v409, v608); + float32x4_t v419 = vmulq_f32(v417, v616); + float32x4_t v438 = vsubq_f32(v424, v429); + float32x4_t v439 = vaddq_f32(v429, v434); + float32x4_t v481 = vaddq_f32(v478, v461); + float32x4_t v491 = vmulq_f32(v478, v490); + float32x4_t v496 = vmulq_f32(v479, v495); + float32x4_t v504 = vmulq_f32(v502, v503); + float32x4_t v510 = vrev64q_f32(v480); + float32x4_t v520 = vmulq_f32(v518, v519); + float32x4_t v542 = vmulq_f32(v537, v541); + float32x4_t v568 = vmulq_f32(v566, v567); + float32x4_t v599 = vrev64q_f32(v593); + float32x4_t v609 = vmulq_f32(v607, v608); + float32x4_t v617 = vmulq_f32(v615, v616); + float32x4_t v636 = vsubq_f32(v622, v627); + float32x4_t v637 = vaddq_f32(v627, v632); + float32x4_t v314 = vmulq_f32(v312, v511); + float32x4_t v323 = vaddq_f32(v283, v293); + float32x4_t v379 = vaddq_f32(v344, v349); + float32x4_t v382 = vsubq_f32(v362, v370); + float32x4_t v383 = vaddq_f32(v370, v378); + float32x4_t v403 = vmulq_f32(v401, v600); + float32x4_t v444 = vaddq_f32(v283, v344); + float32x4_t v512 = vmulq_f32(v510, v511); + float32x4_t v521 = vaddq_f32(v481, v491); + float32x4_t v577 = vaddq_f32(v542, v547); + float32x4_t v580 = vsubq_f32(v560, v568); + float32x4_t v581 = vaddq_f32(v568, v576); + float32x4_t v601 = vmulq_f32(v599, v600); + float32x4_t v642 = vaddq_f32(v481, v542); + vst1q_f32((float32_t *)v1847, v283); + vst1q_f32((float32_t *)v1856, v481); + float32x4_t v324 = vaddq_f32(v323, v298); + float32x4_t v325 = vsubq_f32(v323, v298); + float32x4_t v326 = vsubq_f32(v306, v314); + float32x4_t v327 = vaddq_f32(v314, v322); + float32x4_t v380 = vaddq_f32(v379, v354); + float32x4_t v381 = vsubq_f32(v379, v354); + float32x4_t v435 = vaddq_f32(v403, v411); + float32x4_t v445 = vaddq_f32(v444, v403); + float32x4_t v446 = vsubq_f32(v444, v403); + float32x4_t v522 = vaddq_f32(v521, v496); + float32x4_t v523 = vsubq_f32(v521, v496); + float32x4_t v524 = vsubq_f32(v504, v512); + float32x4_t v525 = vaddq_f32(v512, 
v520); + float32x4_t v578 = vaddq_f32(v577, v552); + float32x4_t v579 = vsubq_f32(v577, v552); + float32x4_t v633 = vaddq_f32(v601, v609); + float32x4_t v643 = vaddq_f32(v642, v601); + float32x4_t v644 = vsubq_f32(v642, v601); + float32x4_t v328 = vaddq_f32(v324, v326); + float32x4_t v329 = vsubq_f32(v324, v326); + float32x4_t v330 = vaddq_f32(v325, v327); + float32x4_t v331 = vsubq_f32(v325, v327); + float32x4_t v384 = vaddq_f32(v380, v382); + float32x4_t v385 = vsubq_f32(v380, v382); + float32x4_t v386 = vaddq_f32(v381, v383); + float32x4_t v387 = vsubq_f32(v381, v383); + float32x4_t v436 = vaddq_f32(v435, v419); + float32x4_t v437 = vsubq_f32(v435, v419); + float32x4_t v526 = vaddq_f32(v522, v524); + float32x4_t v527 = vsubq_f32(v522, v524); + float32x4_t v528 = vaddq_f32(v523, v525); + float32x4_t v529 = vsubq_f32(v523, v525); + float32x4_t v582 = vaddq_f32(v578, v580); + float32x4_t v583 = vsubq_f32(v578, v580); + float32x4_t v584 = vaddq_f32(v579, v581); + float32x4_t v585 = vsubq_f32(v579, v581); + float32x4_t v634 = vaddq_f32(v633, v617); + float32x4_t v635 = vsubq_f32(v633, v617); + vst1q_f32((float32_t *)v1937, v446); + vst1q_f32((float32_t *)v1946, v644); + vst1q_f32((float32_t *)v2027, v445); + vst1q_f32((float32_t *)v2036, v643); + float32x4_t v440 = vaddq_f32(v436, v438); + float32x4_t v441 = vsubq_f32(v436, v438); + float32x4_t v442 = vaddq_f32(v437, v439); + float32x4_t v443 = vsubq_f32(v437, v439); + float32x4_t v447 = vaddq_f32(v329, v385); + float32x4_t v450 = vaddq_f32(v331, v387); + float32x4_t v453 = vaddq_f32(v330, v386); + float32x4_t v456 = vaddq_f32(v328, v384); + float32x4_t v638 = vaddq_f32(v634, v636); + float32x4_t v639 = vsubq_f32(v634, v636); + float32x4_t v640 = vaddq_f32(v635, v637); + float32x4_t v641 = vsubq_f32(v635, v637); + float32x4_t v645 = vaddq_f32(v527, v583); + float32x4_t v648 = vaddq_f32(v529, v585); + float32x4_t v651 = vaddq_f32(v528, v584); + float32x4_t v654 = vaddq_f32(v526, v582); + vst1q_f32((float32_t *)v1865, 
v329); + vst1q_f32((float32_t *)v1874, v527); + vst1q_f32((float32_t *)v1883, v331); + vst1q_f32((float32_t *)v1892, v529); + vst1q_f32((float32_t *)v1901, v330); + vst1q_f32((float32_t *)v1910, v528); + vst1q_f32((float32_t *)v1919, v328); + vst1q_f32((float32_t *)v1928, v526); + float32x4_t v448 = vaddq_f32(v447, v441); + float32x4_t v449 = vsubq_f32(v447, v441); + float32x4_t v451 = vaddq_f32(v450, v443); + float32x4_t v452 = vsubq_f32(v450, v443); + float32x4_t v454 = vaddq_f32(v453, v442); + float32x4_t v455 = vsubq_f32(v453, v442); + float32x4_t v457 = vaddq_f32(v456, v440); + float32x4_t v458 = vsubq_f32(v456, v440); + float32x4_t v646 = vaddq_f32(v645, v639); + float32x4_t v647 = vsubq_f32(v645, v639); + float32x4_t v649 = vaddq_f32(v648, v641); + float32x4_t v650 = vsubq_f32(v648, v641); + float32x4_t v652 = vaddq_f32(v651, v640); + float32x4_t v653 = vsubq_f32(v651, v640); + float32x4_t v655 = vaddq_f32(v654, v638); + float32x4_t v656 = vsubq_f32(v654, v638); + vst1q_f32((float32_t *)v1955, v449); + vst1q_f32((float32_t *)v1964, v647); + vst1q_f32((float32_t *)v1973, v452); + vst1q_f32((float32_t *)v1982, v650); + vst1q_f32((float32_t *)v1991, v455); + vst1q_f32((float32_t *)v2000, v653); + vst1q_f32((float32_t *)v2009, v458); + vst1q_f32((float32_t *)v2018, v656); + vst1q_f32((float32_t *)v2045, v448); + vst1q_f32((float32_t *)v2054, v646); + vst1q_f32((float32_t *)v2063, v451); + vst1q_f32((float32_t *)v2072, v649); + vst1q_f32((float32_t *)v2081, v454); + vst1q_f32((float32_t *)v2090, v652); + vst1q_f32((float32_t *)v2099, v457); + vst1q_f32((float32_t *)v2108, v655); + v5 += 2 * 1; + v6 += 2 * 1; + } + for (int j = v867 * 2; j < howmany; j += 1) { + float32x2_t v956 = v5[istride]; + float v1263 = -1.2500000000000000e+00F; + float v1267 = 5.5901699437494745e-01F; + float v1270 = 1.5388417685876268e+00F; + float v1271 = -1.5388417685876268e+00F; + float v1277 = 5.8778525229247325e-01F; + float v1278 = -5.8778525229247325e-01F; + float v1284 = 
3.6327126400268028e-01F; + float v1285 = -3.6327126400268028e-01F; + float v1309 = -1.4999999999999998e+00F; + float v1313 = 1.8749999999999998e+00F; + float v1317 = -8.3852549156242107e-01F; + float v1320 = -2.3082626528814396e+00F; + float v1321 = 2.3082626528814396e+00F; + float v1327 = -8.8167787843870971e-01F; + float v1328 = 8.8167787843870971e-01F; + float v1334 = -5.4490689600402031e-01F; + float v1335 = 5.4490689600402031e-01F; + float v1358 = 8.6602540378443871e-01F; + float v1359 = -8.6602540378443871e-01F; + float v1365 = -1.0825317547305484e+00F; + float v1366 = 1.0825317547305484e+00F; + float v1372 = 4.8412291827592718e-01F; + float v1373 = -4.8412291827592718e-01F; + float32x2_t v1375 = (float32x2_t){v4, v4}; + float v1380 = -1.3326760640014592e+00F; + float v1384 = -5.0903696045512736e-01F; + float v1388 = -3.1460214309120460e-01F; + float32x2_t v879 = v5[0]; + float32x2_t v1264 = (float32x2_t){v1263, v1263}; + float32x2_t v1268 = (float32x2_t){v1267, v1267}; + float32x2_t v1272 = (float32x2_t){v1270, v1271}; + float32x2_t v1279 = (float32x2_t){v1277, v1278}; + float32x2_t v1286 = (float32x2_t){v1284, v1285}; + float32x2_t v1310 = (float32x2_t){v1309, v1309}; + float32x2_t v1314 = (float32x2_t){v1313, v1313}; + float32x2_t v1318 = (float32x2_t){v1317, v1317}; + float32x2_t v1322 = (float32x2_t){v1320, v1321}; + float32x2_t v1329 = (float32x2_t){v1327, v1328}; + float32x2_t v1336 = (float32x2_t){v1334, v1335}; + float32x2_t v1360 = (float32x2_t){v1358, v1359}; + float32x2_t v1367 = (float32x2_t){v1365, v1366}; + float32x2_t v1374 = (float32x2_t){v1372, v1373}; + float32x2_t v1381 = (float32x2_t){v1380, v1380}; + float32x2_t v1385 = (float32x2_t){v1384, v1384}; + float32x2_t v1389 = (float32x2_t){v1388, v1388}; + float32x2_t v884 = v5[istride * 15]; + float32x2_t v891 = v5[istride * 6]; + float32x2_t v896 = v5[istride * 21]; + float32x2_t v903 = v5[istride * 12]; + float32x2_t v908 = v5[istride * 27]; + float32x2_t v915 = v5[istride * 18]; + 
float32x2_t v920 = v5[istride * 3]; + float32x2_t v927 = v5[istride * 24]; + float32x2_t v932 = v5[istride * 9]; + float32x2_t v939 = v5[istride * 10]; + float32x2_t v944 = v5[istride * 25]; + float32x2_t v951 = v5[istride * 16]; + float32x2_t v963 = v5[istride * 22]; + float32x2_t v968 = v5[istride * 7]; + float32x2_t v975 = v5[istride * 28]; + float32x2_t v980 = v5[istride * 13]; + float32x2_t v987 = v5[istride * 4]; + float32x2_t v992 = v5[istride * 19]; + float32x2_t v999 = v5[istride * 20]; + float32x2_t v1004 = v5[istride * 5]; + float32x2_t v1011 = v5[istride * 26]; + float32x2_t v1016 = v5[istride * 11]; + float32x2_t v1023 = v5[istride * 2]; + float32x2_t v1028 = v5[istride * 17]; + float32x2_t v1035 = v5[istride * 8]; + float32x2_t v1040 = v5[istride * 23]; + float32x2_t v1047 = v5[istride * 14]; + float32x2_t v1052 = v5[istride * 29]; + float32x2_t v1274 = vmul_f32(v1375, v1272); + float32x2_t v1281 = vmul_f32(v1375, v1279); + float32x2_t v1288 = vmul_f32(v1375, v1286); + float32x2_t v1324 = vmul_f32(v1375, v1322); + float32x2_t v1331 = vmul_f32(v1375, v1329); + float32x2_t v1338 = vmul_f32(v1375, v1336); + float32x2_t v1362 = vmul_f32(v1375, v1360); + float32x2_t v1369 = vmul_f32(v1375, v1367); + float32x2_t v1376 = vmul_f32(v1375, v1374); + float32x2_t v885 = vadd_f32(v879, v884); + float32x2_t v886 = vsub_f32(v879, v884); + float32x2_t v897 = vadd_f32(v891, v896); + float32x2_t v898 = vsub_f32(v891, v896); + float32x2_t v909 = vadd_f32(v903, v908); + float32x2_t v910 = vsub_f32(v903, v908); + float32x2_t v921 = vadd_f32(v915, v920); + float32x2_t v922 = vsub_f32(v915, v920); + float32x2_t v933 = vadd_f32(v927, v932); + float32x2_t v934 = vsub_f32(v927, v932); + float32x2_t v945 = vadd_f32(v939, v944); + float32x2_t v946 = vsub_f32(v939, v944); + float32x2_t v957 = vadd_f32(v951, v956); + float32x2_t v958 = vsub_f32(v951, v956); + float32x2_t v969 = vadd_f32(v963, v968); + float32x2_t v970 = vsub_f32(v963, v968); + float32x2_t v981 = vadd_f32(v975, 
v980); + float32x2_t v982 = vsub_f32(v975, v980); + float32x2_t v993 = vadd_f32(v987, v992); + float32x2_t v994 = vsub_f32(v987, v992); + float32x2_t v1005 = vadd_f32(v999, v1004); + float32x2_t v1006 = vsub_f32(v999, v1004); + float32x2_t v1017 = vadd_f32(v1011, v1016); + float32x2_t v1018 = vsub_f32(v1011, v1016); + float32x2_t v1029 = vadd_f32(v1023, v1028); + float32x2_t v1030 = vsub_f32(v1023, v1028); + float32x2_t v1041 = vadd_f32(v1035, v1040); + float32x2_t v1042 = vsub_f32(v1035, v1040); + float32x2_t v1053 = vadd_f32(v1047, v1052); + float32x2_t v1054 = vsub_f32(v1047, v1052); + float32x2_t v1055 = vadd_f32(v945, v1005); + float32x2_t v1056 = vsub_f32(v945, v1005); + float32x2_t v1058 = vadd_f32(v957, v1017); + float32x2_t v1059 = vsub_f32(v957, v1017); + float32x2_t v1061 = vadd_f32(v969, v1029); + float32x2_t v1062 = vsub_f32(v969, v1029); + float32x2_t v1064 = vadd_f32(v981, v1041); + float32x2_t v1065 = vsub_f32(v981, v1041); + float32x2_t v1067 = vadd_f32(v993, v1053); + float32x2_t v1068 = vsub_f32(v993, v1053); + float32x2_t v1235 = vadd_f32(v946, v1006); + float32x2_t v1236 = vsub_f32(v946, v1006); + float32x2_t v1238 = vadd_f32(v958, v1018); + float32x2_t v1239 = vsub_f32(v958, v1018); + float32x2_t v1241 = vadd_f32(v970, v1030); + float32x2_t v1242 = vsub_f32(v970, v1030); + float32x2_t v1244 = vadd_f32(v982, v1042); + float32x2_t v1245 = vsub_f32(v982, v1042); + float32x2_t v1247 = vadd_f32(v994, v1054); + float32x2_t v1248 = vsub_f32(v994, v1054); + float32x2_t v1057 = vadd_f32(v1055, v885); + float32x2_t v1060 = vadd_f32(v1058, v897); + float32x2_t v1063 = vadd_f32(v1061, v909); + float32x2_t v1066 = vadd_f32(v1064, v921); + float32x2_t v1069 = vadd_f32(v1067, v933); + float32x2_t v1120 = vadd_f32(v1058, v1067); + float32x2_t v1121 = vsub_f32(v1058, v1067); + float32x2_t v1122 = vadd_f32(v1064, v1061); + float32x2_t v1123 = vsub_f32(v1064, v1061); + float32x2_t v1170 = vadd_f32(v1059, v1068); + float32x2_t v1171 = vsub_f32(v1059, v1068); + 
float32x2_t v1172 = vadd_f32(v1065, v1062); + float32x2_t v1173 = vsub_f32(v1065, v1062); + float32x2_t v1237 = vadd_f32(v1235, v886); + float32x2_t v1240 = vadd_f32(v1238, v898); + float32x2_t v1243 = vadd_f32(v1241, v910); + float32x2_t v1246 = vadd_f32(v1244, v922); + float32x2_t v1249 = vadd_f32(v1247, v934); + float32x2_t v1300 = vadd_f32(v1238, v1247); + float32x2_t v1301 = vsub_f32(v1238, v1247); + float32x2_t v1302 = vadd_f32(v1244, v1241); + float32x2_t v1303 = vsub_f32(v1244, v1241); + float32x2_t v1350 = vadd_f32(v1239, v1248); + float32x2_t v1351 = vsub_f32(v1239, v1248); + float32x2_t v1352 = vadd_f32(v1245, v1242); + float32x2_t v1353 = vsub_f32(v1245, v1242); + float32x2_t v1070 = vadd_f32(v1060, v1069); + float32x2_t v1071 = vsub_f32(v1060, v1069); + float32x2_t v1072 = vadd_f32(v1066, v1063); + float32x2_t v1073 = vsub_f32(v1066, v1063); + float32x2_t v1124 = vadd_f32(v1120, v1122); + float32x2_t v1125 = vsub_f32(v1120, v1122); + float32x2_t v1126 = vadd_f32(v1121, v1123); + float32x2_t v1145 = vrev64_f32(v1121); + float32x2_t v1159 = vrev64_f32(v1123); + float32x2_t v1174 = vadd_f32(v1170, v1172); + float32x2_t v1175 = vsub_f32(v1170, v1172); + float32x2_t v1176 = vadd_f32(v1171, v1173); + float32x2_t v1202 = vmul_f32(v1171, v1381); + float32x2_t v1210 = vmul_f32(v1173, v1389); + float32x2_t v1250 = vadd_f32(v1240, v1249); + float32x2_t v1251 = vsub_f32(v1240, v1249); + float32x2_t v1252 = vadd_f32(v1246, v1243); + float32x2_t v1253 = vsub_f32(v1246, v1243); + float32x2_t v1304 = vadd_f32(v1300, v1302); + float32x2_t v1305 = vsub_f32(v1300, v1302); + float32x2_t v1306 = vadd_f32(v1301, v1303); + float32x2_t v1325 = vrev64_f32(v1301); + float32x2_t v1339 = vrev64_f32(v1303); + float32x2_t v1354 = vadd_f32(v1350, v1352); + float32x2_t v1355 = vsub_f32(v1350, v1352); + float32x2_t v1356 = vadd_f32(v1351, v1353); + float32x2_t v1382 = vmul_f32(v1351, v1381); + float32x2_t v1390 = vmul_f32(v1353, v1389); + float32x2_t v1074 = vadd_f32(v1070, v1072); + 
float32x2_t v1075 = vsub_f32(v1070, v1072); + float32x2_t v1076 = vadd_f32(v1071, v1073); + float32x2_t v1095 = vrev64_f32(v1071); + float32x2_t v1109 = vrev64_f32(v1073); + float32x2_t v1127 = vadd_f32(v1124, v1055); + float32x2_t v1135 = vmul_f32(v1124, v1314); + float32x2_t v1139 = vmul_f32(v1125, v1318); + float32x2_t v1146 = vmul_f32(v1145, v1324); + float32x2_t v1152 = vrev64_f32(v1126); + float32x2_t v1160 = vmul_f32(v1159, v1338); + float32x2_t v1177 = vadd_f32(v1174, v1056); + float32x2_t v1190 = vrev64_f32(v1174); + float32x2_t v1197 = vrev64_f32(v1175); + float32x2_t v1206 = vmul_f32(v1176, v1385); + float32x2_t v1254 = vadd_f32(v1250, v1252); + float32x2_t v1255 = vsub_f32(v1250, v1252); + float32x2_t v1256 = vadd_f32(v1251, v1253); + float32x2_t v1275 = vrev64_f32(v1251); + float32x2_t v1289 = vrev64_f32(v1253); + float32x2_t v1307 = vadd_f32(v1304, v1235); + float32x2_t v1315 = vmul_f32(v1304, v1314); + float32x2_t v1319 = vmul_f32(v1305, v1318); + float32x2_t v1326 = vmul_f32(v1325, v1324); + float32x2_t v1332 = vrev64_f32(v1306); + float32x2_t v1340 = vmul_f32(v1339, v1338); + float32x2_t v1357 = vadd_f32(v1354, v1236); + float32x2_t v1370 = vrev64_f32(v1354); + float32x2_t v1377 = vrev64_f32(v1355); + float32x2_t v1386 = vmul_f32(v1356, v1385); + float32x2_t v1077 = vadd_f32(v1074, v1057); + float32x2_t v1085 = vmul_f32(v1074, v1264); + float32x2_t v1089 = vmul_f32(v1075, v1268); + float32x2_t v1096 = vmul_f32(v1095, v1274); + float32x2_t v1102 = vrev64_f32(v1076); + float32x2_t v1110 = vmul_f32(v1109, v1288); + float32x2_t v1131 = vmul_f32(v1127, v1310); + float32x2_t v1153 = vmul_f32(v1152, v1331); + float32x2_t v1183 = vrev64_f32(v1177); + float32x2_t v1191 = vmul_f32(v1190, v1369); + float32x2_t v1198 = vmul_f32(v1197, v1376); + float32x2_t v1214 = vsub_f32(v1202, v1206); + float32x2_t v1215 = vadd_f32(v1206, v1210); + float32x2_t v1257 = vadd_f32(v1254, v1237); + float32x2_t v1265 = vmul_f32(v1254, v1264); + float32x2_t v1269 = vmul_f32(v1255, 
v1268); + float32x2_t v1276 = vmul_f32(v1275, v1274); + float32x2_t v1282 = vrev64_f32(v1256); + float32x2_t v1290 = vmul_f32(v1289, v1288); + float32x2_t v1311 = vmul_f32(v1307, v1310); + float32x2_t v1333 = vmul_f32(v1332, v1331); + float32x2_t v1363 = vrev64_f32(v1357); + float32x2_t v1371 = vmul_f32(v1370, v1369); + float32x2_t v1378 = vmul_f32(v1377, v1376); + float32x2_t v1394 = vsub_f32(v1382, v1386); + float32x2_t v1395 = vadd_f32(v1386, v1390); + float32x2_t v1103 = vmul_f32(v1102, v1281); + float32x2_t v1111 = vadd_f32(v1077, v1085); + float32x2_t v1161 = vadd_f32(v1131, v1135); + float32x2_t v1164 = vsub_f32(v1146, v1153); + float32x2_t v1165 = vadd_f32(v1153, v1160); + float32x2_t v1184 = vmul_f32(v1183, v1362); + float32x2_t v1220 = vadd_f32(v1077, v1131); + float32x2_t v1283 = vmul_f32(v1282, v1281); + float32x2_t v1291 = vadd_f32(v1257, v1265); + float32x2_t v1341 = vadd_f32(v1311, v1315); + float32x2_t v1344 = vsub_f32(v1326, v1333); + float32x2_t v1345 = vadd_f32(v1333, v1340); + float32x2_t v1364 = vmul_f32(v1363, v1362); + float32x2_t v1400 = vadd_f32(v1257, v1311); + v6[0] = v1077; + v6[ostride * 15] = v1257; + float32x2_t v1112 = vadd_f32(v1111, v1089); + float32x2_t v1113 = vsub_f32(v1111, v1089); + float32x2_t v1114 = vsub_f32(v1096, v1103); + float32x2_t v1115 = vadd_f32(v1103, v1110); + float32x2_t v1162 = vadd_f32(v1161, v1139); + float32x2_t v1163 = vsub_f32(v1161, v1139); + float32x2_t v1211 = vadd_f32(v1184, v1191); + float32x2_t v1221 = vadd_f32(v1220, v1184); + float32x2_t v1222 = vsub_f32(v1220, v1184); + float32x2_t v1292 = vadd_f32(v1291, v1269); + float32x2_t v1293 = vsub_f32(v1291, v1269); + float32x2_t v1294 = vsub_f32(v1276, v1283); + float32x2_t v1295 = vadd_f32(v1283, v1290); + float32x2_t v1342 = vadd_f32(v1341, v1319); + float32x2_t v1343 = vsub_f32(v1341, v1319); + float32x2_t v1391 = vadd_f32(v1364, v1371); + float32x2_t v1401 = vadd_f32(v1400, v1364); + float32x2_t v1402 = vsub_f32(v1400, v1364); + float32x2_t v1116 = 
vadd_f32(v1112, v1114); + float32x2_t v1117 = vsub_f32(v1112, v1114); + float32x2_t v1118 = vadd_f32(v1113, v1115); + float32x2_t v1119 = vsub_f32(v1113, v1115); + float32x2_t v1166 = vadd_f32(v1162, v1164); + float32x2_t v1167 = vsub_f32(v1162, v1164); + float32x2_t v1168 = vadd_f32(v1163, v1165); + float32x2_t v1169 = vsub_f32(v1163, v1165); + float32x2_t v1212 = vadd_f32(v1211, v1198); + float32x2_t v1213 = vsub_f32(v1211, v1198); + float32x2_t v1296 = vadd_f32(v1292, v1294); + float32x2_t v1297 = vsub_f32(v1292, v1294); + float32x2_t v1298 = vadd_f32(v1293, v1295); + float32x2_t v1299 = vsub_f32(v1293, v1295); + float32x2_t v1346 = vadd_f32(v1342, v1344); + float32x2_t v1347 = vsub_f32(v1342, v1344); + float32x2_t v1348 = vadd_f32(v1343, v1345); + float32x2_t v1349 = vsub_f32(v1343, v1345); + float32x2_t v1392 = vadd_f32(v1391, v1378); + float32x2_t v1393 = vsub_f32(v1391, v1378); + v6[ostride * 10] = v1222; + v6[ostride * 25] = v1402; + v6[ostride * 20] = v1221; + v6[ostride * 5] = v1401; + float32x2_t v1216 = vadd_f32(v1212, v1214); + float32x2_t v1217 = vsub_f32(v1212, v1214); + float32x2_t v1218 = vadd_f32(v1213, v1215); + float32x2_t v1219 = vsub_f32(v1213, v1215); + float32x2_t v1223 = vadd_f32(v1117, v1167); + float32x2_t v1226 = vadd_f32(v1119, v1169); + float32x2_t v1229 = vadd_f32(v1118, v1168); + float32x2_t v1232 = vadd_f32(v1116, v1166); + float32x2_t v1396 = vadd_f32(v1392, v1394); + float32x2_t v1397 = vsub_f32(v1392, v1394); + float32x2_t v1398 = vadd_f32(v1393, v1395); + float32x2_t v1399 = vsub_f32(v1393, v1395); + float32x2_t v1403 = vadd_f32(v1297, v1347); + float32x2_t v1406 = vadd_f32(v1299, v1349); + float32x2_t v1409 = vadd_f32(v1298, v1348); + float32x2_t v1412 = vadd_f32(v1296, v1346); + v6[ostride * 6] = v1117; + v6[ostride * 21] = v1297; + v6[ostride * 12] = v1119; + v6[ostride * 27] = v1299; + v6[ostride * 18] = v1118; + v6[ostride * 3] = v1298; + v6[ostride * 24] = v1116; + v6[ostride * 9] = v1296; + float32x2_t v1224 = 
vadd_f32(v1223, v1217); + float32x2_t v1225 = vsub_f32(v1223, v1217); + float32x2_t v1227 = vadd_f32(v1226, v1219); + float32x2_t v1228 = vsub_f32(v1226, v1219); + float32x2_t v1230 = vadd_f32(v1229, v1218); + float32x2_t v1231 = vsub_f32(v1229, v1218); + float32x2_t v1233 = vadd_f32(v1232, v1216); + float32x2_t v1234 = vsub_f32(v1232, v1216); + float32x2_t v1404 = vadd_f32(v1403, v1397); + float32x2_t v1405 = vsub_f32(v1403, v1397); + float32x2_t v1407 = vadd_f32(v1406, v1399); + float32x2_t v1408 = vsub_f32(v1406, v1399); + float32x2_t v1410 = vadd_f32(v1409, v1398); + float32x2_t v1411 = vsub_f32(v1409, v1398); + float32x2_t v1413 = vadd_f32(v1412, v1396); + float32x2_t v1414 = vsub_f32(v1412, v1396); + v6[ostride * 16] = v1225; + v6[ostride] = v1405; + v6[ostride * 22] = v1228; + v6[ostride * 7] = v1408; + v6[ostride * 28] = v1231; + v6[ostride * 13] = v1411; + v6[ostride * 4] = v1234; + v6[ostride * 19] = v1414; + v6[ostride * 26] = v1224; + v6[ostride * 11] = v1404; + v6[ostride * 2] = v1227; + v6[ostride * 17] = v1407; + v6[ostride * 8] = v1230; + v6[ostride * 23] = v1410; + v6[ostride * 14] = v1233; + v6[ostride * 29] = v1413; + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu30(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v476 = -1.2500000000000000e+00F; + float v481 = 5.5901699437494745e-01F; + float v486 = -1.5388417685876268e+00F; + float v493 = -5.8778525229247325e-01F; + float v500 = -3.6327126400268028e-01F; + float v524 = -1.4999999999999998e+00F; + float 
v529 = 1.8749999999999998e+00F; + float v534 = -8.3852549156242107e-01F; + float v539 = 2.3082626528814396e+00F; + float v546 = 8.8167787843870971e-01F; + float v553 = 5.4490689600402031e-01F; + float v577 = -8.6602540378443871e-01F; + float v584 = 1.0825317547305484e+00F; + float v591 = -4.8412291827592718e-01F; + float v598 = -1.3326760640014592e+00F; + float v603 = -5.0903696045512736e-01F; + float v608 = -3.1460214309120460e-01F; + const float32x2_t *v970 = &v5[v0]; + float32x2_t *v1277 = &v6[v2]; + int64_t v26 = v0 * 15; + int64_t v35 = v0 * 6; + int64_t v42 = v0 * 21; + int64_t v51 = v0 * 12; + int64_t v58 = v0 * 27; + int64_t v67 = v0 * 18; + int64_t v74 = v0 * 3; + int64_t v83 = v0 * 24; + int64_t v90 = v0 * 9; + int64_t v99 = v0 * 10; + int64_t v106 = v0 * 25; + int64_t v115 = v0 * 16; + int64_t v131 = v0 * 22; + int64_t v138 = v0 * 7; + int64_t v147 = v0 * 28; + int64_t v154 = v0 * 13; + int64_t v163 = v0 * 4; + int64_t v170 = v0 * 19; + int64_t v179 = v0 * 20; + int64_t v186 = v0 * 5; + int64_t v195 = v0 * 26; + int64_t v202 = v0 * 11; + int64_t v211 = v0 * 2; + int64_t v218 = v0 * 17; + int64_t v227 = v0 * 8; + int64_t v234 = v0 * 23; + int64_t v243 = v0 * 14; + int64_t v250 = v0 * 29; + float v489 = v4 * v486; + float v496 = v4 * v493; + float v503 = v4 * v500; + float v542 = v4 * v539; + float v549 = v4 * v546; + float v556 = v4 * v553; + float v580 = v4 * v577; + float v587 = v4 * v584; + float v594 = v4 * v591; + int64_t v644 = v2 * 15; + int64_t v651 = v2 * 6; + int64_t v658 = v2 * 21; + int64_t v665 = v2 * 12; + int64_t v672 = v2 * 27; + int64_t v679 = v2 * 18; + int64_t v686 = v2 * 3; + int64_t v693 = v2 * 24; + int64_t v700 = v2 * 9; + int64_t v707 = v2 * 10; + int64_t v714 = v2 * 25; + int64_t v721 = v2 * 16; + int64_t v735 = v2 * 22; + int64_t v742 = v2 * 7; + int64_t v749 = v2 * 28; + int64_t v756 = v2 * 13; + int64_t v763 = v2 * 4; + int64_t v770 = v2 * 19; + int64_t v777 = v2 * 20; + int64_t v784 = v2 * 5; + int64_t v791 = v2 * 26; + 
int64_t v798 = v2 * 11; + int64_t v805 = v2 * 2; + int64_t v812 = v2 * 17; + int64_t v819 = v2 * 8; + int64_t v826 = v2 * 23; + int64_t v833 = v2 * 14; + int64_t v840 = v2 * 29; + const float32x2_t *v853 = &v5[0]; + svfloat32_t v1136 = svdup_n_f32(v476); + svfloat32_t v1137 = svdup_n_f32(v481); + svfloat32_t v1141 = svdup_n_f32(v524); + svfloat32_t v1142 = svdup_n_f32(v529); + svfloat32_t v1143 = svdup_n_f32(v534); + svfloat32_t v1150 = svdup_n_f32(v598); + svfloat32_t v1151 = svdup_n_f32(v603); + svfloat32_t v1152 = svdup_n_f32(v608); + float32x2_t *v1160 = &v6[0]; + svfloat32_t v1451 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v970)[0])); + const float32x2_t *v862 = &v5[v26]; + const float32x2_t *v871 = &v5[v35]; + const float32x2_t *v880 = &v5[v42]; + const float32x2_t *v889 = &v5[v51]; + const float32x2_t *v898 = &v5[v58]; + const float32x2_t *v907 = &v5[v67]; + const float32x2_t *v916 = &v5[v74]; + const float32x2_t *v925 = &v5[v83]; + const float32x2_t *v934 = &v5[v90]; + const float32x2_t *v943 = &v5[v99]; + const float32x2_t *v952 = &v5[v106]; + const float32x2_t *v961 = &v5[v115]; + const float32x2_t *v979 = &v5[v131]; + const float32x2_t *v988 = &v5[v138]; + const float32x2_t *v997 = &v5[v147]; + const float32x2_t *v1006 = &v5[v154]; + const float32x2_t *v1015 = &v5[v163]; + const float32x2_t *v1024 = &v5[v170]; + const float32x2_t *v1033 = &v5[v179]; + const float32x2_t *v1042 = &v5[v186]; + const float32x2_t *v1051 = &v5[v195]; + const float32x2_t *v1060 = &v5[v202]; + const float32x2_t *v1069 = &v5[v211]; + const float32x2_t *v1078 = &v5[v218]; + const float32x2_t *v1087 = &v5[v227]; + const float32x2_t *v1096 = &v5[v234]; + const float32x2_t *v1105 = &v5[v243]; + const float32x2_t *v1114 = &v5[v250]; + svfloat32_t v1138 = svdup_n_f32(v489); + svfloat32_t v1139 = svdup_n_f32(v496); + svfloat32_t v1140 = svdup_n_f32(v503); + svfloat32_t v1144 = svdup_n_f32(v542); + svfloat32_t v1145 = svdup_n_f32(v549); + svfloat32_t v1146 = 
svdup_n_f32(v556); + svfloat32_t v1147 = svdup_n_f32(v580); + svfloat32_t v1148 = svdup_n_f32(v587); + svfloat32_t v1149 = svdup_n_f32(v594); + float32x2_t *v1169 = &v6[v644]; + float32x2_t *v1178 = &v6[v651]; + float32x2_t *v1187 = &v6[v658]; + float32x2_t *v1196 = &v6[v665]; + float32x2_t *v1205 = &v6[v672]; + float32x2_t *v1214 = &v6[v679]; + float32x2_t *v1223 = &v6[v686]; + float32x2_t *v1232 = &v6[v693]; + float32x2_t *v1241 = &v6[v700]; + float32x2_t *v1250 = &v6[v707]; + float32x2_t *v1259 = &v6[v714]; + float32x2_t *v1268 = &v6[v721]; + float32x2_t *v1286 = &v6[v735]; + float32x2_t *v1295 = &v6[v742]; + float32x2_t *v1304 = &v6[v749]; + float32x2_t *v1313 = &v6[v756]; + float32x2_t *v1322 = &v6[v763]; + float32x2_t *v1331 = &v6[v770]; + float32x2_t *v1340 = &v6[v777]; + float32x2_t *v1349 = &v6[v784]; + float32x2_t *v1358 = &v6[v791]; + float32x2_t *v1367 = &v6[v798]; + float32x2_t *v1376 = &v6[v805]; + float32x2_t *v1385 = &v6[v812]; + float32x2_t *v1394 = &v6[v819]; + float32x2_t *v1403 = &v6[v826]; + float32x2_t *v1412 = &v6[v833]; + float32x2_t *v1421 = &v6[v840]; + svfloat32_t v1425 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v853)[0])); + svfloat32_t v1427 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v862)[0])); + svfloat32_t v1429 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v871)[0])); + svfloat32_t v1431 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v880)[0])); + svfloat32_t v1433 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v889)[0])); + svfloat32_t v1435 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v898)[0])); + svfloat32_t v1437 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v907)[0])); + svfloat32_t v1439 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v916)[0])); + svfloat32_t v1441 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v925)[0])); + svfloat32_t v1443 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v934)[0])); + svfloat32_t v1445 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v943)[0])); + svfloat32_t v1447 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v952)[0])); + svfloat32_t v1449 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v961)[0])); + svfloat32_t v1453 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v979)[0])); + svfloat32_t v1455 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v988)[0])); + svfloat32_t v1457 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v997)[0])); + svfloat32_t v1459 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1006)[0])); + svfloat32_t v1461 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1015)[0])); + svfloat32_t v1463 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1024)[0])); + svfloat32_t v1465 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1033)[0])); + svfloat32_t v1467 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1042)[0])); + svfloat32_t v1469 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1051)[0])); + svfloat32_t v1471 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1060)[0])); + svfloat32_t v1473 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1069)[0])); + svfloat32_t v1475 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1078)[0])); + svfloat32_t v1477 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1087)[0])); + svfloat32_t v1479 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1096)[0])); + svfloat32_t v1481 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1105)[0])); + svfloat32_t v1483 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1114)[0])); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1425, 
v1427); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1425, v1427); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1429, v1431); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1429, v1431); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v1433, v1435); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v1433, v1435); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1437, v1439); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1437, v1439); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v1441, v1443); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v1441, v1443); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v1445, v1447); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v1445, v1447); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1449, v1451); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1449, v1451); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v1453, v1455); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v1453, v1455); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v1457, v1459); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v1457, v1459); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v1461, v1463); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v1461, v1463); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v1465, v1467); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v1465, v1467); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v1469, v1471); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v1469, v1471); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v1473, v1475); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v1473, v1475); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v1477, v1479); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v1477, v1479); + svfloat32_t v256 = svadd_f32_x(svptrue_b32(), v1481, v1483); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v1481, v1483); + svfloat32_t v258 = svadd_f32_x(svptrue_b32(), v112, v192); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v112, v192); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), 
v128, v208); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v128, v208); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v144, v224); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v144, v224); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v160, v240); + svfloat32_t v268 = svsub_f32_x(svptrue_b32(), v160, v240); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v176, v256); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v176, v256); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v113, v193); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v113, v193); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v129, v209); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v129, v209); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v145, v225); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v145, v225); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v161, v241); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v161, v241); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v177, v257); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v177, v257); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v258, v32); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v261, v48); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v264, v64); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v267, v80); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v270, v96); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v261, v270); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v261, v270); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v267, v264); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v267, v264); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v262, v271); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v262, v271); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v268, v265); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v268, v265); + svfloat32_t v449 = svadd_f32_x(svptrue_b32(), v447, v33); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v450, v49); + svfloat32_t v455 = 
svadd_f32_x(svptrue_b32(), v453, v65); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v456, v81); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v459, v97); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v450, v459); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v450, v459); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v456, v453); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v456, v453); + svfloat32_t v568 = svadd_f32_x(svptrue_b32(), v451, v460); + svfloat32_t v569 = svsub_f32_x(svptrue_b32(), v451, v460); + svfloat32_t v570 = svadd_f32_x(svptrue_b32(), v457, v454); + svfloat32_t v571 = svsub_f32_x(svptrue_b32(), v457, v454); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v263, v272); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v263, v272); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v269, v266); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v269, v266); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v326, v328); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v326, v328); + svfloat32_t v332 = svadd_f32_x(svptrue_b32(), v327, v329); + svfloat32_t zero355 = svdup_n_f32(0); + svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v1144, v327, 90); + svfloat32_t v383 = svadd_f32_x(svptrue_b32(), v379, v381); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v379, v381); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v380, v382); + svfloat32_t v422 = svmul_f32_x(svptrue_b32(), v382, v1152); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v452, v461); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v452, v461); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v458, v455); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v458, v455); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v515, v517); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v515, v517); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v516, v518); + svfloat32_t zero544 = svdup_n_f32(0); + svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1144, v516, 90); + svfloat32_t v572 = 
svadd_f32_x(svptrue_b32(), v568, v570); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), v568, v570); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v569, v571); + svfloat32_t v611 = svmul_f32_x(svptrue_b32(), v571, v1152); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v274, v276); + svfloat32_t zero302 = svdup_n_f32(0); + svfloat32_t v302 = svcmla_f32_x(pred_full, zero302, v1138, v274, 90); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v330, v258); + svfloat32_t v343 = svmul_f32_x(svptrue_b32(), v330, v1142); + svfloat32_t zero362 = svdup_n_f32(0); + svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v1145, v332, 90); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v383, v259); + svfloat32_t zero407 = svdup_n_f32(0); + svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v1149, v384, 90); + svfloat32_t v417 = svmul_f32_x(svptrue_b32(), v385, v1151); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v462, v464); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v462, v464); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v463, v465); + svfloat32_t zero491 = svdup_n_f32(0); + svfloat32_t v491 = svcmla_f32_x(pred_full, zero491, v1138, v463, 90); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v519, v447); + svfloat32_t v532 = svmul_f32_x(svptrue_b32(), v519, v1142); + svfloat32_t zero551 = svdup_n_f32(0); + svfloat32_t v551 = svcmla_f32_x(pred_full, zero551, v1145, v521, 90); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v572, v448); + svfloat32_t zero596 = svdup_n_f32(0); + svfloat32_t v596 = svcmla_f32_x(pred_full, zero596, v1149, v573, 90); + svfloat32_t v606 = svmul_f32_x(svptrue_b32(), v574, v1151); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v277, v260); + svfloat32_t zero309 = svdup_n_f32(0); + svfloat32_t v309 = svcmla_f32_x(pred_full, zero309, v1139, v279, 90); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v355, v362); + 
svfloat32_t v374 = svcmla_f32_x(pred_full, v362, v1146, v329, 90); + svfloat32_t zero393 = svdup_n_f32(0); + svfloat32_t v393 = svcmla_f32_x(pred_full, zero393, v1147, v386, 90); + svfloat32_t v426 = svnmls_f32_x(pred_full, v417, v380, v1150); + svfloat32_t v427 = svmla_f32_x(pred_full, v422, v385, v1151); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v466, v449); + svfloat32_t zero498 = svdup_n_f32(0); + svfloat32_t v498 = svcmla_f32_x(pred_full, zero498, v1139, v468, 90); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v544, v551); + svfloat32_t v563 = svcmla_f32_x(pred_full, v551, v1146, v518, 90); + svfloat32_t zero582 = svdup_n_f32(0); + svfloat32_t v582 = svcmla_f32_x(pred_full, zero582, v1147, v575, 90); + svfloat32_t v615 = svnmls_f32_x(pred_full, v606, v569, v1150); + svfloat32_t v616 = svmla_f32_x(pred_full, v611, v574, v1151); + svfloat32_t v317 = svmla_f32_x(pred_full, v280, v277, v1136); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v302, v309); + svfloat32_t v321 = svcmla_f32_x(pred_full, v309, v1140, v276, 90); + svfloat32_t v370 = svmla_f32_x(pred_full, v343, v333, v1141); + svfloat32_t v423 = svcmla_f32_x(pred_full, v393, v1148, v383, 90); + svfloat32_t v432 = svmla_f32_x(pred_full, v280, v333, v1141); + svfloat32_t v506 = svmla_f32_x(pred_full, v469, v466, v1136); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v491, v498); + svfloat32_t v510 = svcmla_f32_x(pred_full, v498, v1140, v465, 90); + svfloat32_t v559 = svmla_f32_x(pred_full, v532, v522, v1141); + svfloat32_t v612 = svcmla_f32_x(pred_full, v582, v1148, v572, 90); + svfloat32_t v621 = svmla_f32_x(pred_full, v469, v522, v1141); + svst1_f64(pred_full, (double *)(v1160), svreinterpret_f64_f32(v280)); + svst1_f64(pred_full, (double *)(v1169), svreinterpret_f64_f32(v469)); + svfloat32_t v318 = svmla_f32_x(pred_full, v317, v278, v1137); + svfloat32_t v319 = svmls_f32_x(pred_full, v317, v278, v1137); + svfloat32_t v371 = svmla_f32_x(pred_full, v370, v331, v1143); + svfloat32_t v372 = 
svmls_f32_x(pred_full, v370, v331, v1143); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v423, v407); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v423, v407); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v432, v393); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v432, v393); + svfloat32_t v507 = svmla_f32_x(pred_full, v506, v467, v1137); + svfloat32_t v508 = svmls_f32_x(pred_full, v506, v467, v1137); + svfloat32_t v560 = svmla_f32_x(pred_full, v559, v520, v1143); + svfloat32_t v561 = svmls_f32_x(pred_full, v559, v520, v1143); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v612, v596); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v612, v596); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v621, v582); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v621, v582); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v318, v320); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v318, v320); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v319, v321); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v319, v321); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v372, v374); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v372, v374); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v508, v510); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v508, v510); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v561, v563); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v561, 
v563); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v614, v616); + svfloat32_t v620 = svsub_f32_x(svptrue_b32(), v614, v616); + svst1_f64(pred_full, (double *)(v1250), svreinterpret_f64_f32(v434)); + svst1_f64(pred_full, (double *)(v1259), svreinterpret_f64_f32(v623)); + svst1_f64(pred_full, (double *)(v1340), svreinterpret_f64_f32(v433)); + svst1_f64(pred_full, (double *)(v1349), svreinterpret_f64_f32(v622)); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v323, v376); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v325, v378); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v324, v377); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v322, v375); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v512, v565); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v514, v567); + svfloat32_t v630 = svadd_f32_x(svptrue_b32(), v513, v566); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v511, v564); + svst1_f64(pred_full, (double *)(v1178), svreinterpret_f64_f32(v323)); + svst1_f64(pred_full, (double *)(v1187), svreinterpret_f64_f32(v512)); + svst1_f64(pred_full, (double *)(v1196), svreinterpret_f64_f32(v325)); + svst1_f64(pred_full, (double *)(v1205), svreinterpret_f64_f32(v514)); + svst1_f64(pred_full, (double *)(v1214), svreinterpret_f64_f32(v324)); + svst1_f64(pred_full, (double *)(v1223), svreinterpret_f64_f32(v513)); + svst1_f64(pred_full, (double *)(v1232), svreinterpret_f64_f32(v322)); + svst1_f64(pred_full, (double *)(v1241), svreinterpret_f64_f32(v511)); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v435, v429); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v435, v429); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v438, v431); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v438, v431); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v441, v430); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v441, v430); + svfloat32_t v445 = 
svadd_f32_x(svptrue_b32(), v444, v428); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v444, v428); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v624, v618); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v624, v618); + svfloat32_t v628 = svadd_f32_x(svptrue_b32(), v627, v620); + svfloat32_t v629 = svsub_f32_x(svptrue_b32(), v627, v620); + svfloat32_t v631 = svadd_f32_x(svptrue_b32(), v630, v619); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v630, v619); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v633, v617); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v633, v617); + svst1_f64(pred_full, (double *)(v1268), svreinterpret_f64_f32(v437)); + svst1_f64(pred_full, (double *)(v1277), svreinterpret_f64_f32(v626)); + svst1_f64(pred_full, (double *)(v1286), svreinterpret_f64_f32(v440)); + svst1_f64(pred_full, (double *)(v1295), svreinterpret_f64_f32(v629)); + svst1_f64(pred_full, (double *)(v1304), svreinterpret_f64_f32(v443)); + svst1_f64(pred_full, (double *)(v1313), svreinterpret_f64_f32(v632)); + svst1_f64(pred_full, (double *)(v1322), svreinterpret_f64_f32(v446)); + svst1_f64(pred_full, (double *)(v1331), svreinterpret_f64_f32(v635)); + svst1_f64(pred_full, (double *)(v1358), svreinterpret_f64_f32(v436)); + svst1_f64(pred_full, (double *)(v1367), svreinterpret_f64_f32(v625)); + svst1_f64(pred_full, (double *)(v1376), svreinterpret_f64_f32(v439)); + svst1_f64(pred_full, (double *)(v1385), svreinterpret_f64_f32(v628)); + svst1_f64(pred_full, (double *)(v1394), svreinterpret_f64_f32(v442)); + svst1_f64(pred_full, (double *)(v1403), svreinterpret_f64_f32(v631)); + svst1_f64(pred_full, (double *)(v1412), svreinterpret_f64_f32(v445)); + svst1_f64(pred_full, (double *)(v1421), svreinterpret_f64_f32(v634)); + v5 += v11; + v6 += v12; + } +} +#endif + #ifndef ARMRAL_ARCH_SVE void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, armral_cmplx_f32_t *restrict y, @@ -17831,7 +18516,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const 
armral_cmplx_f32_t *restrict x, float v1064 = 9.8078528040323043e-01F; float v1071 = -5.5557023301960218e-01F; float v1076 = -8.3146961230254524e-01F; - float v1087 = 1.0000000000000000e+00F; const float32x2_t *v1301 = &v5[v0]; float32x2_t *v1502 = &v6[v2]; int64_t v26 = v0 * 16; @@ -17899,7 +18583,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, int64_t v1052 = v2 * 30; float v1067 = v4 * v1064; float v1079 = v4 * v1076; - float v1090 = v4 * v1087; int64_t v1098 = v2 * 7; int64_t v1105 = v2 * 15; int64_t v1112 = v2 * 23; @@ -17918,6 +18601,7 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1698 = svdup_n_f32(v1004); svfloat32_t v1737 = svdup_n_f32(v1059); svfloat32_t v1739 = svdup_n_f32(v1071); + svfloat32_t v1741 = svdup_n_f32(v4); svfloat32_t v1811 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1301)[0])); const float32x2_t *v1141 = &v5[v26]; @@ -17985,7 +18669,6 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float32x2_t *v1734 = &v6[v1052]; svfloat32_t v1738 = svdup_n_f32(v1067); svfloat32_t v1740 = svdup_n_f32(v1079); - svfloat32_t v1741 = svdup_n_f32(v1090); float32x2_t *v1748 = &v6[v1098]; float32x2_t *v1757 = &v6[v1105]; float32x2_t *v1766 = &v6[v1112]; @@ -18052,293 +18735,162 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v1435)[0])); svfloat32_t v1841 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1444)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1779), "w"(v1781)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1779), "w"(v1781)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1783), "w"(v1785)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1783), "w"(v1785)); - svfloat32_t v75; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v1787), 
"w"(v1789)); - svfloat32_t v76; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v76) : "w"(v1787), "w"(v1789)); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v1791), "w"(v1793)); - svfloat32_t v92; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v1791), "w"(v1793)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v1795), "w"(v1797)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v1795), "w"(v1797)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v1799), "w"(v1801)); - svfloat32_t v176; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1799), "w"(v1801)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v1803), "w"(v1805)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v1803), "w"(v1805)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v1807), "w"(v1809)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v1807), "w"(v1809)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v1811), "w"(v1813)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v1811), "w"(v1813)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v1815), "w"(v1817)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v1815), "w"(v1817)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v1819), "w"(v1821)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v1819), "w"(v1821)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v1823), "w"(v1825)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v1823), "w"(v1825)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v1827), "w"(v1829)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v1827), "w"(v1829)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v1831), "w"(v1833)); - 
svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v1831), "w"(v1833)); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v1835), "w"(v1837)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v1835), "w"(v1837)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v1839), "w"(v1841)); - svfloat32_t v562; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v1839), "w"(v1841)); - svfloat32_t zero56; - asm volatile("mov %0.s, #0" : "=w"(zero56)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1779, v1781); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1779, v1781); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1783, v1785); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1783, v1785); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v1787, v1789); + svfloat32_t v76 = svsub_f32_x(svptrue_b32(), v1787, v1789); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v1791, v1793); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v1791, v1793); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v1795, v1797); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v1795, v1797); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v1799, v1801); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v1799, v1801); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v1803, v1805); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v1803, v1805); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v1807, v1809); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v1807, v1809); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v1811, v1813); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v1811, v1813); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v1815, v1817); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v1815, v1817); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v1819, v1821); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v1819, v1821); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v1823, v1825); + svfloat32_t v435 = 
svsub_f32_x(svptrue_b32(), v1823, v1825); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v1827, v1829); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v1827, v1829); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v1831, v1833); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v1831, v1833); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v1835, v1837); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v1835, v1837); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v1839, v1841); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v1839, v1841); + svfloat32_t zero56 = svdup_n_f32(0); svfloat32_t v56 = svcmla_f32_x(pred_full, zero56, v1617, v49, 90); - svfloat32_t v57; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v32), "w"(v48)); - svfloat32_t v58; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v58) : "w"(v32), "w"(v48)); - svfloat32_t v93; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v75), "w"(v91)); - svfloat32_t v94; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v75), "w"(v91)); - svfloat32_t v110; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v76), "w"(v1614)); - svfloat32_t v122; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v92), "w"(v1616)); - svfloat32_t zero183; - asm volatile("mov %0.s, #0" : "=w"(zero183)); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v110 = svmul_f32_x(svptrue_b32(), v76, v1614); + svfloat32_t v122 = svmul_f32_x(svptrue_b32(), v92, v1616); + svfloat32_t zero183 = svdup_n_f32(0); svfloat32_t v183 = svcmla_f32_x(pred_full, zero183, v1617, v176, 90); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v159), "w"(v175)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v159), "w"(v175)); - svfloat32_t zero226; - asm volatile("mov %0.s, #0" : "=w"(zero226)); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v159, 
v175); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v159, v175); + svfloat32_t zero226 = svdup_n_f32(0); svfloat32_t v226 = svcmla_f32_x(pred_full, zero226, v1617, v219, 90); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v202), "w"(v218)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v202), "w"(v218)); - svfloat32_t zero399; - asm volatile("mov %0.s, #0" : "=w"(zero399)); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v202, v218); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v202, v218); + svfloat32_t zero399 = svdup_n_f32(0); svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v1617, v392, 90); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v375), "w"(v391)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v375), "w"(v391)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v418), "w"(v434)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v418), "w"(v434)); - svfloat32_t v453; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v419), "w"(v1614)); - svfloat32_t v465; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v435), "w"(v1616)); - svfloat32_t zero526; - asm volatile("mov %0.s, #0" : "=w"(zero526)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v375, v391); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v375, v391); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v418, v434); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v418, v434); + svfloat32_t v453 = svmul_f32_x(svptrue_b32(), v419, v1614); + svfloat32_t v465 = svmul_f32_x(svptrue_b32(), v435, v1616); + svfloat32_t zero526 = svdup_n_f32(0); svfloat32_t v526 = svcmla_f32_x(pred_full, zero526, v1617, v519, 90); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v502), "w"(v518)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v502), "w"(v518)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v545), "w"(v561)); 
- svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v545), "w"(v561)); - svfloat32_t v580; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v546), "w"(v1614)); - svfloat32_t v592; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v562), "w"(v1616)); - svfloat32_t v59; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v59) : "w"(v33), "w"(v56)); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v33), "w"(v56)); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v502, v518); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v502, v518); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v545, v561); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v545, v561); + svfloat32_t v580 = svmul_f32_x(svptrue_b32(), v546, v1614); + svfloat32_t v592 = svmul_f32_x(svptrue_b32(), v562, v1616); + svfloat32_t v59 = svsub_f32_x(svptrue_b32(), v33, v56); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v33, v56); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, zero101, v1617, v94, 90); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v57), "w"(v93)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v57), "w"(v93)); - svfloat32_t v186; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v160), "w"(v183)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v160), "w"(v183)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v203), "w"(v226)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v203), "w"(v226)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v184), "w"(v227)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v184), "w"(v227)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v185), "w"(v1614)); - svfloat32_t v299; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v228), "w"(v1616)); - svfloat32_t v402; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v376), "w"(v399)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v376), "w"(v399)); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v57, v93); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v57, v93); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v160, v183); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v160, v183); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v203, v226); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v203, v226); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v184, v227); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v184, v227); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v185, v1614); + svfloat32_t v299 = svmul_f32_x(svptrue_b32(), v228, v1616); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v376, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v376, v399); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v1617, v437, 90); - svfloat32_t v445; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v400), "w"(v436)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v400), "w"(v436)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v503), "w"(v526)); - svfloat32_t v530; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v503), "w"(v526)); - svfloat32_t zero571; - asm volatile("mov %0.s, #0" : "=w"(zero571)); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v400, v436); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v400, v436); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v503, v526); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v503, v526); + svfloat32_t zero571 = svdup_n_f32(0); svfloat32_t v571 = svcmla_f32_x(pred_full, zero571, v1617, v564, 90); - svfloat32_t v572; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v572) : "w"(v527), "w"(v563)); - svfloat32_t v573; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v573) : 
"w"(v527), "w"(v563)); - svfloat32_t v104; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v58), "w"(v101)); - svfloat32_t v105; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v58), "w"(v101)); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v527, v563); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), v527, v563); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v58, v101); + svfloat32_t v105 = svadd_f32_x(svptrue_b32(), v58, v101); svfloat32_t v130 = svcmla_f32_x(pred_full, v110, v1741, v110, 90); svfloat32_t v131 = svcmla_f32_x(pred_full, v122, v1617, v122, 90); - svfloat32_t zero239; - asm volatile("mov %0.s, #0" : "=w"(zero239)); + svfloat32_t zero239 = svdup_n_f32(0); svfloat32_t v239 = svcmla_f32_x(pred_full, zero239, v1617, v232, 90); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v102), "w"(v231)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v102), "w"(v231)); - svfloat32_t v248; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v186), "w"(v1532)); - svfloat32_t v260; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v229), "w"(v1696)); - svfloat32_t v326; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v187), "w"(v1696)); - svfloat32_t v338; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v230), "w"(v1698)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v401), "w"(v444)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v401), "w"(v444)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v102, v231); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v102, v231); + svfloat32_t v248 = svmul_f32_x(svptrue_b32(), v186, v1532); + svfloat32_t v260 = svmul_f32_x(svptrue_b32(), v229, v1696); + svfloat32_t v326 = svmul_f32_x(svptrue_b32(), v187, v1696); + svfloat32_t v338 = svmul_f32_x(svptrue_b32(), v230, v1698); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v401, v444); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v401, v444); svfloat32_t v473 = svcmla_f32_x(pred_full, 
v453, v1741, v453, 90); svfloat32_t v474 = svcmla_f32_x(pred_full, v465, v1617, v465, 90); - svfloat32_t v574; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v528), "w"(v571)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v528), "w"(v571)); + svfloat32_t v574 = svsub_f32_x(svptrue_b32(), v528, v571); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v528, v571); svfloat32_t v600 = svcmla_f32_x(pred_full, v580, v1741, v580, 90); svfloat32_t v601 = svcmla_f32_x(pred_full, v592, v1617, v592, 90); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v445), "w"(v572)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v445), "w"(v572)); - svfloat32_t v861; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v861) : "w"(v446), "w"(v1614)); - svfloat32_t v873; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v573), "w"(v1616)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v130), "w"(v131)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v131), "w"(v130)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v103), "w"(v239)); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v103), "w"(v239)); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v445, v572); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v445, v572); + svfloat32_t v861 = svmul_f32_x(svptrue_b32(), v446, v1614); + svfloat32_t v873 = svmul_f32_x(svptrue_b32(), v573, v1616); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v130, v131); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v131, v130); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v103, v239); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v103, v239); svfloat32_t v268 = svcmla_f32_x(pred_full, v248, v1533, v186, 90); svfloat32_t v269 = svcmla_f32_x(pred_full, v260, v1697, v229, 90); svfloat32_t v307 = svcmla_f32_x(pred_full, v287, v1741, v287, 90); svfloat32_t v308 = svcmla_f32_x(pred_full, v299, v1617, v299, 90); svfloat32_t 
v346 = svcmla_f32_x(pred_full, v326, v1697, v187, 90); svfloat32_t v347 = svcmla_f32_x(pred_full, v338, v1699, v230, 90); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v473), "w"(v474)); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v474), "w"(v473)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v600), "w"(v601)); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v601), "w"(v600)); - svfloat32_t zero623; - asm volatile("mov %0.s, #0" : "=w"(zero623)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v473, v474); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v474, v473); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v600, v601); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v601, v600); + svfloat32_t zero623 = svdup_n_f32(0); svfloat32_t v623 = svcmla_f32_x(pred_full, zero623, v1617, v616, 90); - svfloat32_t v624; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v240), "w"(v615)); - svfloat32_t v625; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v240), "w"(v615)); - svfloat32_t v727; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v447), "w"(v1532)); - svfloat32_t v739; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v574), "w"(v1696)); - svfloat32_t v995; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v995) : "w"(v448), "w"(v1696)); - svfloat32_t v1007; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1007) : "w"(v575), "w"(v1698)); - svfloat32_t zero140; - asm volatile("mov %0.s, #0" : "=w"(zero140)); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v240, v615); + svfloat32_t v625 = svsub_f32_x(svptrue_b32(), v240, v615); + svfloat32_t v727 = svmul_f32_x(svptrue_b32(), v447, v1532); + svfloat32_t v739 = svmul_f32_x(svptrue_b32(), v574, v1696); + svfloat32_t v995 = svmul_f32_x(svptrue_b32(), v448, v1696); + svfloat32_t v1007 = svmul_f32_x(svptrue_b32(), v575, v1698); + svfloat32_t zero140 = svdup_n_f32(0); svfloat32_t v140 = svcmla_f32_x(pred_full, zero140, v1741, v133, 90); - svfloat32_t 
v141; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v59), "w"(v132)); - svfloat32_t v142; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v59), "w"(v132)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v268), "w"(v269)); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v269), "w"(v268)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v307), "w"(v308)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v308), "w"(v307)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v346), "w"(v347)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v347), "w"(v346)); - svfloat32_t zero483; - asm volatile("mov %0.s, #0" : "=w"(zero483)); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v59, v132); + svfloat32_t v142 = svsub_f32_x(svptrue_b32(), v59, v132); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v268, v269); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v269, v268); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v307, v308); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v308, v307); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v346, v347); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v347, v346); + svfloat32_t zero483 = svdup_n_f32(0); svfloat32_t v483 = svcmla_f32_x(pred_full, zero483, v1741, v476, 90); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v402), "w"(v475)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v402), "w"(v475)); - svfloat32_t zero610; - asm volatile("mov %0.s, #0" : "=w"(zero610)); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v402, v475); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v402, v475); + svfloat32_t zero610 = svdup_n_f32(0); svfloat32_t v610 = svcmla_f32_x(pred_full, zero610, v1741, v603, 90); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v529), "w"(v602)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v529), 
"w"(v602)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v241), "w"(v623)); - svfloat32_t v627; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v241), "w"(v623)); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v529, v602); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v529, v602); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v241, v623); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v241, v623); svfloat32_t v747 = svcmla_f32_x(pred_full, v727, v1533, v447, 90); svfloat32_t v748 = svcmla_f32_x(pred_full, v739, v1697, v574, 90); svfloat32_t v881 = svcmla_f32_x(pred_full, v861, v1741, v861, 90); @@ -18347,156 +18899,92 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1016 = svcmla_f32_x(pred_full, v1007, v1699, v575, 90); svst1_f64(pred_full, (double *)(v1461), svreinterpret_f64_f32(v624)); svst1_f64(pred_full, (double *)(v1479), svreinterpret_f64_f32(v625)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v60), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v60), "w"(v140)); - svfloat32_t zero278; - asm volatile("mov %0.s, #0" : "=w"(zero278)); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v60, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v60, v140); + svfloat32_t zero278 = svdup_n_f32(0); svfloat32_t v278 = svcmla_f32_x(pred_full, zero278, v1741, v271, 90); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v141), "w"(v270)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v141), "w"(v270)); - svfloat32_t zero317; - asm volatile("mov %0.s, #0" : "=w"(zero317)); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v141, v270); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v141, v270); + svfloat32_t zero317 = svdup_n_f32(0); svfloat32_t v317 = svcmla_f32_x(pred_full, zero317, v1741, v310, 90); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v104), "w"(v309)); - 
svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v104), "w"(v309)); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v104, v309); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v104, v309); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v1741, v349, 90); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v403), "w"(v483)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v403), "w"(v483)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v530), "w"(v610)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v530), "w"(v610)); - svfloat32_t v660; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v484), "w"(v1491)); - svfloat32_t v672; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v611), "w"(v1573)); - svfloat32_t v749; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v749) : "w"(v747), "w"(v748)); - svfloat32_t v750; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v750) : "w"(v748), "w"(v747)); - svfloat32_t v883; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v883) : "w"(v881), "w"(v882)); - svfloat32_t v884; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v884) : "w"(v882), "w"(v881)); - svfloat32_t v928; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v928) : "w"(v485), "w"(v1655)); - svfloat32_t v940; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v940) : "w"(v612), "w"(v1657)); - svfloat32_t v1017; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1017) : "w"(v1015), "w"(v1016)); - svfloat32_t v1018; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1018) : "w"(v1016), "w"(v1015)); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v403, v483); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v403, v483); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v530, v610); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v530, v610); + svfloat32_t v660 = svmul_f32_x(svptrue_b32(), v484, v1491); + svfloat32_t v672 = svmul_f32_x(svptrue_b32(), v611, 
v1573); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v747, v748); + svfloat32_t v750 = svsub_f32_x(svptrue_b32(), v748, v747); + svfloat32_t v883 = svadd_f32_x(svptrue_b32(), v881, v882); + svfloat32_t v884 = svsub_f32_x(svptrue_b32(), v882, v881); + svfloat32_t v928 = svmul_f32_x(svptrue_b32(), v485, v1655); + svfloat32_t v940 = svmul_f32_x(svptrue_b32(), v612, v1657); + svfloat32_t v1017 = svadd_f32_x(svptrue_b32(), v1015, v1016); + svfloat32_t v1018 = svsub_f32_x(svptrue_b32(), v1016, v1015); svst1_f64(pred_full, (double *)(v1470), svreinterpret_f64_f32(v626)); svst1_f64(pred_full, (double *)(v1488), svreinterpret_f64_f32(v627)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v142), "w"(v278)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v142), "w"(v278)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v105), "w"(v317)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v105), "w"(v317)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v143), "w"(v348)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v143), "w"(v348)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v144), "w"(v356)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v144), "w"(v356)); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v142, v278); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v142, v278); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v105, v317); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v105, v317); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v143, v348); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v143, v348); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v144, v356); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v144, v356); svfloat32_t v680 = svcmla_f32_x(pred_full, v660, v1658, v484, 90); svfloat32_t v681 = svcmla_f32_x(pred_full, v672, v1574, v611, 90); - svfloat32_t zero757; - asm 
volatile("mov %0.s, #0" : "=w"(zero757)); + svfloat32_t zero757 = svdup_n_f32(0); svfloat32_t v757 = svcmla_f32_x(pred_full, zero757, v1741, v750, 90); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v318), "w"(v749)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v318), "w"(v749)); - svfloat32_t v794; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v486), "w"(v1573)); - svfloat32_t v806; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v806) : "w"(v613), "w"(v1575)); - svfloat32_t zero891; - asm volatile("mov %0.s, #0" : "=w"(zero891)); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v318, v749); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v318, v749); + svfloat32_t v794 = svmul_f32_x(svptrue_b32(), v486, v1573); + svfloat32_t v806 = svmul_f32_x(svptrue_b32(), v613, v1575); + svfloat32_t zero891 = svdup_n_f32(0); svfloat32_t v891 = svcmla_f32_x(pred_full, zero891, v1741, v884, 90); - svfloat32_t v892; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v892) : "w"(v242), "w"(v883)); - svfloat32_t v893; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v242), "w"(v883)); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v242, v883); + svfloat32_t v893 = svsub_f32_x(svptrue_b32(), v242, v883); svfloat32_t v948 = svcmla_f32_x(pred_full, v928, v1656, v485, 90); svfloat32_t v949 = svcmla_f32_x(pred_full, v940, v1658, v612, 90); - svfloat32_t zero1025; - asm volatile("mov %0.s, #0" : "=w"(zero1025)); + svfloat32_t zero1025 = svdup_n_f32(0); svfloat32_t v1025 = svcmla_f32_x(pred_full, zero1025, v1741, v1018, 90); - svfloat32_t v1062; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1062) : "w"(v487), "w"(v1737)); - svfloat32_t v1074; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1074) : "w"(v614), "w"(v1739)); - svfloat32_t v682; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v680), "w"(v681)); - svfloat32_t v683; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v681), "w"(v680)); - svfloat32_t v760; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v319), "w"(v757)); - 
svfloat32_t v761; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v761) : "w"(v319), "w"(v757)); + svfloat32_t v1062 = svmul_f32_x(svptrue_b32(), v487, v1737); + svfloat32_t v1074 = svmul_f32_x(svptrue_b32(), v614, v1739); + svfloat32_t v682 = svadd_f32_x(svptrue_b32(), v680, v681); + svfloat32_t v683 = svsub_f32_x(svptrue_b32(), v681, v680); + svfloat32_t v760 = svsub_f32_x(svptrue_b32(), v319, v757); + svfloat32_t v761 = svadd_f32_x(svptrue_b32(), v319, v757); svfloat32_t v814 = svcmla_f32_x(pred_full, v794, v1574, v486, 90); svfloat32_t v815 = svcmla_f32_x(pred_full, v806, v1738, v613, 90); - svfloat32_t v894; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v243), "w"(v891)); - svfloat32_t v895; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v243), "w"(v891)); - svfloat32_t v950; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v950) : "w"(v948), "w"(v949)); - svfloat32_t v951; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v951) : "w"(v949), "w"(v948)); - svfloat32_t v1026; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1026) : "w"(v320), "w"(v1017)); - svfloat32_t v1027; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1027) : "w"(v320), "w"(v1017)); - svfloat32_t v1028; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1028) : "w"(v321), "w"(v1025)); - svfloat32_t v1029; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1029) : "w"(v321), "w"(v1025)); + svfloat32_t v894 = svsub_f32_x(svptrue_b32(), v243, v891); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v243, v891); + svfloat32_t v950 = svadd_f32_x(svptrue_b32(), v948, v949); + svfloat32_t v951 = svsub_f32_x(svptrue_b32(), v949, v948); + svfloat32_t v1026 = svadd_f32_x(svptrue_b32(), v320, v1017); + svfloat32_t v1027 = svsub_f32_x(svptrue_b32(), v320, v1017); + svfloat32_t v1028 = svsub_f32_x(svptrue_b32(), v321, v1025); + svfloat32_t v1029 = svadd_f32_x(svptrue_b32(), v321, v1025); svfloat32_t v1082 = svcmla_f32_x(pred_full, v1062, v1738, v487, 90); svfloat32_t v1083 = svcmla_f32_x(pred_full, v1074, v1740, v614, 90); svst1_f64(pred_full, (double *)(v1543), 
svreinterpret_f64_f32(v758)); svst1_f64(pred_full, (double *)(v1561), svreinterpret_f64_f32(v759)); svst1_f64(pred_full, (double *)(v1625), svreinterpret_f64_f32(v892)); svst1_f64(pred_full, (double *)(v1643), svreinterpret_f64_f32(v893)); - svfloat32_t zero690; - asm volatile("mov %0.s, #0" : "=w"(zero690)); + svfloat32_t zero690 = svdup_n_f32(0); svfloat32_t v690 = svcmla_f32_x(pred_full, zero690, v1741, v683, 90); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v279), "w"(v682)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v279), "w"(v682)); - svfloat32_t v816; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v816) : "w"(v814), "w"(v815)); - svfloat32_t v817; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v817) : "w"(v815), "w"(v814)); - svfloat32_t zero958; - asm volatile("mov %0.s, #0" : "=w"(zero958)); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v279, v682); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v279, v682); + svfloat32_t v816 = svadd_f32_x(svptrue_b32(), v814, v815); + svfloat32_t v817 = svsub_f32_x(svptrue_b32(), v815, v814); + svfloat32_t zero958 = svdup_n_f32(0); svfloat32_t v958 = svcmla_f32_x(pred_full, zero958, v1741, v951, 90); - svfloat32_t v959; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v959) : "w"(v281), "w"(v950)); - svfloat32_t v960; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v960) : "w"(v281), "w"(v950)); - svfloat32_t v1084; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1084) : "w"(v1082), "w"(v1083)); - svfloat32_t v1085; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1085) : "w"(v1083), "w"(v1082)); + svfloat32_t v959 = svadd_f32_x(svptrue_b32(), v281, v950); + svfloat32_t v960 = svsub_f32_x(svptrue_b32(), v281, v950); + svfloat32_t v1084 = svadd_f32_x(svptrue_b32(), v1082, v1083); + svfloat32_t v1085 = svsub_f32_x(svptrue_b32(), v1083, v1082); svst1_f64(pred_full, (double *)(v1552), svreinterpret_f64_f32(v760)); svst1_f64(pred_full, (double *)(v1570), svreinterpret_f64_f32(v761)); svst1_f64(pred_full, (double *)(v1634), 
svreinterpret_f64_f32(v894)); @@ -18505,40 +18993,26 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v1716), svreinterpret_f64_f32(v1028)); svst1_f64(pred_full, (double *)(v1725), svreinterpret_f64_f32(v1027)); svst1_f64(pred_full, (double *)(v1734), svreinterpret_f64_f32(v1029)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v280), "w"(v690)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v280), "w"(v690)); - svfloat32_t zero824; - asm volatile("mov %0.s, #0" : "=w"(zero824)); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v280, v690); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v280, v690); + svfloat32_t zero824 = svdup_n_f32(0); svfloat32_t v824 = svcmla_f32_x(pred_full, zero824, v1741, v817, 90); - svfloat32_t v825; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v825) : "w"(v357), "w"(v816)); - svfloat32_t v826; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v826) : "w"(v357), "w"(v816)); - svfloat32_t v961; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v961) : "w"(v282), "w"(v958)); - svfloat32_t v962; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v962) : "w"(v282), "w"(v958)); - svfloat32_t zero1092; - asm volatile("mov %0.s, #0" : "=w"(zero1092)); + svfloat32_t v825 = svadd_f32_x(svptrue_b32(), v357, v816); + svfloat32_t v826 = svsub_f32_x(svptrue_b32(), v357, v816); + svfloat32_t v961 = svsub_f32_x(svptrue_b32(), v282, v958); + svfloat32_t v962 = svadd_f32_x(svptrue_b32(), v282, v958); + svfloat32_t zero1092 = svdup_n_f32(0); svfloat32_t v1092 = svcmla_f32_x(pred_full, zero1092, v1741, v1085, 90); - svfloat32_t v1093; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1093) : "w"(v359), "w"(v1084)); - svfloat32_t v1094; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1094) : "w"(v359), "w"(v1084)); + svfloat32_t v1093 = svadd_f32_x(svptrue_b32(), v359, v1084); + svfloat32_t v1094 = svsub_f32_x(svptrue_b32(), v359, v1084); svst1_f64(pred_full, (double *)(v1502), svreinterpret_f64_f32(v691)); 
svst1_f64(pred_full, (double *)(v1520), svreinterpret_f64_f32(v692)); svst1_f64(pred_full, (double *)(v1666), svreinterpret_f64_f32(v959)); svst1_f64(pred_full, (double *)(v1684), svreinterpret_f64_f32(v960)); - svfloat32_t v827; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v827) : "w"(v358), "w"(v824)); - svfloat32_t v828; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v828) : "w"(v358), "w"(v824)); - svfloat32_t v1095; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1095) : "w"(v360), "w"(v1092)); - svfloat32_t v1096; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1096) : "w"(v360), "w"(v1092)); + svfloat32_t v827 = svsub_f32_x(svptrue_b32(), v358, v824); + svfloat32_t v828 = svadd_f32_x(svptrue_b32(), v358, v824); + svfloat32_t v1095 = svsub_f32_x(svptrue_b32(), v360, v1092); + svfloat32_t v1096 = svadd_f32_x(svptrue_b32(), v360, v1092); svst1_f64(pred_full, (double *)(v1511), svreinterpret_f64_f32(v693)); svst1_f64(pred_full, (double *)(v1529), svreinterpret_f64_f32(v694)); svst1_f64(pred_full, (double *)(v1584), svreinterpret_f64_f32(v825)); @@ -18556,3 +19030,3413 @@ void armral_fft_cf32_cf32_cf32_ac_n_uu32(const armral_cmplx_f32_t *restrict x, } } #endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu36(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v12 = howmany - 1; + int64_t v1060 = howmany / 2; + for (int j = 0; j < v12; j += 2) { + float v600 = 8.6602540378443871e-01F; + float v623 = 6.4278760968653925e-01F; + float v631 = -3.4202014332566888e-01F; + float v639 = 9.8480775301220802e-01F; + float v692 = 1.0000000000000000e+00F; + float v693 = -1.0000000000000000e+00F; + float v700 = -5.0000000000000000e-01F; + float v701 = 5.0000000000000000e-01F; + float v713 = -1.4999999999999998e+00F; + float v714 = 1.4999999999999998e+00F; + float v722 = -8.6602540378443871e-01F; + float 
v726 = 7.6604444311897801e-01F; + float v727 = -7.6604444311897801e-01F; + float v734 = 9.3969262078590832e-01F; + float v735 = -9.3969262078590832e-01F; + float v742 = -1.7364817766693039e-01F; + float v743 = 1.7364817766693039e-01F; + float32x2_t v745 = (float32x2_t){v4, v4}; + float v751 = -6.4278760968653925e-01F; + float v756 = 3.4202014332566888e-01F; + float v761 = -9.8480775301220802e-01F; + const float32x2_t *v2200 = &v5[istride]; + float32x2_t *v2300 = &v6[ostride]; + float32x2_t v584 = (float32x2_t){v700, v700}; + float32x2_t v597 = (float32x2_t){v713, v713}; + float32x2_t v602 = (float32x2_t){v600, v722}; + float32x2_t v610 = (float32x2_t){v726, v726}; + float32x2_t v615 = (float32x2_t){v734, v734}; + float32x2_t v620 = (float32x2_t){v742, v742}; + float32x2_t v625 = (float32x2_t){v623, v751}; + float32x2_t v633 = (float32x2_t){v631, v756}; + float32x2_t v641 = (float32x2_t){v639, v761}; + float32x2_t v694 = (float32x2_t){v692, v693}; + float32x2_t v702 = (float32x2_t){v700, v701}; + float32x2_t v715 = (float32x2_t){v713, v714}; + float32x2_t v723 = (float32x2_t){v722, v722}; + float32x2_t v728 = (float32x2_t){v726, v727}; + float32x2_t v736 = (float32x2_t){v734, v735}; + float32x2_t v744 = (float32x2_t){v742, v743}; + float32x2_t v752 = (float32x2_t){v751, v751}; + float32x2_t v757 = (float32x2_t){v756, v756}; + float32x2_t v762 = (float32x2_t){v761, v761}; + const float32x2_t *v1930 = &v5[0]; + float32x2_t *v2255 = &v6[0]; + float32x4_t v2634 = vld1q_f32((const float32_t *)v2200); + float32x4_t v585 = vcombine_f32(v584, v584); + float32x4_t v598 = vcombine_f32(v597, v597); + float32x2_t v604 = vmul_f32(v745, v602); + float32x4_t v611 = vcombine_f32(v610, v610); + float32x4_t v616 = vcombine_f32(v615, v615); + float32x4_t v621 = vcombine_f32(v620, v620); + float32x2_t v627 = vmul_f32(v745, v625); + float32x2_t v635 = vmul_f32(v745, v633); + float32x2_t v643 = vmul_f32(v745, v641); + float32x2_t v696 = vmul_f32(v745, v694); + float32x2_t v704 = 
vmul_f32(v745, v702); + float32x2_t v717 = vmul_f32(v745, v715); + float32x4_t v724 = vcombine_f32(v723, v723); + float32x2_t v730 = vmul_f32(v745, v728); + float32x2_t v738 = vmul_f32(v745, v736); + float32x2_t v746 = vmul_f32(v745, v744); + float32x4_t v753 = vcombine_f32(v752, v752); + float32x4_t v758 = vcombine_f32(v757, v757); + float32x4_t v763 = vcombine_f32(v762, v762); + const float32x2_t *v1939 = &v5[istride * 18]; + const float32x2_t *v1948 = &v5[istride * 9]; + const float32x2_t *v1957 = &v5[istride * 27]; + const float32x2_t *v1966 = &v5[istride * 4]; + const float32x2_t *v1975 = &v5[istride * 22]; + const float32x2_t *v1984 = &v5[istride * 13]; + const float32x2_t *v1993 = &v5[istride * 31]; + const float32x2_t *v2002 = &v5[istride * 8]; + const float32x2_t *v2011 = &v5[istride * 26]; + const float32x2_t *v2020 = &v5[istride * 17]; + const float32x2_t *v2029 = &v5[istride * 35]; + const float32x2_t *v2038 = &v5[istride * 12]; + const float32x2_t *v2047 = &v5[istride * 30]; + const float32x2_t *v2056 = &v5[istride * 21]; + const float32x2_t *v2065 = &v5[istride * 3]; + const float32x2_t *v2074 = &v5[istride * 16]; + const float32x2_t *v2083 = &v5[istride * 34]; + const float32x2_t *v2092 = &v5[istride * 25]; + const float32x2_t *v2101 = &v5[istride * 7]; + const float32x2_t *v2110 = &v5[istride * 20]; + const float32x2_t *v2119 = &v5[istride * 2]; + const float32x2_t *v2128 = &v5[istride * 29]; + const float32x2_t *v2137 = &v5[istride * 11]; + const float32x2_t *v2146 = &v5[istride * 24]; + const float32x2_t *v2155 = &v5[istride * 6]; + const float32x2_t *v2164 = &v5[istride * 33]; + const float32x2_t *v2173 = &v5[istride * 15]; + const float32x2_t *v2182 = &v5[istride * 28]; + const float32x2_t *v2191 = &v5[istride * 10]; + const float32x2_t *v2209 = &v5[istride * 19]; + const float32x2_t *v2218 = &v5[istride * 32]; + const float32x2_t *v2227 = &v5[istride * 14]; + const float32x2_t *v2236 = &v5[istride * 5]; + const float32x2_t *v2245 = &v5[istride 
* 23]; + float32x2_t *v2264 = &v6[ostride * 9]; + float32x2_t *v2273 = &v6[ostride * 18]; + float32x2_t *v2282 = &v6[ostride * 27]; + float32x2_t *v2291 = &v6[ostride * 28]; + float32x2_t *v2309 = &v6[ostride * 10]; + float32x2_t *v2318 = &v6[ostride * 19]; + float32x2_t *v2327 = &v6[ostride * 20]; + float32x2_t *v2336 = &v6[ostride * 29]; + float32x2_t *v2345 = &v6[ostride * 2]; + float32x2_t *v2354 = &v6[ostride * 11]; + float32x2_t *v2363 = &v6[ostride * 12]; + float32x2_t *v2372 = &v6[ostride * 21]; + float32x2_t *v2381 = &v6[ostride * 30]; + float32x2_t *v2390 = &v6[ostride * 3]; + float32x2_t *v2399 = &v6[ostride * 4]; + float32x2_t *v2408 = &v6[ostride * 13]; + float32x2_t *v2417 = &v6[ostride * 22]; + float32x2_t *v2426 = &v6[ostride * 31]; + float32x2_t *v2435 = &v6[ostride * 32]; + float32x2_t *v2444 = &v6[ostride * 5]; + float32x2_t *v2453 = &v6[ostride * 14]; + float32x2_t *v2462 = &v6[ostride * 23]; + float32x2_t *v2471 = &v6[ostride * 24]; + float32x2_t *v2480 = &v6[ostride * 33]; + float32x2_t *v2489 = &v6[ostride * 6]; + float32x2_t *v2498 = &v6[ostride * 15]; + float32x2_t *v2507 = &v6[ostride * 16]; + float32x2_t *v2516 = &v6[ostride * 25]; + float32x2_t *v2525 = &v6[ostride * 34]; + float32x2_t *v2534 = &v6[ostride * 7]; + float32x2_t *v2543 = &v6[ostride * 8]; + float32x2_t *v2552 = &v6[ostride * 17]; + float32x2_t *v2561 = &v6[ostride * 26]; + float32x2_t *v2570 = &v6[ostride * 35]; + float32x4_t v2574 = vld1q_f32((const float32_t *)v1930); + float32x4_t v606 = vcombine_f32(v604, v604); + float32x4_t v629 = vcombine_f32(v627, v627); + float32x4_t v637 = vcombine_f32(v635, v635); + float32x4_t v645 = vcombine_f32(v643, v643); + float32x4_t v698 = vcombine_f32(v696, v696); + float32x4_t v706 = vcombine_f32(v704, v704); + float32x4_t v719 = vcombine_f32(v717, v717); + float32x4_t v732 = vcombine_f32(v730, v730); + float32x4_t v740 = vcombine_f32(v738, v738); + float32x4_t v748 = vcombine_f32(v746, v746); + float32x4_t v2576 = vld1q_f32((const 
float32_t *)v1939); + float32x4_t v2578 = vld1q_f32((const float32_t *)v1948); + float32x4_t v2580 = vld1q_f32((const float32_t *)v1957); + float32x4_t v2582 = vld1q_f32((const float32_t *)v1966); + float32x4_t v2584 = vld1q_f32((const float32_t *)v1975); + float32x4_t v2586 = vld1q_f32((const float32_t *)v1984); + float32x4_t v2588 = vld1q_f32((const float32_t *)v1993); + float32x4_t v2590 = vld1q_f32((const float32_t *)v2002); + float32x4_t v2592 = vld1q_f32((const float32_t *)v2011); + float32x4_t v2594 = vld1q_f32((const float32_t *)v2020); + float32x4_t v2596 = vld1q_f32((const float32_t *)v2029); + float32x4_t v2598 = vld1q_f32((const float32_t *)v2038); + float32x4_t v2600 = vld1q_f32((const float32_t *)v2047); + float32x4_t v2602 = vld1q_f32((const float32_t *)v2056); + float32x4_t v2604 = vld1q_f32((const float32_t *)v2065); + float32x4_t v2606 = vld1q_f32((const float32_t *)v2074); + float32x4_t v2608 = vld1q_f32((const float32_t *)v2083); + float32x4_t v2610 = vld1q_f32((const float32_t *)v2092); + float32x4_t v2612 = vld1q_f32((const float32_t *)v2101); + float32x4_t v2614 = vld1q_f32((const float32_t *)v2110); + float32x4_t v2616 = vld1q_f32((const float32_t *)v2119); + float32x4_t v2618 = vld1q_f32((const float32_t *)v2128); + float32x4_t v2620 = vld1q_f32((const float32_t *)v2137); + float32x4_t v2622 = vld1q_f32((const float32_t *)v2146); + float32x4_t v2624 = vld1q_f32((const float32_t *)v2155); + float32x4_t v2626 = vld1q_f32((const float32_t *)v2164); + float32x4_t v2628 = vld1q_f32((const float32_t *)v2173); + float32x4_t v2630 = vld1q_f32((const float32_t *)v2182); + float32x4_t v2632 = vld1q_f32((const float32_t *)v2191); + float32x4_t v2636 = vld1q_f32((const float32_t *)v2209); + float32x4_t v2638 = vld1q_f32((const float32_t *)v2218); + float32x4_t v2640 = vld1q_f32((const float32_t *)v2227); + float32x4_t v2642 = vld1q_f32((const float32_t *)v2236); + float32x4_t v2644 = vld1q_f32((const float32_t *)v2245); + float32x4_t v35 = 
vaddq_f32(v2574, v2576); + float32x4_t v36 = vsubq_f32(v2574, v2576); + float32x4_t v51 = vaddq_f32(v2578, v2580); + float32x4_t v52 = vsubq_f32(v2578, v2580); + float32x4_t v69 = vaddq_f32(v2582, v2584); + float32x4_t v70 = vsubq_f32(v2582, v2584); + float32x4_t v85 = vaddq_f32(v2586, v2588); + float32x4_t v86 = vsubq_f32(v2586, v2588); + float32x4_t v103 = vaddq_f32(v2590, v2592); + float32x4_t v104 = vsubq_f32(v2590, v2592); + float32x4_t v119 = vaddq_f32(v2594, v2596); + float32x4_t v120 = vsubq_f32(v2594, v2596); + float32x4_t v137 = vaddq_f32(v2598, v2600); + float32x4_t v138 = vsubq_f32(v2598, v2600); + float32x4_t v153 = vaddq_f32(v2602, v2604); + float32x4_t v154 = vsubq_f32(v2602, v2604); + float32x4_t v171 = vaddq_f32(v2606, v2608); + float32x4_t v172 = vsubq_f32(v2606, v2608); + float32x4_t v187 = vaddq_f32(v2610, v2612); + float32x4_t v188 = vsubq_f32(v2610, v2612); + float32x4_t v205 = vaddq_f32(v2614, v2616); + float32x4_t v206 = vsubq_f32(v2614, v2616); + float32x4_t v221 = vaddq_f32(v2618, v2620); + float32x4_t v222 = vsubq_f32(v2618, v2620); + float32x4_t v239 = vaddq_f32(v2622, v2624); + float32x4_t v240 = vsubq_f32(v2622, v2624); + float32x4_t v255 = vaddq_f32(v2626, v2628); + float32x4_t v256 = vsubq_f32(v2626, v2628); + float32x4_t v273 = vaddq_f32(v2630, v2632); + float32x4_t v274 = vsubq_f32(v2630, v2632); + float32x4_t v289 = vaddq_f32(v2634, v2636); + float32x4_t v290 = vsubq_f32(v2634, v2636); + float32x4_t v307 = vaddq_f32(v2638, v2640); + float32x4_t v308 = vsubq_f32(v2638, v2640); + float32x4_t v323 = vaddq_f32(v2642, v2644); + float32x4_t v324 = vsubq_f32(v2642, v2644); + float32x4_t v53 = vaddq_f32(v35, v51); + float32x4_t v54 = vsubq_f32(v35, v51); + float32x4_t v87 = vaddq_f32(v69, v85); + float32x4_t v88 = vsubq_f32(v69, v85); + float32x4_t v121 = vaddq_f32(v103, v119); + float32x4_t v122 = vsubq_f32(v103, v119); + float32x4_t v155 = vaddq_f32(v137, v153); + float32x4_t v156 = vsubq_f32(v137, v153); + float32x4_t v189 = 
vaddq_f32(v171, v187); + float32x4_t v190 = vsubq_f32(v171, v187); + float32x4_t v223 = vaddq_f32(v205, v221); + float32x4_t v224 = vsubq_f32(v205, v221); + float32x4_t v257 = vaddq_f32(v239, v255); + float32x4_t v258 = vsubq_f32(v239, v255); + float32x4_t v291 = vaddq_f32(v273, v289); + float32x4_t v292 = vsubq_f32(v273, v289); + float32x4_t v325 = vaddq_f32(v307, v323); + float32x4_t v326 = vsubq_f32(v307, v323); + float32x4_t v557 = vaddq_f32(v70, v308); + float32x4_t v558 = vsubq_f32(v70, v308); + float32x4_t v559 = vaddq_f32(v274, v104); + float32x4_t v560 = vsubq_f32(v274, v104); + float32x4_t v561 = vaddq_f32(v138, v240); + float32x4_t v562 = vsubq_f32(v138, v240); + float32x4_t v563 = vaddq_f32(v172, v206); + float32x4_t v564 = vsubq_f32(v172, v206); + float32x4_t v672 = vaddq_f32(v86, v324); + float32x4_t v673 = vsubq_f32(v86, v324); + float32x4_t v674 = vaddq_f32(v290, v120); + float32x4_t v675 = vsubq_f32(v290, v120); + float32x4_t v676 = vaddq_f32(v154, v256); + float32x4_t v677 = vsubq_f32(v154, v256); + float32x4_t v678 = vaddq_f32(v188, v222); + float32x4_t v679 = vsubq_f32(v188, v222); + float32x4_t v327 = vaddq_f32(v87, v325); + float32x4_t v328 = vsubq_f32(v87, v325); + float32x4_t v329 = vaddq_f32(v291, v121); + float32x4_t v330 = vsubq_f32(v291, v121); + float32x4_t v331 = vaddq_f32(v155, v257); + float32x4_t v332 = vsubq_f32(v155, v257); + float32x4_t v333 = vaddq_f32(v189, v223); + float32x4_t v334 = vsubq_f32(v189, v223); + float32x4_t v442 = vaddq_f32(v88, v326); + float32x4_t v443 = vsubq_f32(v88, v326); + float32x4_t v444 = vaddq_f32(v292, v122); + float32x4_t v445 = vsubq_f32(v292, v122); + float32x4_t v446 = vaddq_f32(v156, v258); + float32x4_t v447 = vsubq_f32(v156, v258); + float32x4_t v448 = vaddq_f32(v190, v224); + float32x4_t v449 = vsubq_f32(v190, v224); + float32x4_t v565 = vaddq_f32(v557, v559); + float32x4_t v569 = vaddq_f32(v558, v560); + float32x4_t v571 = vsubq_f32(v557, v559); + float32x4_t v572 = vsubq_f32(v559, v563); + 
float32x4_t v573 = vsubq_f32(v563, v557); + float32x4_t v574 = vsubq_f32(v558, v560); + float32x4_t v575 = vsubq_f32(v560, v564); + float32x4_t v576 = vsubq_f32(v564, v558); + float32x4_t v599 = vmulq_f32(v561, v598); + float32x4_t v605 = vrev64q_f32(v562); + float32x4_t v680 = vaddq_f32(v672, v674); + float32x4_t v684 = vaddq_f32(v673, v675); + float32x4_t v686 = vsubq_f32(v672, v674); + float32x4_t v687 = vsubq_f32(v674, v678); + float32x4_t v688 = vsubq_f32(v678, v672); + float32x4_t v689 = vsubq_f32(v673, v675); + float32x4_t v690 = vsubq_f32(v675, v679); + float32x4_t v691 = vsubq_f32(v679, v673); + float32x4_t v718 = vrev64q_f32(v676); + float32x4_t v725 = vmulq_f32(v677, v724); + float32x4_t v335 = vaddq_f32(v327, v329); + float32x4_t v339 = vaddq_f32(v328, v330); + float32x4_t v341 = vsubq_f32(v327, v329); + float32x4_t v342 = vsubq_f32(v329, v333); + float32x4_t v343 = vsubq_f32(v333, v327); + float32x4_t v344 = vsubq_f32(v328, v330); + float32x4_t v345 = vsubq_f32(v330, v334); + float32x4_t v346 = vsubq_f32(v334, v328); + float32x4_t v369 = vmulq_f32(v331, v598); + float32x4_t v375 = vrev64q_f32(v332); + float32x4_t v450 = vaddq_f32(v442, v444); + float32x4_t v454 = vaddq_f32(v443, v445); + float32x4_t v456 = vsubq_f32(v442, v444); + float32x4_t v457 = vsubq_f32(v444, v448); + float32x4_t v458 = vsubq_f32(v448, v442); + float32x4_t v459 = vsubq_f32(v443, v445); + float32x4_t v460 = vsubq_f32(v445, v449); + float32x4_t v461 = vsubq_f32(v449, v443); + float32x4_t v484 = vmulq_f32(v446, v598); + float32x4_t v490 = vrev64q_f32(v447); + float32x4_t v566 = vaddq_f32(v565, v563); + float32x4_t v570 = vaddq_f32(v569, v564); + float32x4_t v607 = vmulq_f32(v605, v606); + float32x4_t v612 = vmulq_f32(v571, v611); + float32x4_t v617 = vmulq_f32(v572, v616); + float32x4_t v622 = vmulq_f32(v573, v621); + float32x4_t v628 = vrev64q_f32(v574); + float32x4_t v636 = vrev64q_f32(v575); + float32x4_t v644 = vrev64q_f32(v576); + float32x4_t v681 = vaddq_f32(v680, v678); + 
float32x4_t v685 = vaddq_f32(v684, v679); + float32x4_t v720 = vmulq_f32(v718, v719); + float32x4_t v731 = vrev64q_f32(v686); + float32x4_t v739 = vrev64q_f32(v687); + float32x4_t v747 = vrev64q_f32(v688); + float32x4_t v754 = vmulq_f32(v689, v753); + float32x4_t v759 = vmulq_f32(v690, v758); + float32x4_t v764 = vmulq_f32(v691, v763); + float32x4_t v336 = vaddq_f32(v335, v333); + float32x4_t v340 = vaddq_f32(v339, v334); + float32x4_t v377 = vmulq_f32(v375, v606); + float32x4_t v382 = vmulq_f32(v341, v611); + float32x4_t v387 = vmulq_f32(v342, v616); + float32x4_t v392 = vmulq_f32(v343, v621); + float32x4_t v398 = vrev64q_f32(v344); + float32x4_t v406 = vrev64q_f32(v345); + float32x4_t v414 = vrev64q_f32(v346); + float32x4_t v451 = vaddq_f32(v450, v448); + float32x4_t v455 = vaddq_f32(v454, v449); + float32x4_t v492 = vmulq_f32(v490, v606); + float32x4_t v497 = vmulq_f32(v456, v611); + float32x4_t v502 = vmulq_f32(v457, v616); + float32x4_t v507 = vmulq_f32(v458, v621); + float32x4_t v513 = vrev64q_f32(v459); + float32x4_t v521 = vrev64q_f32(v460); + float32x4_t v529 = vrev64q_f32(v461); + float32x4_t v567 = vaddq_f32(v566, v561); + float32x4_t v586 = vmulq_f32(v566, v585); + float32x4_t v592 = vrev64q_f32(v570); + float32x4_t v630 = vmulq_f32(v628, v629); + float32x4_t v638 = vmulq_f32(v636, v637); + float32x4_t v646 = vmulq_f32(v644, v645); + float32x4_t v682 = vaddq_f32(v681, v676); + float32x4_t v705 = vrev64q_f32(v681); + float32x4_t v712 = vmulq_f32(v685, v724); + float32x4_t v733 = vmulq_f32(v731, v732); + float32x4_t v741 = vmulq_f32(v739, v740); + float32x4_t v749 = vmulq_f32(v747, v748); + float32x4_t v778 = vaddq_f32(v725, v754); + float32x4_t v780 = vsubq_f32(v725, v759); + float32x4_t v782 = vsubq_f32(v725, v754); + float32x4_t v337 = vaddq_f32(v336, v331); + float32x4_t v356 = vmulq_f32(v336, v585); + float32x4_t v362 = vrev64q_f32(v340); + float32x4_t v400 = vmulq_f32(v398, v629); + float32x4_t v408 = vmulq_f32(v406, v637); + float32x4_t v416 = 
vmulq_f32(v414, v645); + float32x4_t v452 = vaddq_f32(v451, v446); + float32x4_t v471 = vmulq_f32(v451, v585); + float32x4_t v477 = vrev64q_f32(v455); + float32x4_t v515 = vmulq_f32(v513, v629); + float32x4_t v523 = vmulq_f32(v521, v637); + float32x4_t v531 = vmulq_f32(v529, v645); + float32x4_t v568 = vaddq_f32(v567, v36); + float32x4_t v594 = vmulq_f32(v592, v606); + float32x4_t v647 = vaddq_f32(v586, v586); + float32x4_t v660 = vaddq_f32(v607, v630); + float32x4_t v662 = vsubq_f32(v607, v638); + float32x4_t v664 = vsubq_f32(v607, v630); + float32x4_t v683 = vaddq_f32(v682, v52); + float32x4_t v707 = vmulq_f32(v705, v706); + float32x4_t v779 = vaddq_f32(v778, v759); + float32x4_t v781 = vaddq_f32(v780, v764); + float32x4_t v783 = vsubq_f32(v782, v764); + float32x4_t v338 = vaddq_f32(v337, v53); + float32x4_t v364 = vmulq_f32(v362, v606); + float32x4_t v417 = vaddq_f32(v356, v356); + float32x4_t v430 = vaddq_f32(v377, v400); + float32x4_t v432 = vsubq_f32(v377, v408); + float32x4_t v434 = vsubq_f32(v377, v400); + float32x4_t v453 = vaddq_f32(v452, v54); + float32x4_t v479 = vmulq_f32(v477, v606); + float32x4_t v532 = vaddq_f32(v471, v471); + float32x4_t v545 = vaddq_f32(v492, v515); + float32x4_t v547 = vsubq_f32(v492, v523); + float32x4_t v549 = vsubq_f32(v492, v515); + float32x4_t v648 = vaddq_f32(v647, v586); + float32x4_t v652 = vaddq_f32(v568, v599); + float32x4_t v661 = vaddq_f32(v660, v638); + float32x4_t v663 = vaddq_f32(v662, v646); + float32x4_t v665 = vsubq_f32(v664, v646); + float32x4_t v697 = vrev64q_f32(v683); + float32x4_t v765 = vaddq_f32(v707, v707); + float32x4_t v418 = vaddq_f32(v417, v356); + float32x4_t v422 = vaddq_f32(v338, v369); + float32x4_t v431 = vaddq_f32(v430, v408); + float32x4_t v433 = vaddq_f32(v432, v416); + float32x4_t v435 = vsubq_f32(v434, v416); + float32x4_t v533 = vaddq_f32(v532, v471); + float32x4_t v537 = vaddq_f32(v453, v484); + float32x4_t v546 = vaddq_f32(v545, v523); + float32x4_t v548 = vaddq_f32(v547, v531); + 
float32x4_t v550 = vsubq_f32(v549, v531); + float32x4_t v649 = vaddq_f32(v568, v648); + float32x4_t v653 = vaddq_f32(v652, v647); + float32x4_t v699 = vmulq_f32(v697, v698); + float32x4_t v766 = vaddq_f32(v765, v707); + vst1q_f32((float32_t *)v2255, v338); + vst1q_f32((float32_t *)v2273, v453); + float32x4_t v419 = vaddq_f32(v338, v418); + float32x4_t v423 = vaddq_f32(v422, v417); + float32x4_t v534 = vaddq_f32(v453, v533); + float32x4_t v538 = vaddq_f32(v537, v532); + float32x4_t v650 = vaddq_f32(v649, v594); + float32x4_t v651 = vsubq_f32(v649, v594); + float32x4_t v654 = vaddq_f32(v653, v612); + float32x4_t v656 = vsubq_f32(v653, v617); + float32x4_t v658 = vsubq_f32(v653, v612); + float32x4_t v767 = vaddq_f32(v699, v766); + float32x4_t v770 = vaddq_f32(v699, v720); + float32x4_t v790 = vaddq_f32(v568, v699); + float32x4_t v791 = vsubq_f32(v568, v699); + float32x4_t v420 = vaddq_f32(v419, v364); + float32x4_t v421 = vsubq_f32(v419, v364); + float32x4_t v424 = vaddq_f32(v423, v382); + float32x4_t v426 = vsubq_f32(v423, v387); + float32x4_t v428 = vsubq_f32(v423, v382); + float32x4_t v535 = vaddq_f32(v534, v479); + float32x4_t v536 = vsubq_f32(v534, v479); + float32x4_t v539 = vaddq_f32(v538, v497); + float32x4_t v541 = vsubq_f32(v538, v502); + float32x4_t v543 = vsubq_f32(v538, v497); + float32x4_t v655 = vaddq_f32(v654, v617); + float32x4_t v657 = vaddq_f32(v656, v622); + float32x4_t v659 = vsubq_f32(v658, v622); + float32x4_t v768 = vaddq_f32(v767, v712); + float32x4_t v769 = vsubq_f32(v767, v712); + float32x4_t v771 = vaddq_f32(v770, v765); + vst1q_f32((float32_t *)v2264, v791); + vst1q_f32((float32_t *)v2282, v790); + float32x4_t v425 = vaddq_f32(v424, v387); + float32x4_t v427 = vaddq_f32(v426, v392); + float32x4_t v429 = vsubq_f32(v428, v392); + float32x4_t v540 = vaddq_f32(v539, v502); + float32x4_t v542 = vaddq_f32(v541, v507); + float32x4_t v544 = vsubq_f32(v543, v507); + float32x4_t v666 = vaddq_f32(v655, v661); + float32x4_t v667 = vsubq_f32(v655, 
v661); + float32x4_t v668 = vaddq_f32(v657, v663); + float32x4_t v669 = vsubq_f32(v657, v663); + float32x4_t v670 = vaddq_f32(v659, v665); + float32x4_t v671 = vsubq_f32(v659, v665); + float32x4_t v772 = vaddq_f32(v771, v733); + float32x4_t v774 = vsubq_f32(v771, v741); + float32x4_t v776 = vsubq_f32(v771, v733); + float32x4_t v880 = vaddq_f32(v651, v769); + float32x4_t v881 = vsubq_f32(v651, v769); + float32x4_t v970 = vaddq_f32(v650, v768); + float32x4_t v971 = vsubq_f32(v650, v768); + vst1q_f32((float32_t *)v2363, v421); + vst1q_f32((float32_t *)v2381, v536); + vst1q_f32((float32_t *)v2471, v420); + vst1q_f32((float32_t *)v2489, v535); + float32x4_t v436 = vaddq_f32(v425, v431); + float32x4_t v437 = vsubq_f32(v425, v431); + float32x4_t v438 = vaddq_f32(v427, v433); + float32x4_t v439 = vsubq_f32(v427, v433); + float32x4_t v440 = vaddq_f32(v429, v435); + float32x4_t v441 = vsubq_f32(v429, v435); + float32x4_t v551 = vaddq_f32(v540, v546); + float32x4_t v552 = vsubq_f32(v540, v546); + float32x4_t v553 = vaddq_f32(v542, v548); + float32x4_t v554 = vsubq_f32(v542, v548); + float32x4_t v555 = vaddq_f32(v544, v550); + float32x4_t v556 = vsubq_f32(v544, v550); + float32x4_t v773 = vaddq_f32(v772, v741); + float32x4_t v775 = vaddq_f32(v774, v749); + float32x4_t v777 = vsubq_f32(v776, v749); + vst1q_f32((float32_t *)v2372, v881); + vst1q_f32((float32_t *)v2390, v880); + vst1q_f32((float32_t *)v2480, v971); + vst1q_f32((float32_t *)v2498, v970); + float32x4_t v784 = vaddq_f32(v773, v779); + float32x4_t v785 = vsubq_f32(v773, v779); + float32x4_t v786 = vaddq_f32(v775, v781); + float32x4_t v787 = vsubq_f32(v775, v781); + float32x4_t v788 = vaddq_f32(v777, v783); + float32x4_t v789 = vsubq_f32(v777, v783); + vst1q_f32((float32_t *)v2291, v437); + vst1q_f32((float32_t *)v2309, v552); + vst1q_f32((float32_t *)v2327, v438); + vst1q_f32((float32_t *)v2345, v553); + vst1q_f32((float32_t *)v2399, v441); + vst1q_f32((float32_t *)v2417, v556); + vst1q_f32((float32_t *)v2435, v440); 
+ vst1q_f32((float32_t *)v2453, v555); + vst1q_f32((float32_t *)v2507, v439); + vst1q_f32((float32_t *)v2525, v554); + vst1q_f32((float32_t *)v2543, v436); + vst1q_f32((float32_t *)v2561, v551); + float32x4_t v820 = vaddq_f32(v667, v785); + float32x4_t v821 = vsubq_f32(v667, v785); + float32x4_t v850 = vaddq_f32(v668, v786); + float32x4_t v851 = vsubq_f32(v668, v786); + float32x4_t v910 = vaddq_f32(v671, v789); + float32x4_t v911 = vsubq_f32(v671, v789); + float32x4_t v940 = vaddq_f32(v670, v788); + float32x4_t v941 = vsubq_f32(v670, v788); + float32x4_t v1000 = vaddq_f32(v669, v787); + float32x4_t v1001 = vsubq_f32(v669, v787); + float32x4_t v1030 = vaddq_f32(v666, v784); + float32x4_t v1031 = vsubq_f32(v666, v784); + vst1q_f32((float32_t *)v2300, v821); + vst1q_f32((float32_t *)v2318, v820); + vst1q_f32((float32_t *)v2336, v851); + vst1q_f32((float32_t *)v2354, v850); + vst1q_f32((float32_t *)v2408, v911); + vst1q_f32((float32_t *)v2426, v910); + vst1q_f32((float32_t *)v2444, v941); + vst1q_f32((float32_t *)v2462, v940); + vst1q_f32((float32_t *)v2516, v1001); + vst1q_f32((float32_t *)v2534, v1000); + vst1q_f32((float32_t *)v2552, v1031); + vst1q_f32((float32_t *)v2570, v1030); + v5 += 2 * 1; + v6 += 2 * 1; + } + for (int j = v1060 * 2; j < howmany; j += 1) { + float32x2_t v1266 = v5[istride]; + float v1549 = 8.6602540378443871e-01F; + float v1568 = 6.4278760968653925e-01F; + float v1575 = -3.4202014332566888e-01F; + float v1582 = 9.8480775301220802e-01F; + float v1634 = 1.0000000000000000e+00F; + float v1635 = -1.0000000000000000e+00F; + float v1641 = -5.0000000000000000e-01F; + float v1642 = 5.0000000000000000e-01F; + float v1652 = -1.4999999999999998e+00F; + float v1653 = 1.4999999999999998e+00F; + float v1660 = -8.6602540378443871e-01F; + float v1663 = 7.6604444311897801e-01F; + float v1664 = -7.6604444311897801e-01F; + float v1670 = 9.3969262078590832e-01F; + float v1671 = -9.3969262078590832e-01F; + float v1677 = -1.7364817766693039e-01F; + float v1678 = 
1.7364817766693039e-01F; + float32x2_t v1680 = (float32x2_t){v4, v4}; + float v1685 = -6.4278760968653925e-01F; + float v1689 = 3.4202014332566888e-01F; + float v1693 = -9.8480775301220802e-01F; + float32x2_t v1072 = v5[0]; + float32x2_t v1536 = (float32x2_t){v1641, v1641}; + float32x2_t v1547 = (float32x2_t){v1652, v1652}; + float32x2_t v1551 = (float32x2_t){v1549, v1660}; + float32x2_t v1558 = (float32x2_t){v1663, v1663}; + float32x2_t v1562 = (float32x2_t){v1670, v1670}; + float32x2_t v1566 = (float32x2_t){v1677, v1677}; + float32x2_t v1570 = (float32x2_t){v1568, v1685}; + float32x2_t v1577 = (float32x2_t){v1575, v1689}; + float32x2_t v1584 = (float32x2_t){v1582, v1693}; + float32x2_t v1636 = (float32x2_t){v1634, v1635}; + float32x2_t v1643 = (float32x2_t){v1641, v1642}; + float32x2_t v1654 = (float32x2_t){v1652, v1653}; + float32x2_t v1661 = (float32x2_t){v1660, v1660}; + float32x2_t v1665 = (float32x2_t){v1663, v1664}; + float32x2_t v1672 = (float32x2_t){v1670, v1671}; + float32x2_t v1679 = (float32x2_t){v1677, v1678}; + float32x2_t v1686 = (float32x2_t){v1685, v1685}; + float32x2_t v1690 = (float32x2_t){v1689, v1689}; + float32x2_t v1694 = (float32x2_t){v1693, v1693}; + float32x2_t v1077 = v5[istride * 18]; + float32x2_t v1084 = v5[istride * 9]; + float32x2_t v1089 = v5[istride * 27]; + float32x2_t v1098 = v5[istride * 4]; + float32x2_t v1103 = v5[istride * 22]; + float32x2_t v1110 = v5[istride * 13]; + float32x2_t v1115 = v5[istride * 31]; + float32x2_t v1124 = v5[istride * 8]; + float32x2_t v1129 = v5[istride * 26]; + float32x2_t v1136 = v5[istride * 17]; + float32x2_t v1141 = v5[istride * 35]; + float32x2_t v1150 = v5[istride * 12]; + float32x2_t v1155 = v5[istride * 30]; + float32x2_t v1162 = v5[istride * 21]; + float32x2_t v1167 = v5[istride * 3]; + float32x2_t v1176 = v5[istride * 16]; + float32x2_t v1181 = v5[istride * 34]; + float32x2_t v1188 = v5[istride * 25]; + float32x2_t v1193 = v5[istride * 7]; + float32x2_t v1202 = v5[istride * 20]; + 
float32x2_t v1207 = v5[istride * 2]; + float32x2_t v1214 = v5[istride * 29]; + float32x2_t v1219 = v5[istride * 11]; + float32x2_t v1228 = v5[istride * 24]; + float32x2_t v1233 = v5[istride * 6]; + float32x2_t v1240 = v5[istride * 33]; + float32x2_t v1245 = v5[istride * 15]; + float32x2_t v1254 = v5[istride * 28]; + float32x2_t v1259 = v5[istride * 10]; + float32x2_t v1271 = v5[istride * 19]; + float32x2_t v1280 = v5[istride * 32]; + float32x2_t v1285 = v5[istride * 14]; + float32x2_t v1292 = v5[istride * 5]; + float32x2_t v1297 = v5[istride * 23]; + float32x2_t v1553 = vmul_f32(v1680, v1551); + float32x2_t v1572 = vmul_f32(v1680, v1570); + float32x2_t v1579 = vmul_f32(v1680, v1577); + float32x2_t v1586 = vmul_f32(v1680, v1584); + float32x2_t v1638 = vmul_f32(v1680, v1636); + float32x2_t v1645 = vmul_f32(v1680, v1643); + float32x2_t v1656 = vmul_f32(v1680, v1654); + float32x2_t v1667 = vmul_f32(v1680, v1665); + float32x2_t v1674 = vmul_f32(v1680, v1672); + float32x2_t v1681 = vmul_f32(v1680, v1679); + float32x2_t v1078 = vadd_f32(v1072, v1077); + float32x2_t v1079 = vsub_f32(v1072, v1077); + float32x2_t v1090 = vadd_f32(v1084, v1089); + float32x2_t v1091 = vsub_f32(v1084, v1089); + float32x2_t v1104 = vadd_f32(v1098, v1103); + float32x2_t v1105 = vsub_f32(v1098, v1103); + float32x2_t v1116 = vadd_f32(v1110, v1115); + float32x2_t v1117 = vsub_f32(v1110, v1115); + float32x2_t v1130 = vadd_f32(v1124, v1129); + float32x2_t v1131 = vsub_f32(v1124, v1129); + float32x2_t v1142 = vadd_f32(v1136, v1141); + float32x2_t v1143 = vsub_f32(v1136, v1141); + float32x2_t v1156 = vadd_f32(v1150, v1155); + float32x2_t v1157 = vsub_f32(v1150, v1155); + float32x2_t v1168 = vadd_f32(v1162, v1167); + float32x2_t v1169 = vsub_f32(v1162, v1167); + float32x2_t v1182 = vadd_f32(v1176, v1181); + float32x2_t v1183 = vsub_f32(v1176, v1181); + float32x2_t v1194 = vadd_f32(v1188, v1193); + float32x2_t v1195 = vsub_f32(v1188, v1193); + float32x2_t v1208 = vadd_f32(v1202, v1207); + float32x2_t 
v1209 = vsub_f32(v1202, v1207); + float32x2_t v1220 = vadd_f32(v1214, v1219); + float32x2_t v1221 = vsub_f32(v1214, v1219); + float32x2_t v1234 = vadd_f32(v1228, v1233); + float32x2_t v1235 = vsub_f32(v1228, v1233); + float32x2_t v1246 = vadd_f32(v1240, v1245); + float32x2_t v1247 = vsub_f32(v1240, v1245); + float32x2_t v1260 = vadd_f32(v1254, v1259); + float32x2_t v1261 = vsub_f32(v1254, v1259); + float32x2_t v1272 = vadd_f32(v1266, v1271); + float32x2_t v1273 = vsub_f32(v1266, v1271); + float32x2_t v1286 = vadd_f32(v1280, v1285); + float32x2_t v1287 = vsub_f32(v1280, v1285); + float32x2_t v1298 = vadd_f32(v1292, v1297); + float32x2_t v1299 = vsub_f32(v1292, v1297); + float32x2_t v1092 = vadd_f32(v1078, v1090); + float32x2_t v1093 = vsub_f32(v1078, v1090); + float32x2_t v1118 = vadd_f32(v1104, v1116); + float32x2_t v1119 = vsub_f32(v1104, v1116); + float32x2_t v1144 = vadd_f32(v1130, v1142); + float32x2_t v1145 = vsub_f32(v1130, v1142); + float32x2_t v1170 = vadd_f32(v1156, v1168); + float32x2_t v1171 = vsub_f32(v1156, v1168); + float32x2_t v1196 = vadd_f32(v1182, v1194); + float32x2_t v1197 = vsub_f32(v1182, v1194); + float32x2_t v1222 = vadd_f32(v1208, v1220); + float32x2_t v1223 = vsub_f32(v1208, v1220); + float32x2_t v1248 = vadd_f32(v1234, v1246); + float32x2_t v1249 = vsub_f32(v1234, v1246); + float32x2_t v1274 = vadd_f32(v1260, v1272); + float32x2_t v1275 = vsub_f32(v1260, v1272); + float32x2_t v1300 = vadd_f32(v1286, v1298); + float32x2_t v1301 = vsub_f32(v1286, v1298); + float32x2_t v1510 = vadd_f32(v1105, v1287); + float32x2_t v1511 = vsub_f32(v1105, v1287); + float32x2_t v1512 = vadd_f32(v1261, v1131); + float32x2_t v1513 = vsub_f32(v1261, v1131); + float32x2_t v1514 = vadd_f32(v1157, v1235); + float32x2_t v1515 = vsub_f32(v1157, v1235); + float32x2_t v1516 = vadd_f32(v1183, v1209); + float32x2_t v1517 = vsub_f32(v1183, v1209); + float32x2_t v1614 = vadd_f32(v1117, v1299); + float32x2_t v1615 = vsub_f32(v1117, v1299); + float32x2_t v1616 = 
vadd_f32(v1273, v1143); + float32x2_t v1617 = vsub_f32(v1273, v1143); + float32x2_t v1618 = vadd_f32(v1169, v1247); + float32x2_t v1619 = vsub_f32(v1169, v1247); + float32x2_t v1620 = vadd_f32(v1195, v1221); + float32x2_t v1621 = vsub_f32(v1195, v1221); + float32x2_t v1302 = vadd_f32(v1118, v1300); + float32x2_t v1303 = vsub_f32(v1118, v1300); + float32x2_t v1304 = vadd_f32(v1274, v1144); + float32x2_t v1305 = vsub_f32(v1274, v1144); + float32x2_t v1306 = vadd_f32(v1170, v1248); + float32x2_t v1307 = vsub_f32(v1170, v1248); + float32x2_t v1308 = vadd_f32(v1196, v1222); + float32x2_t v1309 = vsub_f32(v1196, v1222); + float32x2_t v1406 = vadd_f32(v1119, v1301); + float32x2_t v1407 = vsub_f32(v1119, v1301); + float32x2_t v1408 = vadd_f32(v1275, v1145); + float32x2_t v1409 = vsub_f32(v1275, v1145); + float32x2_t v1410 = vadd_f32(v1171, v1249); + float32x2_t v1411 = vsub_f32(v1171, v1249); + float32x2_t v1412 = vadd_f32(v1197, v1223); + float32x2_t v1413 = vsub_f32(v1197, v1223); + float32x2_t v1518 = vadd_f32(v1510, v1512); + float32x2_t v1522 = vadd_f32(v1511, v1513); + float32x2_t v1524 = vsub_f32(v1510, v1512); + float32x2_t v1525 = vsub_f32(v1512, v1516); + float32x2_t v1526 = vsub_f32(v1516, v1510); + float32x2_t v1527 = vsub_f32(v1511, v1513); + float32x2_t v1528 = vsub_f32(v1513, v1517); + float32x2_t v1529 = vsub_f32(v1517, v1511); + float32x2_t v1548 = vmul_f32(v1514, v1547); + float32x2_t v1554 = vrev64_f32(v1515); + float32x2_t v1622 = vadd_f32(v1614, v1616); + float32x2_t v1626 = vadd_f32(v1615, v1617); + float32x2_t v1628 = vsub_f32(v1614, v1616); + float32x2_t v1629 = vsub_f32(v1616, v1620); + float32x2_t v1630 = vsub_f32(v1620, v1614); + float32x2_t v1631 = vsub_f32(v1615, v1617); + float32x2_t v1632 = vsub_f32(v1617, v1621); + float32x2_t v1633 = vsub_f32(v1621, v1615); + float32x2_t v1657 = vrev64_f32(v1618); + float32x2_t v1662 = vmul_f32(v1619, v1661); + float32x2_t v1310 = vadd_f32(v1302, v1304); + float32x2_t v1314 = vadd_f32(v1303, v1305); + 
float32x2_t v1316 = vsub_f32(v1302, v1304); + float32x2_t v1317 = vsub_f32(v1304, v1308); + float32x2_t v1318 = vsub_f32(v1308, v1302); + float32x2_t v1319 = vsub_f32(v1303, v1305); + float32x2_t v1320 = vsub_f32(v1305, v1309); + float32x2_t v1321 = vsub_f32(v1309, v1303); + float32x2_t v1340 = vmul_f32(v1306, v1547); + float32x2_t v1346 = vrev64_f32(v1307); + float32x2_t v1414 = vadd_f32(v1406, v1408); + float32x2_t v1418 = vadd_f32(v1407, v1409); + float32x2_t v1420 = vsub_f32(v1406, v1408); + float32x2_t v1421 = vsub_f32(v1408, v1412); + float32x2_t v1422 = vsub_f32(v1412, v1406); + float32x2_t v1423 = vsub_f32(v1407, v1409); + float32x2_t v1424 = vsub_f32(v1409, v1413); + float32x2_t v1425 = vsub_f32(v1413, v1407); + float32x2_t v1444 = vmul_f32(v1410, v1547); + float32x2_t v1450 = vrev64_f32(v1411); + float32x2_t v1519 = vadd_f32(v1518, v1516); + float32x2_t v1523 = vadd_f32(v1522, v1517); + float32x2_t v1555 = vmul_f32(v1554, v1553); + float32x2_t v1559 = vmul_f32(v1524, v1558); + float32x2_t v1563 = vmul_f32(v1525, v1562); + float32x2_t v1567 = vmul_f32(v1526, v1566); + float32x2_t v1573 = vrev64_f32(v1527); + float32x2_t v1580 = vrev64_f32(v1528); + float32x2_t v1587 = vrev64_f32(v1529); + float32x2_t v1623 = vadd_f32(v1622, v1620); + float32x2_t v1627 = vadd_f32(v1626, v1621); + float32x2_t v1658 = vmul_f32(v1657, v1656); + float32x2_t v1668 = vrev64_f32(v1628); + float32x2_t v1675 = vrev64_f32(v1629); + float32x2_t v1682 = vrev64_f32(v1630); + float32x2_t v1687 = vmul_f32(v1631, v1686); + float32x2_t v1691 = vmul_f32(v1632, v1690); + float32x2_t v1695 = vmul_f32(v1633, v1694); + float32x2_t v1311 = vadd_f32(v1310, v1308); + float32x2_t v1315 = vadd_f32(v1314, v1309); + float32x2_t v1347 = vmul_f32(v1346, v1553); + float32x2_t v1351 = vmul_f32(v1316, v1558); + float32x2_t v1355 = vmul_f32(v1317, v1562); + float32x2_t v1359 = vmul_f32(v1318, v1566); + float32x2_t v1365 = vrev64_f32(v1319); + float32x2_t v1372 = vrev64_f32(v1320); + float32x2_t v1379 = 
vrev64_f32(v1321); + float32x2_t v1415 = vadd_f32(v1414, v1412); + float32x2_t v1419 = vadd_f32(v1418, v1413); + float32x2_t v1451 = vmul_f32(v1450, v1553); + float32x2_t v1455 = vmul_f32(v1420, v1558); + float32x2_t v1459 = vmul_f32(v1421, v1562); + float32x2_t v1463 = vmul_f32(v1422, v1566); + float32x2_t v1469 = vrev64_f32(v1423); + float32x2_t v1476 = vrev64_f32(v1424); + float32x2_t v1483 = vrev64_f32(v1425); + float32x2_t v1520 = vadd_f32(v1519, v1514); + float32x2_t v1537 = vmul_f32(v1519, v1536); + float32x2_t v1543 = vrev64_f32(v1523); + float32x2_t v1574 = vmul_f32(v1573, v1572); + float32x2_t v1581 = vmul_f32(v1580, v1579); + float32x2_t v1588 = vmul_f32(v1587, v1586); + float32x2_t v1624 = vadd_f32(v1623, v1618); + float32x2_t v1646 = vrev64_f32(v1623); + float32x2_t v1651 = vmul_f32(v1627, v1661); + float32x2_t v1669 = vmul_f32(v1668, v1667); + float32x2_t v1676 = vmul_f32(v1675, v1674); + float32x2_t v1683 = vmul_f32(v1682, v1681); + float32x2_t v1709 = vadd_f32(v1662, v1687); + float32x2_t v1711 = vsub_f32(v1662, v1691); + float32x2_t v1713 = vsub_f32(v1662, v1687); + float32x2_t v1312 = vadd_f32(v1311, v1306); + float32x2_t v1329 = vmul_f32(v1311, v1536); + float32x2_t v1335 = vrev64_f32(v1315); + float32x2_t v1366 = vmul_f32(v1365, v1572); + float32x2_t v1373 = vmul_f32(v1372, v1579); + float32x2_t v1380 = vmul_f32(v1379, v1586); + float32x2_t v1416 = vadd_f32(v1415, v1410); + float32x2_t v1433 = vmul_f32(v1415, v1536); + float32x2_t v1439 = vrev64_f32(v1419); + float32x2_t v1470 = vmul_f32(v1469, v1572); + float32x2_t v1477 = vmul_f32(v1476, v1579); + float32x2_t v1484 = vmul_f32(v1483, v1586); + float32x2_t v1521 = vadd_f32(v1520, v1079); + float32x2_t v1544 = vmul_f32(v1543, v1553); + float32x2_t v1589 = vadd_f32(v1537, v1537); + float32x2_t v1602 = vadd_f32(v1555, v1574); + float32x2_t v1604 = vsub_f32(v1555, v1581); + float32x2_t v1606 = vsub_f32(v1555, v1574); + float32x2_t v1625 = vadd_f32(v1624, v1091); + float32x2_t v1647 = vmul_f32(v1646, 
v1645); + float32x2_t v1710 = vadd_f32(v1709, v1691); + float32x2_t v1712 = vadd_f32(v1711, v1695); + float32x2_t v1714 = vsub_f32(v1713, v1695); + float32x2_t v1313 = vadd_f32(v1312, v1092); + float32x2_t v1336 = vmul_f32(v1335, v1553); + float32x2_t v1381 = vadd_f32(v1329, v1329); + float32x2_t v1394 = vadd_f32(v1347, v1366); + float32x2_t v1396 = vsub_f32(v1347, v1373); + float32x2_t v1398 = vsub_f32(v1347, v1366); + float32x2_t v1417 = vadd_f32(v1416, v1093); + float32x2_t v1440 = vmul_f32(v1439, v1553); + float32x2_t v1485 = vadd_f32(v1433, v1433); + float32x2_t v1498 = vadd_f32(v1451, v1470); + float32x2_t v1500 = vsub_f32(v1451, v1477); + float32x2_t v1502 = vsub_f32(v1451, v1470); + float32x2_t v1590 = vadd_f32(v1589, v1537); + float32x2_t v1594 = vadd_f32(v1521, v1548); + float32x2_t v1603 = vadd_f32(v1602, v1581); + float32x2_t v1605 = vadd_f32(v1604, v1588); + float32x2_t v1607 = vsub_f32(v1606, v1588); + float32x2_t v1639 = vrev64_f32(v1625); + float32x2_t v1696 = vadd_f32(v1647, v1647); + float32x2_t v1382 = vadd_f32(v1381, v1329); + float32x2_t v1386 = vadd_f32(v1313, v1340); + float32x2_t v1395 = vadd_f32(v1394, v1373); + float32x2_t v1397 = vadd_f32(v1396, v1380); + float32x2_t v1399 = vsub_f32(v1398, v1380); + float32x2_t v1486 = vadd_f32(v1485, v1433); + float32x2_t v1490 = vadd_f32(v1417, v1444); + float32x2_t v1499 = vadd_f32(v1498, v1477); + float32x2_t v1501 = vadd_f32(v1500, v1484); + float32x2_t v1503 = vsub_f32(v1502, v1484); + float32x2_t v1591 = vadd_f32(v1521, v1590); + float32x2_t v1595 = vadd_f32(v1594, v1589); + float32x2_t v1640 = vmul_f32(v1639, v1638); + float32x2_t v1697 = vadd_f32(v1696, v1647); + v6[0] = v1313; + v6[ostride * 18] = v1417; + float32x2_t v1383 = vadd_f32(v1313, v1382); + float32x2_t v1387 = vadd_f32(v1386, v1381); + float32x2_t v1487 = vadd_f32(v1417, v1486); + float32x2_t v1491 = vadd_f32(v1490, v1485); + float32x2_t v1592 = vadd_f32(v1591, v1544); + float32x2_t v1593 = vsub_f32(v1591, v1544); + float32x2_t v1596 
= vadd_f32(v1595, v1559); + float32x2_t v1598 = vsub_f32(v1595, v1563); + float32x2_t v1600 = vsub_f32(v1595, v1559); + float32x2_t v1698 = vadd_f32(v1640, v1697); + float32x2_t v1701 = vadd_f32(v1640, v1658); + float32x2_t v1721 = vadd_f32(v1521, v1640); + float32x2_t v1722 = vsub_f32(v1521, v1640); + float32x2_t v1384 = vadd_f32(v1383, v1336); + float32x2_t v1385 = vsub_f32(v1383, v1336); + float32x2_t v1388 = vadd_f32(v1387, v1351); + float32x2_t v1390 = vsub_f32(v1387, v1355); + float32x2_t v1392 = vsub_f32(v1387, v1351); + float32x2_t v1488 = vadd_f32(v1487, v1440); + float32x2_t v1489 = vsub_f32(v1487, v1440); + float32x2_t v1492 = vadd_f32(v1491, v1455); + float32x2_t v1494 = vsub_f32(v1491, v1459); + float32x2_t v1496 = vsub_f32(v1491, v1455); + float32x2_t v1597 = vadd_f32(v1596, v1563); + float32x2_t v1599 = vadd_f32(v1598, v1567); + float32x2_t v1601 = vsub_f32(v1600, v1567); + float32x2_t v1699 = vadd_f32(v1698, v1651); + float32x2_t v1700 = vsub_f32(v1698, v1651); + float32x2_t v1702 = vadd_f32(v1701, v1696); + v6[ostride * 9] = v1722; + v6[ostride * 27] = v1721; + float32x2_t v1389 = vadd_f32(v1388, v1355); + float32x2_t v1391 = vadd_f32(v1390, v1359); + float32x2_t v1393 = vsub_f32(v1392, v1359); + float32x2_t v1493 = vadd_f32(v1492, v1459); + float32x2_t v1495 = vadd_f32(v1494, v1463); + float32x2_t v1497 = vsub_f32(v1496, v1463); + float32x2_t v1608 = vadd_f32(v1597, v1603); + float32x2_t v1609 = vsub_f32(v1597, v1603); + float32x2_t v1610 = vadd_f32(v1599, v1605); + float32x2_t v1611 = vsub_f32(v1599, v1605); + float32x2_t v1612 = vadd_f32(v1601, v1607); + float32x2_t v1613 = vsub_f32(v1601, v1607); + float32x2_t v1703 = vadd_f32(v1702, v1669); + float32x2_t v1705 = vsub_f32(v1702, v1676); + float32x2_t v1707 = vsub_f32(v1702, v1669); + float32x2_t v1787 = vadd_f32(v1593, v1700); + float32x2_t v1788 = vsub_f32(v1593, v1700); + v6[ostride * 12] = v1385; + v6[ostride * 30] = v1489; + float32x2_t v1853 = vadd_f32(v1592, v1699); + float32x2_t v1854 = 
vsub_f32(v1592, v1699); + v6[ostride * 24] = v1384; + v6[ostride * 6] = v1488; + float32x2_t v1400 = vadd_f32(v1389, v1395); + float32x2_t v1401 = vsub_f32(v1389, v1395); + float32x2_t v1402 = vadd_f32(v1391, v1397); + float32x2_t v1403 = vsub_f32(v1391, v1397); + float32x2_t v1404 = vadd_f32(v1393, v1399); + float32x2_t v1405 = vsub_f32(v1393, v1399); + float32x2_t v1504 = vadd_f32(v1493, v1499); + float32x2_t v1505 = vsub_f32(v1493, v1499); + float32x2_t v1506 = vadd_f32(v1495, v1501); + float32x2_t v1507 = vsub_f32(v1495, v1501); + float32x2_t v1508 = vadd_f32(v1497, v1503); + float32x2_t v1509 = vsub_f32(v1497, v1503); + float32x2_t v1704 = vadd_f32(v1703, v1676); + float32x2_t v1706 = vadd_f32(v1705, v1683); + float32x2_t v1708 = vsub_f32(v1707, v1683); + v6[ostride * 21] = v1788; + v6[ostride * 3] = v1787; + v6[ostride * 33] = v1854; + v6[ostride * 15] = v1853; + float32x2_t v1715 = vadd_f32(v1704, v1710); + float32x2_t v1716 = vsub_f32(v1704, v1710); + float32x2_t v1717 = vadd_f32(v1706, v1712); + float32x2_t v1718 = vsub_f32(v1706, v1712); + float32x2_t v1719 = vadd_f32(v1708, v1714); + float32x2_t v1720 = vsub_f32(v1708, v1714); + v6[ostride * 28] = v1401; + v6[ostride * 10] = v1505; + v6[ostride * 20] = v1402; + v6[ostride * 2] = v1506; + v6[ostride * 4] = v1405; + v6[ostride * 22] = v1509; + v6[ostride * 32] = v1404; + v6[ostride * 14] = v1508; + v6[ostride * 16] = v1403; + v6[ostride * 34] = v1507; + v6[ostride * 8] = v1400; + v6[ostride * 26] = v1504; + float32x2_t v1743 = vadd_f32(v1609, v1716); + float32x2_t v1744 = vsub_f32(v1609, v1716); + float32x2_t v1765 = vadd_f32(v1610, v1717); + float32x2_t v1766 = vsub_f32(v1610, v1717); + float32x2_t v1809 = vadd_f32(v1613, v1720); + float32x2_t v1810 = vsub_f32(v1613, v1720); + float32x2_t v1831 = vadd_f32(v1612, v1719); + float32x2_t v1832 = vsub_f32(v1612, v1719); + float32x2_t v1875 = vadd_f32(v1611, v1718); + float32x2_t v1876 = vsub_f32(v1611, v1718); + float32x2_t v1897 = vadd_f32(v1608, v1715); + 
float32x2_t v1898 = vsub_f32(v1608, v1715); + v6[ostride] = v1744; + v6[ostride * 19] = v1743; + v6[ostride * 29] = v1766; + v6[ostride * 11] = v1765; + v6[ostride * 13] = v1810; + v6[ostride * 31] = v1809; + v6[ostride * 5] = v1832; + v6[ostride * 23] = v1831; + v6[ostride * 25] = v1876; + v6[ostride * 7] = v1875; + v6[ostride * 17] = v1898; + v6[ostride * 35] = v1897; + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu36(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v570 = -5.0000000000000000e-01F; + float v582 = -1.4999999999999998e+00F; + float v594 = 7.6604444311897801e-01F; + float v599 = 9.3969262078590832e-01F; + float v604 = -1.7364817766693039e-01F; + float v675 = -1.0000000000000000e+00F; + float v682 = 5.0000000000000000e-01F; + float v694 = 1.4999999999999998e+00F; + float v701 = -8.6602540378443871e-01F; + float v706 = -7.6604444311897801e-01F; + float v713 = -9.3969262078590832e-01F; + float v720 = 1.7364817766693039e-01F; + float v727 = -6.4278760968653925e-01F; + float v732 = 3.4202014332566888e-01F; + float v737 = -9.8480775301220802e-01F; + const float32x2_t *v1313 = &v5[v0]; + float32x2_t *v1457 = &v6[v2]; + int64_t v26 = v0 * 18; + int64_t v35 = v0 * 9; + int64_t v42 = v0 * 27; + int64_t v53 = v0 * 4; + int64_t v60 = v0 * 22; + int64_t v69 = v0 * 13; + int64_t v76 = v0 * 31; + int64_t v87 = v0 * 8; + int64_t v94 = v0 * 26; + int64_t v103 = v0 * 17; + int64_t v110 = v0 * 35; + int64_t v121 = v0 * 12; + int64_t v128 = v0 * 30; + int64_t v137 = v0 * 21; + 
int64_t v144 = v0 * 3; + int64_t v155 = v0 * 16; + int64_t v162 = v0 * 34; + int64_t v171 = v0 * 25; + int64_t v178 = v0 * 7; + int64_t v189 = v0 * 20; + int64_t v196 = v0 * 2; + int64_t v205 = v0 * 29; + int64_t v212 = v0 * 11; + int64_t v223 = v0 * 24; + int64_t v230 = v0 * 6; + int64_t v239 = v0 * 33; + int64_t v246 = v0 * 15; + int64_t v257 = v0 * 28; + int64_t v264 = v0 * 10; + int64_t v280 = v0 * 19; + int64_t v291 = v0 * 32; + int64_t v298 = v0 * 14; + int64_t v307 = v0 * 5; + int64_t v314 = v0 * 23; + float v590 = v4 * v701; + float v612 = v4 * v727; + float v619 = v4 * v732; + float v626 = v4 * v737; + float v678 = v4 * v675; + float v685 = v4 * v682; + float v697 = v4 * v694; + float v709 = v4 * v706; + float v716 = v4 * v713; + float v723 = v4 * v720; + int64_t v776 = v2 * 9; + int64_t v783 = v2 * 18; + int64_t v790 = v2 * 27; + int64_t v799 = v2 * 28; + int64_t v813 = v2 * 10; + int64_t v820 = v2 * 19; + int64_t v829 = v2 * 20; + int64_t v836 = v2 * 29; + int64_t v843 = v2 * 2; + int64_t v850 = v2 * 11; + int64_t v859 = v2 * 12; + int64_t v866 = v2 * 21; + int64_t v873 = v2 * 30; + int64_t v880 = v2 * 3; + int64_t v889 = v2 * 4; + int64_t v896 = v2 * 13; + int64_t v903 = v2 * 22; + int64_t v910 = v2 * 31; + int64_t v919 = v2 * 32; + int64_t v926 = v2 * 5; + int64_t v933 = v2 * 14; + int64_t v940 = v2 * 23; + int64_t v949 = v2 * 24; + int64_t v956 = v2 * 33; + int64_t v963 = v2 * 6; + int64_t v970 = v2 * 15; + int64_t v979 = v2 * 16; + int64_t v986 = v2 * 25; + int64_t v993 = v2 * 34; + int64_t v1000 = v2 * 7; + int64_t v1009 = v2 * 8; + int64_t v1016 = v2 * 17; + int64_t v1023 = v2 * 26; + int64_t v1030 = v2 * 35; + const float32x2_t *v1043 = &v5[0]; + svfloat32_t v1384 = svdup_n_f32(v570); + svfloat32_t v1386 = svdup_n_f32(v582); + svfloat32_t v1388 = svdup_n_f32(v594); + svfloat32_t v1389 = svdup_n_f32(v599); + svfloat32_t v1390 = svdup_n_f32(v604); + svfloat32_t v1398 = svdup_n_f32(v701); + svfloat32_t v1402 = svdup_n_f32(v727); + svfloat32_t v1403 = 
svdup_n_f32(v732); + svfloat32_t v1404 = svdup_n_f32(v737); + float32x2_t *v1412 = &v6[0]; + svfloat32_t v1791 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1313)[0])); + const float32x2_t *v1052 = &v5[v26]; + const float32x2_t *v1061 = &v5[v35]; + const float32x2_t *v1070 = &v5[v42]; + const float32x2_t *v1079 = &v5[v53]; + const float32x2_t *v1088 = &v5[v60]; + const float32x2_t *v1097 = &v5[v69]; + const float32x2_t *v1106 = &v5[v76]; + const float32x2_t *v1115 = &v5[v87]; + const float32x2_t *v1124 = &v5[v94]; + const float32x2_t *v1133 = &v5[v103]; + const float32x2_t *v1142 = &v5[v110]; + const float32x2_t *v1151 = &v5[v121]; + const float32x2_t *v1160 = &v5[v128]; + const float32x2_t *v1169 = &v5[v137]; + const float32x2_t *v1178 = &v5[v144]; + const float32x2_t *v1187 = &v5[v155]; + const float32x2_t *v1196 = &v5[v162]; + const float32x2_t *v1205 = &v5[v171]; + const float32x2_t *v1214 = &v5[v178]; + const float32x2_t *v1223 = &v5[v189]; + const float32x2_t *v1232 = &v5[v196]; + const float32x2_t *v1241 = &v5[v205]; + const float32x2_t *v1250 = &v5[v212]; + const float32x2_t *v1259 = &v5[v223]; + const float32x2_t *v1268 = &v5[v230]; + const float32x2_t *v1277 = &v5[v239]; + const float32x2_t *v1286 = &v5[v246]; + const float32x2_t *v1295 = &v5[v257]; + const float32x2_t *v1304 = &v5[v264]; + const float32x2_t *v1322 = &v5[v280]; + const float32x2_t *v1331 = &v5[v291]; + const float32x2_t *v1340 = &v5[v298]; + const float32x2_t *v1349 = &v5[v307]; + const float32x2_t *v1358 = &v5[v314]; + svfloat32_t v1387 = svdup_n_f32(v590); + svfloat32_t v1391 = svdup_n_f32(v612); + svfloat32_t v1392 = svdup_n_f32(v619); + svfloat32_t v1393 = svdup_n_f32(v626); + svfloat32_t v1394 = svdup_n_f32(v678); + svfloat32_t v1395 = svdup_n_f32(v685); + svfloat32_t v1397 = svdup_n_f32(v697); + svfloat32_t v1399 = svdup_n_f32(v709); + svfloat32_t v1400 = svdup_n_f32(v716); + svfloat32_t v1401 = svdup_n_f32(v723); + float32x2_t *v1421 = &v6[v776]; + float32x2_t 
*v1430 = &v6[v783]; + float32x2_t *v1439 = &v6[v790]; + float32x2_t *v1448 = &v6[v799]; + float32x2_t *v1466 = &v6[v813]; + float32x2_t *v1475 = &v6[v820]; + float32x2_t *v1484 = &v6[v829]; + float32x2_t *v1493 = &v6[v836]; + float32x2_t *v1502 = &v6[v843]; + float32x2_t *v1511 = &v6[v850]; + float32x2_t *v1520 = &v6[v859]; + float32x2_t *v1529 = &v6[v866]; + float32x2_t *v1538 = &v6[v873]; + float32x2_t *v1547 = &v6[v880]; + float32x2_t *v1556 = &v6[v889]; + float32x2_t *v1565 = &v6[v896]; + float32x2_t *v1574 = &v6[v903]; + float32x2_t *v1583 = &v6[v910]; + float32x2_t *v1592 = &v6[v919]; + float32x2_t *v1601 = &v6[v926]; + float32x2_t *v1610 = &v6[v933]; + float32x2_t *v1619 = &v6[v940]; + float32x2_t *v1628 = &v6[v949]; + float32x2_t *v1637 = &v6[v956]; + float32x2_t *v1646 = &v6[v963]; + float32x2_t *v1655 = &v6[v970]; + float32x2_t *v1664 = &v6[v979]; + float32x2_t *v1673 = &v6[v986]; + float32x2_t *v1682 = &v6[v993]; + float32x2_t *v1691 = &v6[v1000]; + float32x2_t *v1700 = &v6[v1009]; + float32x2_t *v1709 = &v6[v1016]; + float32x2_t *v1718 = &v6[v1023]; + float32x2_t *v1727 = &v6[v1030]; + svfloat32_t v1731 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1043)[0])); + svfloat32_t v1733 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1052)[0])); + svfloat32_t v1735 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1061)[0])); + svfloat32_t v1737 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1070)[0])); + svfloat32_t v1739 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1079)[0])); + svfloat32_t v1741 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1088)[0])); + svfloat32_t v1743 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1097)[0])); + svfloat32_t v1745 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1106)[0])); + svfloat32_t v1747 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double 
*)v1115)[0])); + svfloat32_t v1749 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1124)[0])); + svfloat32_t v1751 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1133)[0])); + svfloat32_t v1753 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1142)[0])); + svfloat32_t v1755 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1151)[0])); + svfloat32_t v1757 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1160)[0])); + svfloat32_t v1759 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1169)[0])); + svfloat32_t v1761 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1178)[0])); + svfloat32_t v1763 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1187)[0])); + svfloat32_t v1765 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1196)[0])); + svfloat32_t v1767 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1205)[0])); + svfloat32_t v1769 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1214)[0])); + svfloat32_t v1771 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1223)[0])); + svfloat32_t v1773 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1232)[0])); + svfloat32_t v1775 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1241)[0])); + svfloat32_t v1777 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1250)[0])); + svfloat32_t v1779 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1259)[0])); + svfloat32_t v1781 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1268)[0])); + svfloat32_t v1783 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1277)[0])); + svfloat32_t v1785 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1286)[0])); + svfloat32_t v1787 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1295)[0])); + 
svfloat32_t v1789 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1304)[0])); + svfloat32_t v1793 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1322)[0])); + svfloat32_t v1795 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1331)[0])); + svfloat32_t v1797 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1340)[0])); + svfloat32_t v1799 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1349)[0])); + svfloat32_t v1801 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1358)[0])); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1731, v1733); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1731, v1733); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1735, v1737); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1735, v1737); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v1739, v1741); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v1739, v1741); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v1743, v1745); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v1743, v1745); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v1747, v1749); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v1747, v1749); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v1751, v1753); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v1751, v1753); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v1755, v1757); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v1755, v1757); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v1759, v1761); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v1759, v1761); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v1763, v1765); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v1763, v1765); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v1767, v1769); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v1767, v1769); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v1771, v1773); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v1771, v1773); + svfloat32_t v218 = 
svadd_f32_x(svptrue_b32(), v1775, v1777); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v1775, v1777); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v1779, v1781); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v1779, v1781); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v1783, v1785); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v1783, v1785); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v1787, v1789); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v1787, v1789); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v1791, v1793); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v1791, v1793); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v1795, v1797); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v1795, v1797); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v1799, v1801); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v1799, v1801); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v202, v218); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v202, v218); + svfloat32_t v254 = svadd_f32_x(svptrue_b32(), v236, v252); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v236, v252); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v270, v286); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v270, v286); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v304, v320); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v304, v320); + svfloat32_t v544 = svadd_f32_x(svptrue_b32(), v67, 
v305); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v67, v305); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v271, v101); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v271, v101); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v135, v237); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v135, v237); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v169, v203); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v169, v203); + svfloat32_t v654 = svadd_f32_x(svptrue_b32(), v83, v321); + svfloat32_t v655 = svsub_f32_x(svptrue_b32(), v83, v321); + svfloat32_t v656 = svadd_f32_x(svptrue_b32(), v287, v117); + svfloat32_t v657 = svsub_f32_x(svptrue_b32(), v287, v117); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v151, v253); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v151, v253); + svfloat32_t v660 = svadd_f32_x(svptrue_b32(), v185, v219); + svfloat32_t v661 = svsub_f32_x(svptrue_b32(), v185, v219); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v84, v322); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v84, v322); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v288, v118); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v288, v118); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v152, v254); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v152, v254); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v186, v220); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v186, v220); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v85, v323); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v85, v323); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v289, v119); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v289, v119); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v153, v255); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v153, v255); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v187, v221); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v187, v221); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v544, v546); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), 
v545, v547); + svfloat32_t v558 = svsub_f32_x(svptrue_b32(), v544, v546); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v546, v550); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v550, v544); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v545, v547); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v547, v551); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v551, v545); + svfloat32_t zero592 = svdup_n_f32(0); + svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1387, v549, 90); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v654, v656); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v655, v657); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v654, v656); + svfloat32_t v669 = svsub_f32_x(svptrue_b32(), v656, v660); + svfloat32_t v670 = svsub_f32_x(svptrue_b32(), v660, v654); + svfloat32_t v671 = svsub_f32_x(svptrue_b32(), v655, v657); + svfloat32_t v672 = svsub_f32_x(svptrue_b32(), v657, v661); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v661, v655); + svfloat32_t v332 = svadd_f32_x(svptrue_b32(), v324, v326); + svfloat32_t v336 = svadd_f32_x(svptrue_b32(), v325, v327); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v324, v326); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v326, v330); + svfloat32_t v340 = svsub_f32_x(svptrue_b32(), v330, v324); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v325, v327); + svfloat32_t v342 = svsub_f32_x(svptrue_b32(), v327, v331); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v331, v325); + svfloat32_t zero372 = svdup_n_f32(0); + svfloat32_t v372 = svcmla_f32_x(pred_full, zero372, v1387, v329, 90); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v435, v437); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v436, v440); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v440, v434); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v435, v437); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v437, 
v441); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v441, v435); + svfloat32_t zero482 = svdup_n_f32(0); + svfloat32_t v482 = svcmla_f32_x(pred_full, zero482, v1387, v439, 90); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v552, v550); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v556, v551); + svfloat32_t zero614 = svdup_n_f32(0); + svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1391, v561, 90); + svfloat32_t zero621 = svdup_n_f32(0); + svfloat32_t v621 = svcmla_f32_x(pred_full, zero621, v1392, v562, 90); + svfloat32_t zero628 = svdup_n_f32(0); + svfloat32_t v628 = svcmla_f32_x(pred_full, zero628, v1393, v563, 90); + svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v662, v660); + svfloat32_t v667 = svadd_f32_x(svptrue_b32(), v666, v661); + svfloat32_t zero711 = svdup_n_f32(0); + svfloat32_t v711 = svcmla_f32_x(pred_full, zero711, v1399, v668, 90); + svfloat32_t zero718 = svdup_n_f32(0); + svfloat32_t v718 = svcmla_f32_x(pred_full, zero718, v1400, v669, 90); + svfloat32_t zero725 = svdup_n_f32(0); + svfloat32_t v725 = svcmla_f32_x(pred_full, zero725, v1401, v670, 90); + svfloat32_t v730 = svmul_f32_x(svptrue_b32(), v671, v1402); + svfloat32_t v735 = svmul_f32_x(svptrue_b32(), v672, v1403); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v332, v330); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v336, v331); + svfloat32_t zero394 = svdup_n_f32(0); + svfloat32_t v394 = svcmla_f32_x(pred_full, zero394, v1391, v341, 90); + svfloat32_t zero401 = svdup_n_f32(0); + svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v1392, v342, 90); + svfloat32_t zero408 = svdup_n_f32(0); + svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v1393, v343, 90); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v442, v440); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v446, v441); + svfloat32_t zero504 = svdup_n_f32(0); + svfloat32_t v504 = svcmla_f32_x(pred_full, zero504, v1391, v451, 90); + svfloat32_t zero511 = svdup_n_f32(0); + svfloat32_t v511 = svcmla_f32_x(pred_full, 
zero511, v1392, v452, 90); + svfloat32_t zero518 = svdup_n_f32(0); + svfloat32_t v518 = svcmla_f32_x(pred_full, zero518, v1393, v453, 90); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v553, v548); + svfloat32_t v573 = svmul_f32_x(svptrue_b32(), v553, v1384); + svfloat32_t zero580 = svdup_n_f32(0); + svfloat32_t v580 = svcmla_f32_x(pred_full, zero580, v1387, v557, 90); + svfloat32_t v642 = svadd_f32_x(svptrue_b32(), v592, v614); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v592, v621); + svfloat32_t v646 = svsub_f32_x(svptrue_b32(), v592, v614); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v663, v658); + svfloat32_t zero687 = svdup_n_f32(0); + svfloat32_t v687 = svcmla_f32_x(pred_full, zero687, v1395, v663, 90); + svfloat32_t v754 = svmla_f32_x(pred_full, v730, v659, v1398); + svfloat32_t v756 = svnmls_f32_x(pred_full, v735, v659, v1398); + svfloat32_t v758 = svnmls_f32_x(pred_full, v730, v659, v1398); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v333, v328); + svfloat32_t v353 = svmul_f32_x(svptrue_b32(), v333, v1384); + svfloat32_t zero360 = svdup_n_f32(0); + svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v1387, v337, 90); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v372, v394); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v372, v401); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v372, v394); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v443, v438); + svfloat32_t v463 = svmul_f32_x(svptrue_b32(), v443, v1384); + svfloat32_t zero470 = svdup_n_f32(0); + svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v1387, v447, 90); + svfloat32_t v532 = svadd_f32_x(svptrue_b32(), v482, v504); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v482, v511); + svfloat32_t v536 = svsub_f32_x(svptrue_b32(), v482, v504); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v554, v33); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v573, v573); + svfloat32_t v643 = svadd_f32_x(svptrue_b32(), v642, v621); + svfloat32_t v645 = svadd_f32_x(svptrue_b32(), v644, 
v628); + svfloat32_t v647 = svsub_f32_x(svptrue_b32(), v646, v628); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v664, v49); + svfloat32_t v741 = svadd_f32_x(svptrue_b32(), v687, v687); + svfloat32_t v755 = svmla_f32_x(pred_full, v754, v672, v1403); + svfloat32_t v757 = svmla_f32_x(pred_full, v756, v673, v1404); + svfloat32_t v759 = svmls_f32_x(pred_full, v758, v673, v1404); + svfloat32_t v335 = svadd_f32_x(svptrue_b32(), v334, v50); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v353, v353); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v401); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v424, v408); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v426, v408); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v444, v51); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v463, v463); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v532, v511); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v534, v518); + svfloat32_t v537 = svsub_f32_x(svptrue_b32(), v536, v518); + svfloat32_t v630 = svmla_f32_x(pred_full, v629, v553, v1384); + svfloat32_t v634 = svmla_f32_x(pred_full, v555, v548, v1386); + svfloat32_t zero680 = svdup_n_f32(0); + svfloat32_t v680 = svcmla_f32_x(pred_full, zero680, v1394, v665, 90); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v741, v687); + svfloat32_t v410 = svmla_f32_x(pred_full, v409, v333, v1384); + svfloat32_t v414 = svmla_f32_x(pred_full, v335, v328, v1386); + svfloat32_t v520 = svmla_f32_x(pred_full, v519, v443, v1384); + svfloat32_t v524 = svmla_f32_x(pred_full, v445, v438, v1386); + svfloat32_t v631 = svadd_f32_x(svptrue_b32(), v555, v630); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v634, v629); + svfloat32_t v743 = svadd_f32_x(svptrue_b32(), v680, v742); + svfloat32_t v746 = svcmla_f32_x(pred_full, v680, v1397, v658, 90); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v555, v680); + svfloat32_t v767 = svsub_f32_x(svptrue_b32(), v555, v680); + svst1_f64(pred_full, (double *)(v1412), svreinterpret_f64_f32(v335)); + 
svst1_f64(pred_full, (double *)(v1430), svreinterpret_f64_f32(v445)); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v335, v410); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v414, v409); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v445, v520); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v524, v519); + svfloat32_t v632 = svadd_f32_x(svptrue_b32(), v631, v580); + svfloat32_t v633 = svsub_f32_x(svptrue_b32(), v631, v580); + svfloat32_t v636 = svmla_f32_x(pred_full, v635, v558, v1388); + svfloat32_t v638 = svmls_f32_x(pred_full, v635, v559, v1389); + svfloat32_t v640 = svmls_f32_x(pred_full, v635, v558, v1388); + svfloat32_t v744 = svmla_f32_x(pred_full, v743, v667, v1398); + svfloat32_t v745 = svmls_f32_x(pred_full, v743, v667, v1398); + svfloat32_t v747 = svadd_f32_x(svptrue_b32(), v746, v741); + svst1_f64(pred_full, (double *)(v1421), svreinterpret_f64_f32(v767)); + svst1_f64(pred_full, (double *)(v1439), svreinterpret_f64_f32(v766)); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v411, v360); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v411, v360); + svfloat32_t v416 = svmla_f32_x(pred_full, v415, v338, v1388); + svfloat32_t v418 = svmls_f32_x(pred_full, v415, v339, v1389); + svfloat32_t v420 = svmls_f32_x(pred_full, v415, v338, v1388); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v521, v470); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v521, v470); + svfloat32_t v526 = svmla_f32_x(pred_full, v525, v448, v1388); + svfloat32_t v528 = svmls_f32_x(pred_full, v525, v449, v1389); + svfloat32_t v530 = svmls_f32_x(pred_full, v525, v448, v1388); + svfloat32_t v637 = svmla_f32_x(pred_full, v636, v559, v1389); + svfloat32_t v639 = svmla_f32_x(pred_full, v638, v560, v1390); + svfloat32_t v641 = svmls_f32_x(pred_full, v640, v560, v1390); + svfloat32_t v748 = svadd_f32_x(svptrue_b32(), v747, v711); + svfloat32_t v750 = svsub_f32_x(svptrue_b32(), v747, v718); + svfloat32_t v752 = svsub_f32_x(svptrue_b32(), v747, v711); + svfloat32_t v856 = 
svadd_f32_x(svptrue_b32(), v633, v745); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v633, v745); + svfloat32_t v946 = svadd_f32_x(svptrue_b32(), v632, v744); + svfloat32_t v947 = svsub_f32_x(svptrue_b32(), v632, v744); + svfloat32_t v417 = svmla_f32_x(pred_full, v416, v339, v1389); + svfloat32_t v419 = svmla_f32_x(pred_full, v418, v340, v1390); + svfloat32_t v421 = svmls_f32_x(pred_full, v420, v340, v1390); + svfloat32_t v527 = svmla_f32_x(pred_full, v526, v449, v1389); + svfloat32_t v529 = svmla_f32_x(pred_full, v528, v450, v1390); + svfloat32_t v531 = svmls_f32_x(pred_full, v530, v450, v1390); + svfloat32_t v648 = svadd_f32_x(svptrue_b32(), v637, v643); + svfloat32_t v649 = svsub_f32_x(svptrue_b32(), v637, v643); + svfloat32_t v650 = svadd_f32_x(svptrue_b32(), v639, v645); + svfloat32_t v651 = svsub_f32_x(svptrue_b32(), v639, v645); + svfloat32_t v652 = svadd_f32_x(svptrue_b32(), v641, v647); + svfloat32_t v653 = svsub_f32_x(svptrue_b32(), v641, v647); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v748, v718); + svfloat32_t v751 = svadd_f32_x(svptrue_b32(), v750, v725); + svfloat32_t v753 = svsub_f32_x(svptrue_b32(), v752, v725); + svst1_f64(pred_full, (double *)(v1520), svreinterpret_f64_f32(v413)); + svst1_f64(pred_full, (double *)(v1529), svreinterpret_f64_f32(v857)); + svst1_f64(pred_full, (double *)(v1538), svreinterpret_f64_f32(v523)); + svst1_f64(pred_full, (double *)(v1547), svreinterpret_f64_f32(v856)); + svst1_f64(pred_full, (double *)(v1628), svreinterpret_f64_f32(v412)); + svst1_f64(pred_full, (double *)(v1637), svreinterpret_f64_f32(v947)); + svst1_f64(pred_full, (double *)(v1646), svreinterpret_f64_f32(v522)); + svst1_f64(pred_full, (double *)(v1655), svreinterpret_f64_f32(v946)); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v417, v423); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v417, v423); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v419, v425); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v419, v425); + svfloat32_t v432 = 
svadd_f32_x(svptrue_b32(), v421, v427); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v421, v427); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v527, v533); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v527, v533); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v529, v535); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v529, v535); + svfloat32_t v542 = svadd_f32_x(svptrue_b32(), v531, v537); + svfloat32_t v543 = svsub_f32_x(svptrue_b32(), v531, v537); + svfloat32_t v760 = svadd_f32_x(svptrue_b32(), v749, v755); + svfloat32_t v761 = svsub_f32_x(svptrue_b32(), v749, v755); + svfloat32_t v762 = svadd_f32_x(svptrue_b32(), v751, v757); + svfloat32_t v763 = svsub_f32_x(svptrue_b32(), v751, v757); + svfloat32_t v764 = svadd_f32_x(svptrue_b32(), v753, v759); + svfloat32_t v765 = svsub_f32_x(svptrue_b32(), v753, v759); + svfloat32_t v796 = svadd_f32_x(svptrue_b32(), v649, v761); + svfloat32_t v797 = svsub_f32_x(svptrue_b32(), v649, v761); + svfloat32_t v826 = svadd_f32_x(svptrue_b32(), v650, v762); + svfloat32_t v827 = svsub_f32_x(svptrue_b32(), v650, v762); + svfloat32_t v886 = svadd_f32_x(svptrue_b32(), v653, v765); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v653, v765); + svfloat32_t v916 = svadd_f32_x(svptrue_b32(), v652, v764); + svfloat32_t v917 = svsub_f32_x(svptrue_b32(), v652, v764); + svfloat32_t v976 = svadd_f32_x(svptrue_b32(), v651, v763); + svfloat32_t v977 = svsub_f32_x(svptrue_b32(), v651, v763); + svfloat32_t v1006 = svadd_f32_x(svptrue_b32(), v648, v760); + svfloat32_t v1007 = svsub_f32_x(svptrue_b32(), v648, v760); + svst1_f64(pred_full, (double *)(v1448), svreinterpret_f64_f32(v429)); + svst1_f64(pred_full, (double *)(v1466), svreinterpret_f64_f32(v539)); + svst1_f64(pred_full, (double *)(v1484), svreinterpret_f64_f32(v430)); + svst1_f64(pred_full, (double *)(v1502), svreinterpret_f64_f32(v540)); + svst1_f64(pred_full, (double *)(v1556), svreinterpret_f64_f32(v433)); + svst1_f64(pred_full, (double *)(v1574), svreinterpret_f64_f32(v543)); 
+ svst1_f64(pred_full, (double *)(v1592), svreinterpret_f64_f32(v432)); + svst1_f64(pred_full, (double *)(v1610), svreinterpret_f64_f32(v542)); + svst1_f64(pred_full, (double *)(v1664), svreinterpret_f64_f32(v431)); + svst1_f64(pred_full, (double *)(v1682), svreinterpret_f64_f32(v541)); + svst1_f64(pred_full, (double *)(v1700), svreinterpret_f64_f32(v428)); + svst1_f64(pred_full, (double *)(v1718), svreinterpret_f64_f32(v538)); + svst1_f64(pred_full, (double *)(v1457), svreinterpret_f64_f32(v797)); + svst1_f64(pred_full, (double *)(v1475), svreinterpret_f64_f32(v796)); + svst1_f64(pred_full, (double *)(v1493), svreinterpret_f64_f32(v827)); + svst1_f64(pred_full, (double *)(v1511), svreinterpret_f64_f32(v826)); + svst1_f64(pred_full, (double *)(v1565), svreinterpret_f64_f32(v887)); + svst1_f64(pred_full, (double *)(v1583), svreinterpret_f64_f32(v886)); + svst1_f64(pred_full, (double *)(v1601), svreinterpret_f64_f32(v917)); + svst1_f64(pred_full, (double *)(v1619), svreinterpret_f64_f32(v916)); + svst1_f64(pred_full, (double *)(v1673), svreinterpret_f64_f32(v977)); + svst1_f64(pred_full, (double *)(v1691), svreinterpret_f64_f32(v976)); + svst1_f64(pred_full, (double *)(v1709), svreinterpret_f64_f32(v1007)); + svst1_f64(pred_full, (double *)(v1727), svreinterpret_f64_f32(v1006)); + v5 += v11; + v6 += v12; + } +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu40(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v12 = howmany - 1; + int64_t v1185 = howmany / 2; + for (int j = 0; j < v12; j += 2) { + float v409 = 1.0000000000000000e+00F; + float v410 = -1.0000000000000000e+00F; + float v418 = -7.0710678118654746e-01F; + float v426 = 7.0710678118654757e-01F; + float v484 = -1.2500000000000000e+00F; + float v485 = 1.2500000000000000e+00F; + float v493 = 
8.8388347648318433e-01F; + float v501 = -8.8388347648318444e-01F; + float v559 = 5.5901699437494745e-01F; + float v560 = -5.5901699437494745e-01F; + float v568 = -3.9528470752104738e-01F; + float v576 = 3.9528470752104744e-01F; + float v635 = 1.5388417685876268e+00F; + float v644 = -1.5388417685876268e+00F; + float v653 = 1.0881254497414108e+00F; + float v654 = -1.0881254497414108e+00F; + float v716 = 5.8778525229247325e-01F; + float v725 = -5.8778525229247325e-01F; + float v734 = 4.1562693777745352e-01F; + float v735 = -4.1562693777745352e-01F; + float v797 = 3.6327126400268028e-01F; + float v806 = -3.6327126400268028e-01F; + float v815 = 2.5687157418650380e-01F; + float v816 = -2.5687157418650380e-01F; + float32x2_t v818 = (float32x2_t){v4, v4}; + const float32x2_t *v2412 = &v5[istride]; + float32x2_t *v2575 = &v6[ostride]; + float32x2_t v411 = (float32x2_t){v409, v410}; + float32x2_t v419 = (float32x2_t){v426, v418}; + float32x2_t v427 = (float32x2_t){v426, v426}; + float32x2_t v481 = (float32x2_t){v484, v484}; + float32x2_t v486 = (float32x2_t){v484, v485}; + float32x2_t v494 = (float32x2_t){v501, v493}; + float32x2_t v502 = (float32x2_t){v501, v501}; + float32x2_t v556 = (float32x2_t){v559, v559}; + float32x2_t v561 = (float32x2_t){v559, v560}; + float32x2_t v569 = (float32x2_t){v576, v568}; + float32x2_t v577 = (float32x2_t){v576, v576}; + float32x2_t v637 = (float32x2_t){v635, v644}; + float32x2_t v645 = (float32x2_t){v644, v644}; + float32x2_t v650 = (float32x2_t){v654, v654}; + float32x2_t v655 = (float32x2_t){v653, v654}; + float32x2_t v718 = (float32x2_t){v716, v725}; + float32x2_t v726 = (float32x2_t){v725, v725}; + float32x2_t v731 = (float32x2_t){v735, v735}; + float32x2_t v736 = (float32x2_t){v734, v735}; + float32x2_t v799 = (float32x2_t){v797, v806}; + float32x2_t v807 = (float32x2_t){v806, v806}; + float32x2_t v812 = (float32x2_t){v816, v816}; + float32x2_t v817 = (float32x2_t){v815, v816}; + const float32x2_t *v2196 = &v5[0]; + float32x2_t *v2521 
= &v6[0]; + float32x4_t v2932 = vld1q_f32((const float32_t *)v2412); + float32x2_t v413 = vmul_f32(v818, v411); + float32x2_t v421 = vmul_f32(v818, v419); + float32x4_t v428 = vcombine_f32(v427, v427); + float32x4_t v482 = vcombine_f32(v481, v481); + float32x2_t v488 = vmul_f32(v818, v486); + float32x2_t v496 = vmul_f32(v818, v494); + float32x4_t v503 = vcombine_f32(v502, v502); + float32x4_t v557 = vcombine_f32(v556, v556); + float32x2_t v563 = vmul_f32(v818, v561); + float32x2_t v571 = vmul_f32(v818, v569); + float32x4_t v578 = vcombine_f32(v577, v577); + float32x2_t v639 = vmul_f32(v818, v637); + float32x4_t v646 = vcombine_f32(v645, v645); + float32x4_t v651 = vcombine_f32(v650, v650); + float32x2_t v657 = vmul_f32(v818, v655); + float32x2_t v720 = vmul_f32(v818, v718); + float32x4_t v727 = vcombine_f32(v726, v726); + float32x4_t v732 = vcombine_f32(v731, v731); + float32x2_t v738 = vmul_f32(v818, v736); + float32x2_t v801 = vmul_f32(v818, v799); + float32x4_t v808 = vcombine_f32(v807, v807); + float32x4_t v813 = vcombine_f32(v812, v812); + float32x2_t v819 = vmul_f32(v818, v817); + const float32x2_t *v2159 = &v5[istride * 8]; + const float32x2_t *v2168 = &v5[istride * 32]; + const float32x2_t *v2177 = &v5[istride * 24]; + const float32x2_t *v2186 = &v5[istride * 16]; + const float32x2_t *v2205 = &v5[istride * 13]; + const float32x2_t *v2214 = &v5[istride * 37]; + const float32x2_t *v2223 = &v5[istride * 29]; + const float32x2_t *v2232 = &v5[istride * 21]; + const float32x2_t *v2241 = &v5[istride * 5]; + const float32x2_t *v2250 = &v5[istride * 18]; + const float32x2_t *v2259 = &v5[istride * 2]; + const float32x2_t *v2268 = &v5[istride * 34]; + const float32x2_t *v2277 = &v5[istride * 26]; + const float32x2_t *v2286 = &v5[istride * 10]; + const float32x2_t *v2295 = &v5[istride * 23]; + const float32x2_t *v2304 = &v5[istride * 7]; + const float32x2_t *v2313 = &v5[istride * 39]; + const float32x2_t *v2322 = &v5[istride * 31]; + const float32x2_t *v2331 = 
&v5[istride * 15]; + const float32x2_t *v2340 = &v5[istride * 28]; + const float32x2_t *v2349 = &v5[istride * 12]; + const float32x2_t *v2358 = &v5[istride * 4]; + const float32x2_t *v2367 = &v5[istride * 36]; + const float32x2_t *v2376 = &v5[istride * 20]; + const float32x2_t *v2385 = &v5[istride * 33]; + const float32x2_t *v2394 = &v5[istride * 17]; + const float32x2_t *v2403 = &v5[istride * 9]; + const float32x2_t *v2421 = &v5[istride * 25]; + const float32x2_t *v2430 = &v5[istride * 38]; + const float32x2_t *v2439 = &v5[istride * 22]; + const float32x2_t *v2448 = &v5[istride * 14]; + const float32x2_t *v2457 = &v5[istride * 6]; + const float32x2_t *v2466 = &v5[istride * 30]; + const float32x2_t *v2475 = &v5[istride * 3]; + const float32x2_t *v2484 = &v5[istride * 27]; + const float32x2_t *v2493 = &v5[istride * 19]; + const float32x2_t *v2502 = &v5[istride * 11]; + const float32x2_t *v2511 = &v5[istride * 35]; + float32x2_t *v2530 = &v6[ostride * 16]; + float32x2_t *v2539 = &v6[ostride * 32]; + float32x2_t *v2548 = &v6[ostride * 8]; + float32x2_t *v2557 = &v6[ostride * 24]; + float32x2_t *v2566 = &v6[ostride * 25]; + float32x2_t *v2584 = &v6[ostride * 17]; + float32x2_t *v2593 = &v6[ostride * 33]; + float32x2_t *v2602 = &v6[ostride * 9]; + float32x2_t *v2611 = &v6[ostride * 10]; + float32x2_t *v2620 = &v6[ostride * 26]; + float32x2_t *v2629 = &v6[ostride * 2]; + float32x2_t *v2638 = &v6[ostride * 18]; + float32x2_t *v2647 = &v6[ostride * 34]; + float32x2_t *v2656 = &v6[ostride * 35]; + float32x2_t *v2665 = &v6[ostride * 11]; + float32x2_t *v2674 = &v6[ostride * 27]; + float32x2_t *v2683 = &v6[ostride * 3]; + float32x2_t *v2692 = &v6[ostride * 19]; + float32x2_t *v2701 = &v6[ostride * 20]; + float32x2_t *v2710 = &v6[ostride * 36]; + float32x2_t *v2719 = &v6[ostride * 12]; + float32x2_t *v2728 = &v6[ostride * 28]; + float32x2_t *v2737 = &v6[ostride * 4]; + float32x2_t *v2746 = &v6[ostride * 5]; + float32x2_t *v2755 = &v6[ostride * 21]; + float32x2_t *v2764 = 
&v6[ostride * 37]; + float32x2_t *v2773 = &v6[ostride * 13]; + float32x2_t *v2782 = &v6[ostride * 29]; + float32x2_t *v2791 = &v6[ostride * 30]; + float32x2_t *v2800 = &v6[ostride * 6]; + float32x2_t *v2809 = &v6[ostride * 22]; + float32x2_t *v2818 = &v6[ostride * 38]; + float32x2_t *v2827 = &v6[ostride * 14]; + float32x2_t *v2836 = &v6[ostride * 15]; + float32x2_t *v2845 = &v6[ostride * 31]; + float32x2_t *v2854 = &v6[ostride * 7]; + float32x2_t *v2863 = &v6[ostride * 23]; + float32x2_t *v2872 = &v6[ostride * 39]; + float32x4_t v2884 = vld1q_f32((const float32_t *)v2196); + float32x4_t v415 = vcombine_f32(v413, v413); + float32x4_t v423 = vcombine_f32(v421, v421); + float32x4_t v490 = vcombine_f32(v488, v488); + float32x4_t v498 = vcombine_f32(v496, v496); + float32x4_t v565 = vcombine_f32(v563, v563); + float32x4_t v573 = vcombine_f32(v571, v571); + float32x4_t v641 = vcombine_f32(v639, v639); + float32x4_t v659 = vcombine_f32(v657, v657); + float32x4_t v722 = vcombine_f32(v720, v720); + float32x4_t v740 = vcombine_f32(v738, v738); + float32x4_t v803 = vcombine_f32(v801, v801); + float32x4_t v821 = vcombine_f32(v819, v819); + float32x4_t v2876 = vld1q_f32((const float32_t *)v2159); + float32x4_t v2878 = vld1q_f32((const float32_t *)v2168); + float32x4_t v2880 = vld1q_f32((const float32_t *)v2177); + float32x4_t v2882 = vld1q_f32((const float32_t *)v2186); + float32x4_t v2886 = vld1q_f32((const float32_t *)v2205); + float32x4_t v2888 = vld1q_f32((const float32_t *)v2214); + float32x4_t v2890 = vld1q_f32((const float32_t *)v2223); + float32x4_t v2892 = vld1q_f32((const float32_t *)v2232); + float32x4_t v2894 = vld1q_f32((const float32_t *)v2241); + float32x4_t v2896 = vld1q_f32((const float32_t *)v2250); + float32x4_t v2898 = vld1q_f32((const float32_t *)v2259); + float32x4_t v2900 = vld1q_f32((const float32_t *)v2268); + float32x4_t v2902 = vld1q_f32((const float32_t *)v2277); + float32x4_t v2904 = vld1q_f32((const float32_t *)v2286); + float32x4_t v2906 = 
vld1q_f32((const float32_t *)v2295); + float32x4_t v2908 = vld1q_f32((const float32_t *)v2304); + float32x4_t v2910 = vld1q_f32((const float32_t *)v2313); + float32x4_t v2912 = vld1q_f32((const float32_t *)v2322); + float32x4_t v2914 = vld1q_f32((const float32_t *)v2331); + float32x4_t v2916 = vld1q_f32((const float32_t *)v2340); + float32x4_t v2918 = vld1q_f32((const float32_t *)v2349); + float32x4_t v2920 = vld1q_f32((const float32_t *)v2358); + float32x4_t v2922 = vld1q_f32((const float32_t *)v2367); + float32x4_t v2924 = vld1q_f32((const float32_t *)v2376); + float32x4_t v2926 = vld1q_f32((const float32_t *)v2385); + float32x4_t v2928 = vld1q_f32((const float32_t *)v2394); + float32x4_t v2930 = vld1q_f32((const float32_t *)v2403); + float32x4_t v2934 = vld1q_f32((const float32_t *)v2421); + float32x4_t v2936 = vld1q_f32((const float32_t *)v2430); + float32x4_t v2938 = vld1q_f32((const float32_t *)v2439); + float32x4_t v2940 = vld1q_f32((const float32_t *)v2448); + float32x4_t v2942 = vld1q_f32((const float32_t *)v2457); + float32x4_t v2944 = vld1q_f32((const float32_t *)v2466); + float32x4_t v2946 = vld1q_f32((const float32_t *)v2475); + float32x4_t v2948 = vld1q_f32((const float32_t *)v2484); + float32x4_t v2950 = vld1q_f32((const float32_t *)v2493); + float32x4_t v2952 = vld1q_f32((const float32_t *)v2502); + float32x4_t v2954 = vld1q_f32((const float32_t *)v2511); + float32x4_t v35 = vaddq_f32(v2876, v2878); + float32x4_t v36 = vsubq_f32(v2876, v2878); + float32x4_t v51 = vaddq_f32(v2880, v2882); + float32x4_t v52 = vsubq_f32(v2880, v2882); + float32x4_t v78 = vaddq_f32(v2886, v2888); + float32x4_t v79 = vsubq_f32(v2886, v2888); + float32x4_t v94 = vaddq_f32(v2890, v2892); + float32x4_t v95 = vsubq_f32(v2890, v2892); + float32x4_t v121 = vaddq_f32(v2896, v2898); + float32x4_t v122 = vsubq_f32(v2896, v2898); + float32x4_t v137 = vaddq_f32(v2900, v2902); + float32x4_t v138 = vsubq_f32(v2900, v2902); + float32x4_t v164 = vaddq_f32(v2906, v2908); + float32x4_t 
v165 = vsubq_f32(v2906, v2908); + float32x4_t v180 = vaddq_f32(v2910, v2912); + float32x4_t v181 = vsubq_f32(v2910, v2912); + float32x4_t v207 = vaddq_f32(v2916, v2918); + float32x4_t v208 = vsubq_f32(v2916, v2918); + float32x4_t v223 = vaddq_f32(v2920, v2922); + float32x4_t v224 = vsubq_f32(v2920, v2922); + float32x4_t v250 = vaddq_f32(v2926, v2928); + float32x4_t v251 = vsubq_f32(v2926, v2928); + float32x4_t v266 = vaddq_f32(v2930, v2932); + float32x4_t v267 = vsubq_f32(v2930, v2932); + float32x4_t v293 = vaddq_f32(v2936, v2938); + float32x4_t v294 = vsubq_f32(v2936, v2938); + float32x4_t v309 = vaddq_f32(v2940, v2942); + float32x4_t v310 = vsubq_f32(v2940, v2942); + float32x4_t v336 = vaddq_f32(v2946, v2948); + float32x4_t v337 = vsubq_f32(v2946, v2948); + float32x4_t v352 = vaddq_f32(v2950, v2952); + float32x4_t v353 = vsubq_f32(v2950, v2952); + float32x4_t v53 = vaddq_f32(v35, v51); + float32x4_t v54 = vsubq_f32(v35, v51); + float32x4_t v55 = vaddq_f32(v36, v52); + float32x4_t v96 = vaddq_f32(v78, v94); + float32x4_t v97 = vsubq_f32(v78, v94); + float32x4_t v98 = vaddq_f32(v79, v95); + float32x4_t v139 = vaddq_f32(v121, v137); + float32x4_t v140 = vsubq_f32(v121, v137); + float32x4_t v141 = vaddq_f32(v122, v138); + float32x4_t v182 = vaddq_f32(v164, v180); + float32x4_t v183 = vsubq_f32(v164, v180); + float32x4_t v184 = vaddq_f32(v165, v181); + float32x4_t v225 = vaddq_f32(v207, v223); + float32x4_t v226 = vsubq_f32(v207, v223); + float32x4_t v227 = vaddq_f32(v208, v224); + float32x4_t v268 = vaddq_f32(v250, v266); + float32x4_t v269 = vsubq_f32(v250, v266); + float32x4_t v270 = vaddq_f32(v251, v267); + float32x4_t v311 = vaddq_f32(v293, v309); + float32x4_t v312 = vsubq_f32(v293, v309); + float32x4_t v313 = vaddq_f32(v294, v310); + float32x4_t v354 = vaddq_f32(v336, v352); + float32x4_t v355 = vsubq_f32(v336, v352); + float32x4_t v356 = vaddq_f32(v337, v353); + float32x4_t v590 = vaddq_f32(v36, v208); + float32x4_t v591 = vsubq_f32(v36, v208); + float32x4_t 
v592 = vaddq_f32(v122, v294); + float32x4_t v593 = vsubq_f32(v122, v294); + float32x4_t v594 = vaddq_f32(v79, v251); + float32x4_t v595 = vsubq_f32(v79, v251); + float32x4_t v596 = vaddq_f32(v165, v337); + float32x4_t v597 = vsubq_f32(v165, v337); + float32x4_t v752 = vaddq_f32(v52, v224); + float32x4_t v753 = vsubq_f32(v52, v224); + float32x4_t v754 = vaddq_f32(v138, v310); + float32x4_t v755 = vsubq_f32(v138, v310); + float32x4_t v756 = vaddq_f32(v95, v267); + float32x4_t v757 = vsubq_f32(v95, v267); + float32x4_t v758 = vaddq_f32(v181, v353); + float32x4_t v759 = vsubq_f32(v181, v353); + float32x4_t v63 = vaddq_f32(v53, v2884); + float32x4_t v106 = vaddq_f32(v96, v2894); + float32x4_t v149 = vaddq_f32(v139, v2904); + float32x4_t v192 = vaddq_f32(v182, v2914); + float32x4_t v235 = vaddq_f32(v225, v2924); + float32x4_t v278 = vaddq_f32(v268, v2934); + float32x4_t v321 = vaddq_f32(v311, v2944); + float32x4_t v364 = vaddq_f32(v354, v2954); + float32x4_t v440 = vaddq_f32(v53, v225); + float32x4_t v441 = vsubq_f32(v53, v225); + float32x4_t v442 = vaddq_f32(v139, v311); + float32x4_t v443 = vsubq_f32(v139, v311); + float32x4_t v444 = vaddq_f32(v96, v268); + float32x4_t v445 = vsubq_f32(v96, v268); + float32x4_t v446 = vaddq_f32(v182, v354); + float32x4_t v447 = vsubq_f32(v182, v354); + float32x4_t v515 = vaddq_f32(v54, v226); + float32x4_t v516 = vsubq_f32(v54, v226); + float32x4_t v517 = vaddq_f32(v140, v312); + float32x4_t v518 = vsubq_f32(v140, v312); + float32x4_t v519 = vaddq_f32(v97, v269); + float32x4_t v520 = vsubq_f32(v97, v269); + float32x4_t v521 = vaddq_f32(v183, v355); + float32x4_t v522 = vsubq_f32(v183, v355); + float32x4_t v598 = vaddq_f32(v590, v592); + float32x4_t v599 = vsubq_f32(v590, v592); + float32x4_t v600 = vaddq_f32(v594, v596); + float32x4_t v601 = vsubq_f32(v594, v596); + float32x4_t v604 = vaddq_f32(v595, v597); + float32x4_t v605 = vsubq_f32(v595, v597); + float32x4_t v640 = vrev64q_f32(v591); + float32x4_t v647 = vmulq_f32(v593, v646); + 
float32x4_t v671 = vaddq_f32(v55, v227); + float32x4_t v672 = vsubq_f32(v55, v227); + float32x4_t v673 = vaddq_f32(v141, v313); + float32x4_t v674 = vsubq_f32(v141, v313); + float32x4_t v675 = vaddq_f32(v98, v270); + float32x4_t v676 = vsubq_f32(v98, v270); + float32x4_t v677 = vaddq_f32(v184, v356); + float32x4_t v678 = vsubq_f32(v184, v356); + float32x4_t v760 = vaddq_f32(v752, v754); + float32x4_t v761 = vsubq_f32(v752, v754); + float32x4_t v762 = vaddq_f32(v756, v758); + float32x4_t v763 = vsubq_f32(v756, v758); + float32x4_t v766 = vaddq_f32(v757, v759); + float32x4_t v767 = vsubq_f32(v757, v759); + float32x4_t v802 = vrev64q_f32(v753); + float32x4_t v809 = vmulq_f32(v755, v808); + float32x4_t v365 = vaddq_f32(v63, v235); + float32x4_t v366 = vsubq_f32(v63, v235); + float32x4_t v367 = vaddq_f32(v149, v321); + float32x4_t v368 = vsubq_f32(v149, v321); + float32x4_t v369 = vaddq_f32(v106, v278); + float32x4_t v370 = vsubq_f32(v106, v278); + float32x4_t v371 = vaddq_f32(v192, v364); + float32x4_t v372 = vsubq_f32(v192, v364); + float32x4_t v448 = vaddq_f32(v440, v442); + float32x4_t v449 = vsubq_f32(v440, v442); + float32x4_t v450 = vaddq_f32(v444, v446); + float32x4_t v451 = vsubq_f32(v444, v446); + float32x4_t v454 = vaddq_f32(v445, v447); + float32x4_t v455 = vsubq_f32(v445, v447); + float32x4_t v483 = vmulq_f32(v441, v482); + float32x4_t v489 = vrev64q_f32(v443); + float32x4_t v523 = vaddq_f32(v515, v517); + float32x4_t v524 = vsubq_f32(v515, v517); + float32x4_t v525 = vaddq_f32(v519, v521); + float32x4_t v526 = vsubq_f32(v519, v521); + float32x4_t v529 = vaddq_f32(v520, v522); + float32x4_t v530 = vsubq_f32(v520, v522); + float32x4_t v558 = vmulq_f32(v516, v557); + float32x4_t v564 = vrev64q_f32(v518); + float32x4_t v602 = vaddq_f32(v598, v600); + float32x4_t v603 = vsubq_f32(v598, v600); + float32x4_t v627 = vrev64q_f32(v599); + float32x4_t v634 = vmulq_f32(v601, v646); + float32x4_t v642 = vmulq_f32(v640, v641); + float32x4_t v652 = vmulq_f32(v604, v651); 
+ float32x4_t v658 = vrev64q_f32(v605); + float32x4_t v679 = vaddq_f32(v671, v673); + float32x4_t v680 = vsubq_f32(v671, v673); + float32x4_t v681 = vaddq_f32(v675, v677); + float32x4_t v682 = vsubq_f32(v675, v677); + float32x4_t v685 = vaddq_f32(v676, v678); + float32x4_t v686 = vsubq_f32(v676, v678); + float32x4_t v721 = vrev64q_f32(v672); + float32x4_t v728 = vmulq_f32(v674, v727); + float32x4_t v764 = vaddq_f32(v760, v762); + float32x4_t v765 = vsubq_f32(v760, v762); + float32x4_t v789 = vrev64q_f32(v761); + float32x4_t v796 = vmulq_f32(v763, v808); + float32x4_t v804 = vmulq_f32(v802, v803); + float32x4_t v814 = vmulq_f32(v766, v813); + float32x4_t v820 = vrev64q_f32(v767); + float32x4_t v373 = vaddq_f32(v365, v367); + float32x4_t v374 = vsubq_f32(v365, v367); + float32x4_t v375 = vaddq_f32(v369, v371); + float32x4_t v376 = vsubq_f32(v369, v371); + float32x4_t v379 = vaddq_f32(v370, v372); + float32x4_t v380 = vsubq_f32(v370, v372); + float32x4_t v414 = vrev64q_f32(v368); + float32x4_t v452 = vaddq_f32(v448, v450); + float32x4_t v453 = vsubq_f32(v448, v450); + float32x4_t v470 = vmulq_f32(v449, v482); + float32x4_t v476 = vrev64q_f32(v451); + float32x4_t v491 = vmulq_f32(v489, v490); + float32x4_t v497 = vrev64q_f32(v454); + float32x4_t v504 = vmulq_f32(v455, v503); + float32x4_t v527 = vaddq_f32(v523, v525); + float32x4_t v528 = vsubq_f32(v523, v525); + float32x4_t v545 = vmulq_f32(v524, v557); + float32x4_t v551 = vrev64q_f32(v526); + float32x4_t v566 = vmulq_f32(v564, v565); + float32x4_t v572 = vrev64q_f32(v529); + float32x4_t v579 = vmulq_f32(v530, v578); + float32x4_t v611 = vrev64q_f32(v602); + float32x4_t v619 = vrev64q_f32(v603); + float32x4_t v629 = vmulq_f32(v627, v641); + float32x4_t v660 = vmulq_f32(v658, v659); + float32x4_t v665 = vaddq_f32(v647, v652); + float32x4_t v666 = vsubq_f32(v647, v652); + float32x4_t v683 = vaddq_f32(v679, v681); + float32x4_t v684 = vsubq_f32(v679, v681); + float32x4_t v708 = vrev64q_f32(v680); + float32x4_t v715 = 
vmulq_f32(v682, v727); + float32x4_t v723 = vmulq_f32(v721, v722); + float32x4_t v733 = vmulq_f32(v685, v732); + float32x4_t v739 = vrev64q_f32(v686); + float32x4_t v773 = vrev64q_f32(v764); + float32x4_t v781 = vrev64q_f32(v765); + float32x4_t v791 = vmulq_f32(v789, v803); + float32x4_t v822 = vmulq_f32(v820, v821); + float32x4_t v827 = vaddq_f32(v809, v814); + float32x4_t v828 = vsubq_f32(v809, v814); + float32x4_t v377 = vaddq_f32(v373, v375); + float32x4_t v378 = vsubq_f32(v373, v375); + float32x4_t v401 = vrev64q_f32(v376); + float32x4_t v416 = vmulq_f32(v414, v415); + float32x4_t v422 = vrev64q_f32(v379); + float32x4_t v429 = vmulq_f32(v380, v428); + float32x4_t v460 = vmulq_f32(v452, v482); + float32x4_t v465 = vmulq_f32(v453, v482); + float32x4_t v478 = vmulq_f32(v476, v490); + float32x4_t v499 = vmulq_f32(v497, v498); + float32x4_t v507 = vaddq_f32(v483, v504); + float32x4_t v508 = vsubq_f32(v483, v504); + float32x4_t v535 = vmulq_f32(v527, v557); + float32x4_t v540 = vmulq_f32(v528, v557); + float32x4_t v553 = vmulq_f32(v551, v565); + float32x4_t v574 = vmulq_f32(v572, v573); + float32x4_t v582 = vaddq_f32(v558, v579); + float32x4_t v583 = vsubq_f32(v558, v579); + float32x4_t v613 = vmulq_f32(v611, v641); + float32x4_t v621 = vmulq_f32(v619, v641); + float32x4_t v661 = vaddq_f32(v629, v634); + float32x4_t v662 = vsubq_f32(v629, v634); + float32x4_t v663 = vaddq_f32(v642, v660); + float32x4_t v664 = vsubq_f32(v642, v660); + float32x4_t v692 = vrev64q_f32(v683); + float32x4_t v700 = vrev64q_f32(v684); + float32x4_t v710 = vmulq_f32(v708, v722); + float32x4_t v741 = vmulq_f32(v739, v740); + float32x4_t v746 = vaddq_f32(v728, v733); + float32x4_t v747 = vsubq_f32(v728, v733); + float32x4_t v775 = vmulq_f32(v773, v803); + float32x4_t v783 = vmulq_f32(v781, v803); + float32x4_t v823 = vaddq_f32(v791, v796); + float32x4_t v824 = vsubq_f32(v791, v796); + float32x4_t v825 = vaddq_f32(v804, v822); + float32x4_t v826 = vsubq_f32(v804, v822); + float32x4_t v403 = 
vmulq_f32(v401, v415); + float32x4_t v424 = vmulq_f32(v422, v423); + float32x4_t v432 = vaddq_f32(v366, v429); + float32x4_t v433 = vsubq_f32(v366, v429); + float32x4_t v505 = vaddq_f32(v470, v478); + float32x4_t v506 = vsubq_f32(v470, v478); + float32x4_t v509 = vaddq_f32(v491, v499); + float32x4_t v510 = vsubq_f32(v491, v499); + float32x4_t v580 = vaddq_f32(v545, v553); + float32x4_t v581 = vsubq_f32(v545, v553); + float32x4_t v584 = vaddq_f32(v566, v574); + float32x4_t v585 = vsubq_f32(v566, v574); + float32x4_t v667 = vaddq_f32(v663, v665); + float32x4_t v668 = vsubq_f32(v663, v665); + float32x4_t v669 = vaddq_f32(v664, v666); + float32x4_t v670 = vsubq_f32(v664, v666); + float32x4_t v694 = vmulq_f32(v692, v722); + float32x4_t v702 = vmulq_f32(v700, v722); + float32x4_t v742 = vaddq_f32(v710, v715); + float32x4_t v743 = vsubq_f32(v710, v715); + float32x4_t v744 = vaddq_f32(v723, v741); + float32x4_t v745 = vsubq_f32(v723, v741); + float32x4_t v829 = vaddq_f32(v825, v827); + float32x4_t v830 = vsubq_f32(v825, v827); + float32x4_t v831 = vaddq_f32(v826, v828); + float32x4_t v832 = vsubq_f32(v826, v828); + float32x4_t v833 = vaddq_f32(v377, v460); + float32x4_t v1009 = vaddq_f32(v378, v465); + vst1q_f32((float32_t *)v2521, v377); + vst1q_f32((float32_t *)v2701, v378); + float32x4_t v430 = vaddq_f32(v374, v403); + float32x4_t v431 = vsubq_f32(v374, v403); + float32x4_t v434 = vaddq_f32(v416, v424); + float32x4_t v435 = vsubq_f32(v416, v424); + float32x4_t v511 = vaddq_f32(v507, v509); + float32x4_t v512 = vsubq_f32(v507, v509); + float32x4_t v513 = vaddq_f32(v508, v510); + float32x4_t v514 = vsubq_f32(v508, v510); + float32x4_t v586 = vaddq_f32(v582, v584); + float32x4_t v587 = vsubq_f32(v582, v584); + float32x4_t v588 = vaddq_f32(v583, v585); + float32x4_t v589 = vsubq_f32(v583, v585); + float32x4_t v748 = vaddq_f32(v744, v746); + float32x4_t v749 = vsubq_f32(v744, v746); + float32x4_t v750 = vaddq_f32(v745, v747); + float32x4_t v751 = vsubq_f32(v745, v747); + 
float32x4_t v834 = vaddq_f32(v833, v535); + float32x4_t v835 = vsubq_f32(v833, v535); + float32x4_t v836 = vsubq_f32(v613, v694); + float32x4_t v837 = vaddq_f32(v694, v775); + float32x4_t v924 = vsubq_f32(v662, v743); + float32x4_t v925 = vaddq_f32(v743, v824); + float32x4_t v1010 = vaddq_f32(v1009, v540); + float32x4_t v1011 = vsubq_f32(v1009, v540); + float32x4_t v1012 = vsubq_f32(v621, v702); + float32x4_t v1013 = vaddq_f32(v702, v783); + float32x4_t v1100 = vsubq_f32(v661, v742); + float32x4_t v1101 = vaddq_f32(v742, v823); + float32x4_t v436 = vaddq_f32(v432, v434); + float32x4_t v437 = vsubq_f32(v432, v434); + float32x4_t v438 = vaddq_f32(v433, v435); + float32x4_t v439 = vsubq_f32(v433, v435); + float32x4_t v838 = vaddq_f32(v834, v836); + float32x4_t v839 = vsubq_f32(v834, v836); + float32x4_t v840 = vaddq_f32(v835, v837); + float32x4_t v841 = vsubq_f32(v835, v837); + float32x4_t v880 = vsubq_f32(v668, v749); + float32x4_t v881 = vaddq_f32(v749, v830); + float32x4_t v921 = vaddq_f32(v431, v506); + float32x4_t v968 = vsubq_f32(v669, v750); + float32x4_t v969 = vaddq_f32(v750, v831); + float32x4_t v1014 = vaddq_f32(v1010, v1012); + float32x4_t v1015 = vsubq_f32(v1010, v1012); + float32x4_t v1016 = vaddq_f32(v1011, v1013); + float32x4_t v1017 = vsubq_f32(v1011, v1013); + float32x4_t v1056 = vsubq_f32(v670, v751); + float32x4_t v1057 = vaddq_f32(v751, v832); + float32x4_t v1097 = vaddq_f32(v430, v505); + float32x4_t v1144 = vsubq_f32(v667, v748); + float32x4_t v1145 = vaddq_f32(v748, v829); + vst1q_f32((float32_t *)v2611, v431); + vst1q_f32((float32_t *)v2791, v430); + float32x4_t v877 = vaddq_f32(v437, v512); + float32x4_t v922 = vaddq_f32(v921, v581); + float32x4_t v923 = vsubq_f32(v921, v581); + float32x4_t v965 = vaddq_f32(v438, v513); + float32x4_t v1053 = vaddq_f32(v439, v514); + float32x4_t v1098 = vaddq_f32(v1097, v580); + float32x4_t v1099 = vsubq_f32(v1097, v580); + float32x4_t v1141 = vaddq_f32(v436, v511); + vst1q_f32((float32_t *)v2530, v839); + 
vst1q_f32((float32_t *)v2539, v841); + vst1q_f32((float32_t *)v2548, v840); + vst1q_f32((float32_t *)v2557, v838); + vst1q_f32((float32_t *)v2566, v437); + vst1q_f32((float32_t *)v2656, v438); + vst1q_f32((float32_t *)v2710, v1015); + vst1q_f32((float32_t *)v2719, v1017); + vst1q_f32((float32_t *)v2728, v1016); + vst1q_f32((float32_t *)v2737, v1014); + vst1q_f32((float32_t *)v2746, v439); + vst1q_f32((float32_t *)v2836, v436); + float32x4_t v878 = vaddq_f32(v877, v587); + float32x4_t v879 = vsubq_f32(v877, v587); + float32x4_t v926 = vaddq_f32(v922, v924); + float32x4_t v927 = vsubq_f32(v922, v924); + float32x4_t v928 = vaddq_f32(v923, v925); + float32x4_t v929 = vsubq_f32(v923, v925); + float32x4_t v966 = vaddq_f32(v965, v588); + float32x4_t v967 = vsubq_f32(v965, v588); + float32x4_t v1054 = vaddq_f32(v1053, v589); + float32x4_t v1055 = vsubq_f32(v1053, v589); + float32x4_t v1102 = vaddq_f32(v1098, v1100); + float32x4_t v1103 = vsubq_f32(v1098, v1100); + float32x4_t v1104 = vaddq_f32(v1099, v1101); + float32x4_t v1105 = vsubq_f32(v1099, v1101); + float32x4_t v1142 = vaddq_f32(v1141, v586); + float32x4_t v1143 = vsubq_f32(v1141, v586); + float32x4_t v882 = vaddq_f32(v878, v880); + float32x4_t v883 = vsubq_f32(v878, v880); + float32x4_t v884 = vaddq_f32(v879, v881); + float32x4_t v885 = vsubq_f32(v879, v881); + float32x4_t v970 = vaddq_f32(v966, v968); + float32x4_t v971 = vsubq_f32(v966, v968); + float32x4_t v972 = vaddq_f32(v967, v969); + float32x4_t v973 = vsubq_f32(v967, v969); + float32x4_t v1058 = vaddq_f32(v1054, v1056); + float32x4_t v1059 = vsubq_f32(v1054, v1056); + float32x4_t v1060 = vaddq_f32(v1055, v1057); + float32x4_t v1061 = vsubq_f32(v1055, v1057); + float32x4_t v1146 = vaddq_f32(v1142, v1144); + float32x4_t v1147 = vsubq_f32(v1142, v1144); + float32x4_t v1148 = vaddq_f32(v1143, v1145); + float32x4_t v1149 = vsubq_f32(v1143, v1145); + vst1q_f32((float32_t *)v2620, v927); + vst1q_f32((float32_t *)v2629, v929); + vst1q_f32((float32_t *)v2638, v928); 
+ vst1q_f32((float32_t *)v2647, v926); + vst1q_f32((float32_t *)v2800, v1103); + vst1q_f32((float32_t *)v2809, v1105); + vst1q_f32((float32_t *)v2818, v1104); + vst1q_f32((float32_t *)v2827, v1102); + vst1q_f32((float32_t *)v2575, v883); + vst1q_f32((float32_t *)v2584, v885); + vst1q_f32((float32_t *)v2593, v884); + vst1q_f32((float32_t *)v2602, v882); + vst1q_f32((float32_t *)v2665, v971); + vst1q_f32((float32_t *)v2674, v973); + vst1q_f32((float32_t *)v2683, v972); + vst1q_f32((float32_t *)v2692, v970); + vst1q_f32((float32_t *)v2755, v1059); + vst1q_f32((float32_t *)v2764, v1061); + vst1q_f32((float32_t *)v2773, v1060); + vst1q_f32((float32_t *)v2782, v1058); + vst1q_f32((float32_t *)v2845, v1147); + vst1q_f32((float32_t *)v2854, v1149); + vst1q_f32((float32_t *)v2863, v1148); + vst1q_f32((float32_t *)v2872, v1146); + v5 += 2 * 1; + v6 += 2 * 1; + } + for (int j = v1185 * 2; j < howmany; j += 1) { + float32x2_t v1379 = v5[istride]; + float v1496 = 1.0000000000000000e+00F; + float v1497 = -1.0000000000000000e+00F; + float v1504 = -7.0710678118654746e-01F; + float v1511 = 7.0710678118654757e-01F; + float v1563 = -1.2500000000000000e+00F; + float v1564 = 1.2500000000000000e+00F; + float v1571 = 8.8388347648318433e-01F; + float v1578 = -8.8388347648318444e-01F; + float v1630 = 5.5901699437494745e-01F; + float v1631 = -5.5901699437494745e-01F; + float v1638 = -3.9528470752104738e-01F; + float v1645 = 3.9528470752104744e-01F; + float v1699 = 1.5388417685876268e+00F; + float v1707 = -1.5388417685876268e+00F; + float v1714 = 1.0881254497414108e+00F; + float v1715 = -1.0881254497414108e+00F; + float v1772 = 5.8778525229247325e-01F; + float v1780 = -5.8778525229247325e-01F; + float v1787 = 4.1562693777745352e-01F; + float v1788 = -4.1562693777745352e-01F; + float v1845 = 3.6327126400268028e-01F; + float v1853 = -3.6327126400268028e-01F; + float v1860 = 2.5687157418650380e-01F; + float v1861 = -2.5687157418650380e-01F; + float32x2_t v1863 = (float32x2_t){v4, v4}; + 
float32x2_t v1224 = v5[0]; + float32x2_t v1498 = (float32x2_t){v1496, v1497}; + float32x2_t v1505 = (float32x2_t){v1511, v1504}; + float32x2_t v1512 = (float32x2_t){v1511, v1511}; + float32x2_t v1561 = (float32x2_t){v1563, v1563}; + float32x2_t v1565 = (float32x2_t){v1563, v1564}; + float32x2_t v1572 = (float32x2_t){v1578, v1571}; + float32x2_t v1579 = (float32x2_t){v1578, v1578}; + float32x2_t v1628 = (float32x2_t){v1630, v1630}; + float32x2_t v1632 = (float32x2_t){v1630, v1631}; + float32x2_t v1639 = (float32x2_t){v1645, v1638}; + float32x2_t v1646 = (float32x2_t){v1645, v1645}; + float32x2_t v1701 = (float32x2_t){v1699, v1707}; + float32x2_t v1708 = (float32x2_t){v1707, v1707}; + float32x2_t v1712 = (float32x2_t){v1715, v1715}; + float32x2_t v1716 = (float32x2_t){v1714, v1715}; + float32x2_t v1774 = (float32x2_t){v1772, v1780}; + float32x2_t v1781 = (float32x2_t){v1780, v1780}; + float32x2_t v1785 = (float32x2_t){v1788, v1788}; + float32x2_t v1789 = (float32x2_t){v1787, v1788}; + float32x2_t v1847 = (float32x2_t){v1845, v1853}; + float32x2_t v1854 = (float32x2_t){v1853, v1853}; + float32x2_t v1858 = (float32x2_t){v1861, v1861}; + float32x2_t v1862 = (float32x2_t){v1860, v1861}; + float32x2_t v1197 = v5[istride * 8]; + float32x2_t v1202 = v5[istride * 32]; + float32x2_t v1209 = v5[istride * 24]; + float32x2_t v1214 = v5[istride * 16]; + float32x2_t v1230 = v5[istride * 13]; + float32x2_t v1235 = v5[istride * 37]; + float32x2_t v1242 = v5[istride * 29]; + float32x2_t v1247 = v5[istride * 21]; + float32x2_t v1257 = v5[istride * 5]; + float32x2_t v1263 = v5[istride * 18]; + float32x2_t v1268 = v5[istride * 2]; + float32x2_t v1275 = v5[istride * 34]; + float32x2_t v1280 = v5[istride * 26]; + float32x2_t v1290 = v5[istride * 10]; + float32x2_t v1296 = v5[istride * 23]; + float32x2_t v1301 = v5[istride * 7]; + float32x2_t v1308 = v5[istride * 39]; + float32x2_t v1313 = v5[istride * 31]; + float32x2_t v1323 = v5[istride * 15]; + float32x2_t v1329 = v5[istride * 28]; + 
float32x2_t v1334 = v5[istride * 12]; + float32x2_t v1341 = v5[istride * 4]; + float32x2_t v1346 = v5[istride * 36]; + float32x2_t v1356 = v5[istride * 20]; + float32x2_t v1362 = v5[istride * 33]; + float32x2_t v1367 = v5[istride * 17]; + float32x2_t v1374 = v5[istride * 9]; + float32x2_t v1389 = v5[istride * 25]; + float32x2_t v1395 = v5[istride * 38]; + float32x2_t v1400 = v5[istride * 22]; + float32x2_t v1407 = v5[istride * 14]; + float32x2_t v1412 = v5[istride * 6]; + float32x2_t v1422 = v5[istride * 30]; + float32x2_t v1428 = v5[istride * 3]; + float32x2_t v1433 = v5[istride * 27]; + float32x2_t v1440 = v5[istride * 19]; + float32x2_t v1445 = v5[istride * 11]; + float32x2_t v1455 = v5[istride * 35]; + float32x2_t v1500 = vmul_f32(v1863, v1498); + float32x2_t v1507 = vmul_f32(v1863, v1505); + float32x2_t v1567 = vmul_f32(v1863, v1565); + float32x2_t v1574 = vmul_f32(v1863, v1572); + float32x2_t v1634 = vmul_f32(v1863, v1632); + float32x2_t v1641 = vmul_f32(v1863, v1639); + float32x2_t v1703 = vmul_f32(v1863, v1701); + float32x2_t v1718 = vmul_f32(v1863, v1716); + float32x2_t v1776 = vmul_f32(v1863, v1774); + float32x2_t v1791 = vmul_f32(v1863, v1789); + float32x2_t v1849 = vmul_f32(v1863, v1847); + float32x2_t v1864 = vmul_f32(v1863, v1862); + float32x2_t v1203 = vadd_f32(v1197, v1202); + float32x2_t v1204 = vsub_f32(v1197, v1202); + float32x2_t v1215 = vadd_f32(v1209, v1214); + float32x2_t v1216 = vsub_f32(v1209, v1214); + float32x2_t v1236 = vadd_f32(v1230, v1235); + float32x2_t v1237 = vsub_f32(v1230, v1235); + float32x2_t v1248 = vadd_f32(v1242, v1247); + float32x2_t v1249 = vsub_f32(v1242, v1247); + float32x2_t v1269 = vadd_f32(v1263, v1268); + float32x2_t v1270 = vsub_f32(v1263, v1268); + float32x2_t v1281 = vadd_f32(v1275, v1280); + float32x2_t v1282 = vsub_f32(v1275, v1280); + float32x2_t v1302 = vadd_f32(v1296, v1301); + float32x2_t v1303 = vsub_f32(v1296, v1301); + float32x2_t v1314 = vadd_f32(v1308, v1313); + float32x2_t v1315 = vsub_f32(v1308, 
v1313); + float32x2_t v1335 = vadd_f32(v1329, v1334); + float32x2_t v1336 = vsub_f32(v1329, v1334); + float32x2_t v1347 = vadd_f32(v1341, v1346); + float32x2_t v1348 = vsub_f32(v1341, v1346); + float32x2_t v1368 = vadd_f32(v1362, v1367); + float32x2_t v1369 = vsub_f32(v1362, v1367); + float32x2_t v1380 = vadd_f32(v1374, v1379); + float32x2_t v1381 = vsub_f32(v1374, v1379); + float32x2_t v1401 = vadd_f32(v1395, v1400); + float32x2_t v1402 = vsub_f32(v1395, v1400); + float32x2_t v1413 = vadd_f32(v1407, v1412); + float32x2_t v1414 = vsub_f32(v1407, v1412); + float32x2_t v1434 = vadd_f32(v1428, v1433); + float32x2_t v1435 = vsub_f32(v1428, v1433); + float32x2_t v1446 = vadd_f32(v1440, v1445); + float32x2_t v1447 = vsub_f32(v1440, v1445); + float32x2_t v1217 = vadd_f32(v1203, v1215); + float32x2_t v1218 = vsub_f32(v1203, v1215); + float32x2_t v1219 = vadd_f32(v1204, v1216); + float32x2_t v1250 = vadd_f32(v1236, v1248); + float32x2_t v1251 = vsub_f32(v1236, v1248); + float32x2_t v1252 = vadd_f32(v1237, v1249); + float32x2_t v1283 = vadd_f32(v1269, v1281); + float32x2_t v1284 = vsub_f32(v1269, v1281); + float32x2_t v1285 = vadd_f32(v1270, v1282); + float32x2_t v1316 = vadd_f32(v1302, v1314); + float32x2_t v1317 = vsub_f32(v1302, v1314); + float32x2_t v1318 = vadd_f32(v1303, v1315); + float32x2_t v1349 = vadd_f32(v1335, v1347); + float32x2_t v1350 = vsub_f32(v1335, v1347); + float32x2_t v1351 = vadd_f32(v1336, v1348); + float32x2_t v1382 = vadd_f32(v1368, v1380); + float32x2_t v1383 = vsub_f32(v1368, v1380); + float32x2_t v1384 = vadd_f32(v1369, v1381); + float32x2_t v1415 = vadd_f32(v1401, v1413); + float32x2_t v1416 = vsub_f32(v1401, v1413); + float32x2_t v1417 = vadd_f32(v1402, v1414); + float32x2_t v1448 = vadd_f32(v1434, v1446); + float32x2_t v1449 = vsub_f32(v1434, v1446); + float32x2_t v1450 = vadd_f32(v1435, v1447); + float32x2_t v1658 = vadd_f32(v1204, v1336); + float32x2_t v1659 = vsub_f32(v1204, v1336); + float32x2_t v1660 = vadd_f32(v1270, v1402); + float32x2_t 
v1661 = vsub_f32(v1270, v1402); + float32x2_t v1662 = vadd_f32(v1237, v1369); + float32x2_t v1663 = vsub_f32(v1237, v1369); + float32x2_t v1664 = vadd_f32(v1303, v1435); + float32x2_t v1665 = vsub_f32(v1303, v1435); + float32x2_t v1804 = vadd_f32(v1216, v1348); + float32x2_t v1805 = vsub_f32(v1216, v1348); + float32x2_t v1806 = vadd_f32(v1282, v1414); + float32x2_t v1807 = vsub_f32(v1282, v1414); + float32x2_t v1808 = vadd_f32(v1249, v1381); + float32x2_t v1809 = vsub_f32(v1249, v1381); + float32x2_t v1810 = vadd_f32(v1315, v1447); + float32x2_t v1811 = vsub_f32(v1315, v1447); + float32x2_t v1225 = vadd_f32(v1217, v1224); + float32x2_t v1258 = vadd_f32(v1250, v1257); + float32x2_t v1291 = vadd_f32(v1283, v1290); + float32x2_t v1324 = vadd_f32(v1316, v1323); + float32x2_t v1357 = vadd_f32(v1349, v1356); + float32x2_t v1390 = vadd_f32(v1382, v1389); + float32x2_t v1423 = vadd_f32(v1415, v1422); + float32x2_t v1456 = vadd_f32(v1448, v1455); + float32x2_t v1524 = vadd_f32(v1217, v1349); + float32x2_t v1525 = vsub_f32(v1217, v1349); + float32x2_t v1526 = vadd_f32(v1283, v1415); + float32x2_t v1527 = vsub_f32(v1283, v1415); + float32x2_t v1528 = vadd_f32(v1250, v1382); + float32x2_t v1529 = vsub_f32(v1250, v1382); + float32x2_t v1530 = vadd_f32(v1316, v1448); + float32x2_t v1531 = vsub_f32(v1316, v1448); + float32x2_t v1591 = vadd_f32(v1218, v1350); + float32x2_t v1592 = vsub_f32(v1218, v1350); + float32x2_t v1593 = vadd_f32(v1284, v1416); + float32x2_t v1594 = vsub_f32(v1284, v1416); + float32x2_t v1595 = vadd_f32(v1251, v1383); + float32x2_t v1596 = vsub_f32(v1251, v1383); + float32x2_t v1597 = vadd_f32(v1317, v1449); + float32x2_t v1598 = vsub_f32(v1317, v1449); + float32x2_t v1666 = vadd_f32(v1658, v1660); + float32x2_t v1667 = vsub_f32(v1658, v1660); + float32x2_t v1668 = vadd_f32(v1662, v1664); + float32x2_t v1669 = vsub_f32(v1662, v1664); + float32x2_t v1672 = vadd_f32(v1663, v1665); + float32x2_t v1673 = vsub_f32(v1663, v1665); + float32x2_t v1704 = 
vrev64_f32(v1659); + float32x2_t v1709 = vmul_f32(v1661, v1708); + float32x2_t v1731 = vadd_f32(v1219, v1351); + float32x2_t v1732 = vsub_f32(v1219, v1351); + float32x2_t v1733 = vadd_f32(v1285, v1417); + float32x2_t v1734 = vsub_f32(v1285, v1417); + float32x2_t v1735 = vadd_f32(v1252, v1384); + float32x2_t v1736 = vsub_f32(v1252, v1384); + float32x2_t v1737 = vadd_f32(v1318, v1450); + float32x2_t v1738 = vsub_f32(v1318, v1450); + float32x2_t v1812 = vadd_f32(v1804, v1806); + float32x2_t v1813 = vsub_f32(v1804, v1806); + float32x2_t v1814 = vadd_f32(v1808, v1810); + float32x2_t v1815 = vsub_f32(v1808, v1810); + float32x2_t v1818 = vadd_f32(v1809, v1811); + float32x2_t v1819 = vsub_f32(v1809, v1811); + float32x2_t v1850 = vrev64_f32(v1805); + float32x2_t v1855 = vmul_f32(v1807, v1854); + float32x2_t v1457 = vadd_f32(v1225, v1357); + float32x2_t v1458 = vsub_f32(v1225, v1357); + float32x2_t v1459 = vadd_f32(v1291, v1423); + float32x2_t v1460 = vsub_f32(v1291, v1423); + float32x2_t v1461 = vadd_f32(v1258, v1390); + float32x2_t v1462 = vsub_f32(v1258, v1390); + float32x2_t v1463 = vadd_f32(v1324, v1456); + float32x2_t v1464 = vsub_f32(v1324, v1456); + float32x2_t v1532 = vadd_f32(v1524, v1526); + float32x2_t v1533 = vsub_f32(v1524, v1526); + float32x2_t v1534 = vadd_f32(v1528, v1530); + float32x2_t v1535 = vsub_f32(v1528, v1530); + float32x2_t v1538 = vadd_f32(v1529, v1531); + float32x2_t v1539 = vsub_f32(v1529, v1531); + float32x2_t v1562 = vmul_f32(v1525, v1561); + float32x2_t v1568 = vrev64_f32(v1527); + float32x2_t v1599 = vadd_f32(v1591, v1593); + float32x2_t v1600 = vsub_f32(v1591, v1593); + float32x2_t v1601 = vadd_f32(v1595, v1597); + float32x2_t v1602 = vsub_f32(v1595, v1597); + float32x2_t v1605 = vadd_f32(v1596, v1598); + float32x2_t v1606 = vsub_f32(v1596, v1598); + float32x2_t v1629 = vmul_f32(v1592, v1628); + float32x2_t v1635 = vrev64_f32(v1594); + float32x2_t v1670 = vadd_f32(v1666, v1668); + float32x2_t v1671 = vsub_f32(v1666, v1668); + float32x2_t 
v1693 = vrev64_f32(v1667); + float32x2_t v1698 = vmul_f32(v1669, v1708); + float32x2_t v1705 = vmul_f32(v1704, v1703); + float32x2_t v1713 = vmul_f32(v1672, v1712); + float32x2_t v1719 = vrev64_f32(v1673); + float32x2_t v1739 = vadd_f32(v1731, v1733); + float32x2_t v1740 = vsub_f32(v1731, v1733); + float32x2_t v1741 = vadd_f32(v1735, v1737); + float32x2_t v1742 = vsub_f32(v1735, v1737); + float32x2_t v1745 = vadd_f32(v1736, v1738); + float32x2_t v1746 = vsub_f32(v1736, v1738); + float32x2_t v1777 = vrev64_f32(v1732); + float32x2_t v1782 = vmul_f32(v1734, v1781); + float32x2_t v1816 = vadd_f32(v1812, v1814); + float32x2_t v1817 = vsub_f32(v1812, v1814); + float32x2_t v1839 = vrev64_f32(v1813); + float32x2_t v1844 = vmul_f32(v1815, v1854); + float32x2_t v1851 = vmul_f32(v1850, v1849); + float32x2_t v1859 = vmul_f32(v1818, v1858); + float32x2_t v1865 = vrev64_f32(v1819); + float32x2_t v1465 = vadd_f32(v1457, v1459); + float32x2_t v1466 = vsub_f32(v1457, v1459); + float32x2_t v1467 = vadd_f32(v1461, v1463); + float32x2_t v1468 = vsub_f32(v1461, v1463); + float32x2_t v1471 = vadd_f32(v1462, v1464); + float32x2_t v1472 = vsub_f32(v1462, v1464); + float32x2_t v1501 = vrev64_f32(v1460); + float32x2_t v1536 = vadd_f32(v1532, v1534); + float32x2_t v1537 = vsub_f32(v1532, v1534); + float32x2_t v1551 = vmul_f32(v1533, v1561); + float32x2_t v1557 = vrev64_f32(v1535); + float32x2_t v1569 = vmul_f32(v1568, v1567); + float32x2_t v1575 = vrev64_f32(v1538); + float32x2_t v1580 = vmul_f32(v1539, v1579); + float32x2_t v1603 = vadd_f32(v1599, v1601); + float32x2_t v1604 = vsub_f32(v1599, v1601); + float32x2_t v1618 = vmul_f32(v1600, v1628); + float32x2_t v1624 = vrev64_f32(v1602); + float32x2_t v1636 = vmul_f32(v1635, v1634); + float32x2_t v1642 = vrev64_f32(v1605); + float32x2_t v1647 = vmul_f32(v1606, v1646); + float32x2_t v1679 = vrev64_f32(v1670); + float32x2_t v1686 = vrev64_f32(v1671); + float32x2_t v1694 = vmul_f32(v1693, v1703); + float32x2_t v1720 = vmul_f32(v1719, v1718); + 
float32x2_t v1725 = vadd_f32(v1709, v1713); + float32x2_t v1726 = vsub_f32(v1709, v1713); + float32x2_t v1743 = vadd_f32(v1739, v1741); + float32x2_t v1744 = vsub_f32(v1739, v1741); + float32x2_t v1766 = vrev64_f32(v1740); + float32x2_t v1771 = vmul_f32(v1742, v1781); + float32x2_t v1778 = vmul_f32(v1777, v1776); + float32x2_t v1786 = vmul_f32(v1745, v1785); + float32x2_t v1792 = vrev64_f32(v1746); + float32x2_t v1825 = vrev64_f32(v1816); + float32x2_t v1832 = vrev64_f32(v1817); + float32x2_t v1840 = vmul_f32(v1839, v1849); + float32x2_t v1866 = vmul_f32(v1865, v1864); + float32x2_t v1871 = vadd_f32(v1855, v1859); + float32x2_t v1872 = vsub_f32(v1855, v1859); + float32x2_t v1469 = vadd_f32(v1465, v1467); + float32x2_t v1470 = vsub_f32(v1465, v1467); + float32x2_t v1490 = vrev64_f32(v1468); + float32x2_t v1502 = vmul_f32(v1501, v1500); + float32x2_t v1508 = vrev64_f32(v1471); + float32x2_t v1513 = vmul_f32(v1472, v1512); + float32x2_t v1543 = vmul_f32(v1536, v1561); + float32x2_t v1547 = vmul_f32(v1537, v1561); + float32x2_t v1558 = vmul_f32(v1557, v1567); + float32x2_t v1576 = vmul_f32(v1575, v1574); + float32x2_t v1583 = vadd_f32(v1562, v1580); + float32x2_t v1584 = vsub_f32(v1562, v1580); + float32x2_t v1610 = vmul_f32(v1603, v1628); + float32x2_t v1614 = vmul_f32(v1604, v1628); + float32x2_t v1625 = vmul_f32(v1624, v1634); + float32x2_t v1643 = vmul_f32(v1642, v1641); + float32x2_t v1650 = vadd_f32(v1629, v1647); + float32x2_t v1651 = vsub_f32(v1629, v1647); + float32x2_t v1680 = vmul_f32(v1679, v1703); + float32x2_t v1687 = vmul_f32(v1686, v1703); + float32x2_t v1721 = vadd_f32(v1694, v1698); + float32x2_t v1722 = vsub_f32(v1694, v1698); + float32x2_t v1723 = vadd_f32(v1705, v1720); + float32x2_t v1724 = vsub_f32(v1705, v1720); + float32x2_t v1752 = vrev64_f32(v1743); + float32x2_t v1759 = vrev64_f32(v1744); + float32x2_t v1767 = vmul_f32(v1766, v1776); + float32x2_t v1793 = vmul_f32(v1792, v1791); + float32x2_t v1798 = vadd_f32(v1782, v1786); + float32x2_t 
v1799 = vsub_f32(v1782, v1786); + float32x2_t v1826 = vmul_f32(v1825, v1849); + float32x2_t v1833 = vmul_f32(v1832, v1849); + float32x2_t v1867 = vadd_f32(v1840, v1844); + float32x2_t v1868 = vsub_f32(v1840, v1844); + float32x2_t v1869 = vadd_f32(v1851, v1866); + float32x2_t v1870 = vsub_f32(v1851, v1866); + float32x2_t v1491 = vmul_f32(v1490, v1500); + float32x2_t v1509 = vmul_f32(v1508, v1507); + float32x2_t v1516 = vadd_f32(v1458, v1513); + float32x2_t v1517 = vsub_f32(v1458, v1513); + float32x2_t v1581 = vadd_f32(v1551, v1558); + float32x2_t v1582 = vsub_f32(v1551, v1558); + float32x2_t v1585 = vadd_f32(v1569, v1576); + float32x2_t v1586 = vsub_f32(v1569, v1576); + float32x2_t v1648 = vadd_f32(v1618, v1625); + float32x2_t v1649 = vsub_f32(v1618, v1625); + float32x2_t v1652 = vadd_f32(v1636, v1643); + float32x2_t v1653 = vsub_f32(v1636, v1643); + float32x2_t v1727 = vadd_f32(v1723, v1725); + float32x2_t v1728 = vsub_f32(v1723, v1725); + float32x2_t v1729 = vadd_f32(v1724, v1726); + float32x2_t v1730 = vsub_f32(v1724, v1726); + float32x2_t v1753 = vmul_f32(v1752, v1776); + float32x2_t v1760 = vmul_f32(v1759, v1776); + float32x2_t v1794 = vadd_f32(v1767, v1771); + float32x2_t v1795 = vsub_f32(v1767, v1771); + float32x2_t v1796 = vadd_f32(v1778, v1793); + float32x2_t v1797 = vsub_f32(v1778, v1793); + float32x2_t v1873 = vadd_f32(v1869, v1871); + float32x2_t v1874 = vsub_f32(v1869, v1871); + float32x2_t v1875 = vadd_f32(v1870, v1872); + float32x2_t v1876 = vsub_f32(v1870, v1872); + float32x2_t v1877 = vadd_f32(v1469, v1543); + v6[0] = v1469; + float32x2_t v2013 = vadd_f32(v1470, v1547); + v6[ostride * 20] = v1470; + float32x2_t v1514 = vadd_f32(v1466, v1491); + float32x2_t v1515 = vsub_f32(v1466, v1491); + float32x2_t v1518 = vadd_f32(v1502, v1509); + float32x2_t v1519 = vsub_f32(v1502, v1509); + float32x2_t v1587 = vadd_f32(v1583, v1585); + float32x2_t v1588 = vsub_f32(v1583, v1585); + float32x2_t v1589 = vadd_f32(v1584, v1586); + float32x2_t v1590 = 
vsub_f32(v1584, v1586); + float32x2_t v1654 = vadd_f32(v1650, v1652); + float32x2_t v1655 = vsub_f32(v1650, v1652); + float32x2_t v1656 = vadd_f32(v1651, v1653); + float32x2_t v1657 = vsub_f32(v1651, v1653); + float32x2_t v1800 = vadd_f32(v1796, v1798); + float32x2_t v1801 = vsub_f32(v1796, v1798); + float32x2_t v1802 = vadd_f32(v1797, v1799); + float32x2_t v1803 = vsub_f32(v1797, v1799); + float32x2_t v1878 = vadd_f32(v1877, v1610); + float32x2_t v1879 = vsub_f32(v1877, v1610); + float32x2_t v1880 = vsub_f32(v1680, v1753); + float32x2_t v1881 = vadd_f32(v1753, v1826); + float32x2_t v1948 = vsub_f32(v1722, v1795); + float32x2_t v1949 = vadd_f32(v1795, v1868); + float32x2_t v2014 = vadd_f32(v2013, v1614); + float32x2_t v2015 = vsub_f32(v2013, v1614); + float32x2_t v2016 = vsub_f32(v1687, v1760); + float32x2_t v2017 = vadd_f32(v1760, v1833); + float32x2_t v2084 = vsub_f32(v1721, v1794); + float32x2_t v2085 = vadd_f32(v1794, v1867); + float32x2_t v1520 = vadd_f32(v1516, v1518); + float32x2_t v1521 = vsub_f32(v1516, v1518); + float32x2_t v1522 = vadd_f32(v1517, v1519); + float32x2_t v1523 = vsub_f32(v1517, v1519); + float32x2_t v1882 = vadd_f32(v1878, v1880); + float32x2_t v1883 = vsub_f32(v1878, v1880); + float32x2_t v1884 = vadd_f32(v1879, v1881); + float32x2_t v1885 = vsub_f32(v1879, v1881); + float32x2_t v1914 = vsub_f32(v1728, v1801); + float32x2_t v1915 = vadd_f32(v1801, v1874); + float32x2_t v1945 = vadd_f32(v1515, v1582); + v6[ostride * 10] = v1515; + float32x2_t v1982 = vsub_f32(v1729, v1802); + float32x2_t v1983 = vadd_f32(v1802, v1875); + float32x2_t v2018 = vadd_f32(v2014, v2016); + float32x2_t v2019 = vsub_f32(v2014, v2016); + float32x2_t v2020 = vadd_f32(v2015, v2017); + float32x2_t v2021 = vsub_f32(v2015, v2017); + float32x2_t v2050 = vsub_f32(v1730, v1803); + float32x2_t v2051 = vadd_f32(v1803, v1876); + float32x2_t v2081 = vadd_f32(v1514, v1581); + v6[ostride * 30] = v1514; + float32x2_t v2118 = vsub_f32(v1727, v1800); + float32x2_t v2119 = 
vadd_f32(v1800, v1873); + v6[ostride * 16] = v1883; + v6[ostride * 32] = v1885; + v6[ostride * 8] = v1884; + v6[ostride * 24] = v1882; + float32x2_t v1911 = vadd_f32(v1521, v1588); + v6[ostride * 25] = v1521; + float32x2_t v1946 = vadd_f32(v1945, v1649); + float32x2_t v1947 = vsub_f32(v1945, v1649); + float32x2_t v1979 = vadd_f32(v1522, v1589); + v6[ostride * 35] = v1522; + v6[ostride * 36] = v2019; + v6[ostride * 12] = v2021; + v6[ostride * 28] = v2020; + v6[ostride * 4] = v2018; + float32x2_t v2047 = vadd_f32(v1523, v1590); + v6[ostride * 5] = v1523; + float32x2_t v2082 = vadd_f32(v2081, v1648); + float32x2_t v2083 = vsub_f32(v2081, v1648); + float32x2_t v2115 = vadd_f32(v1520, v1587); + v6[ostride * 15] = v1520; + float32x2_t v1912 = vadd_f32(v1911, v1655); + float32x2_t v1913 = vsub_f32(v1911, v1655); + float32x2_t v1950 = vadd_f32(v1946, v1948); + float32x2_t v1951 = vsub_f32(v1946, v1948); + float32x2_t v1952 = vadd_f32(v1947, v1949); + float32x2_t v1953 = vsub_f32(v1947, v1949); + float32x2_t v1980 = vadd_f32(v1979, v1656); + float32x2_t v1981 = vsub_f32(v1979, v1656); + float32x2_t v2048 = vadd_f32(v2047, v1657); + float32x2_t v2049 = vsub_f32(v2047, v1657); + float32x2_t v2086 = vadd_f32(v2082, v2084); + float32x2_t v2087 = vsub_f32(v2082, v2084); + float32x2_t v2088 = vadd_f32(v2083, v2085); + float32x2_t v2089 = vsub_f32(v2083, v2085); + float32x2_t v2116 = vadd_f32(v2115, v1654); + float32x2_t v2117 = vsub_f32(v2115, v1654); + float32x2_t v1916 = vadd_f32(v1912, v1914); + float32x2_t v1917 = vsub_f32(v1912, v1914); + float32x2_t v1918 = vadd_f32(v1913, v1915); + float32x2_t v1919 = vsub_f32(v1913, v1915); + v6[ostride * 26] = v1951; + v6[ostride * 2] = v1953; + v6[ostride * 18] = v1952; + v6[ostride * 34] = v1950; + float32x2_t v1984 = vadd_f32(v1980, v1982); + float32x2_t v1985 = vsub_f32(v1980, v1982); + float32x2_t v1986 = vadd_f32(v1981, v1983); + float32x2_t v1987 = vsub_f32(v1981, v1983); + float32x2_t v2052 = vadd_f32(v2048, v2050); + float32x2_t 
v2053 = vsub_f32(v2048, v2050); + float32x2_t v2054 = vadd_f32(v2049, v2051); + float32x2_t v2055 = vsub_f32(v2049, v2051); + v6[ostride * 6] = v2087; + v6[ostride * 22] = v2089; + v6[ostride * 38] = v2088; + v6[ostride * 14] = v2086; + float32x2_t v2120 = vadd_f32(v2116, v2118); + float32x2_t v2121 = vsub_f32(v2116, v2118); + float32x2_t v2122 = vadd_f32(v2117, v2119); + float32x2_t v2123 = vsub_f32(v2117, v2119); + v6[ostride] = v1917; + v6[ostride * 17] = v1919; + v6[ostride * 33] = v1918; + v6[ostride * 9] = v1916; + v6[ostride * 11] = v1985; + v6[ostride * 27] = v1987; + v6[ostride * 3] = v1986; + v6[ostride * 19] = v1984; + v6[ostride * 21] = v2053; + v6[ostride * 37] = v2055; + v6[ostride * 13] = v2054; + v6[ostride * 29] = v2052; + v6[ostride * 31] = v2121; + v6[ostride * 7] = v2123; + v6[ostride * 23] = v2122; + v6[ostride * 39] = v2120; + v5 += 1 * 1; + v6 += 1 * 1; + } +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uu40(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + int64_t v8 = howmany; + int64_t v10 = svcntd(); + int64_t v11 = v10 * 1; + int64_t v12 = v10 * 1; + for (int j = 0; j < v8; j += v10) { + svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); + float v406 = -1.0000000000000000e+00F; + float v413 = -7.0710678118654746e-01F; + float v420 = 7.0710678118654757e-01F; + float v473 = -1.2500000000000000e+00F; + float v478 = 1.2500000000000000e+00F; + float v485 = 8.8388347648318433e-01F; + float v492 = -8.8388347648318444e-01F; + float v545 = 5.5901699437494745e-01F; + float v550 = -5.5901699437494745e-01F; + float v557 = -3.9528470752104738e-01F; + float v564 = 3.9528470752104744e-01F; + float v628 = -1.5388417685876268e+00F; + float v638 = -1.0881254497414108e+00F; + float v704 = 
-5.8778525229247325e-01F; + float v714 = -4.1562693777745352e-01F; + float v780 = -3.6327126400268028e-01F; + float v790 = -2.5687157418650380e-01F; + const float32x2_t *v1417 = &v5[v0]; + float32x2_t *v1628 = &v6[v2]; + int64_t v19 = v0 * 8; + int64_t v26 = v0 * 32; + int64_t v35 = v0 * 24; + int64_t v42 = v0 * 16; + int64_t v62 = v0 * 13; + int64_t v69 = v0 * 37; + int64_t v78 = v0 * 29; + int64_t v85 = v0 * 21; + int64_t v97 = v0 * 5; + int64_t v105 = v0 * 18; + int64_t v112 = v0 * 2; + int64_t v121 = v0 * 34; + int64_t v128 = v0 * 26; + int64_t v140 = v0 * 10; + int64_t v148 = v0 * 23; + int64_t v155 = v0 * 7; + int64_t v164 = v0 * 39; + int64_t v171 = v0 * 31; + int64_t v183 = v0 * 15; + int64_t v191 = v0 * 28; + int64_t v198 = v0 * 12; + int64_t v207 = v0 * 4; + int64_t v214 = v0 * 36; + int64_t v226 = v0 * 20; + int64_t v234 = v0 * 33; + int64_t v241 = v0 * 17; + int64_t v250 = v0 * 9; + int64_t v269 = v0 * 25; + int64_t v277 = v0 * 38; + int64_t v284 = v0 * 22; + int64_t v293 = v0 * 14; + int64_t v300 = v0 * 6; + int64_t v312 = v0 * 30; + int64_t v320 = v0 * 3; + int64_t v327 = v0 * 27; + int64_t v336 = v0 * 19; + int64_t v343 = v0 * 11; + int64_t v355 = v0 * 35; + float v409 = v4 * v406; + float v416 = v4 * v413; + float v481 = v4 * v478; + float v488 = v4 * v485; + float v553 = v4 * v550; + float v560 = v4 * v557; + float v624 = v4 * v628; + float v641 = v4 * v638; + float v700 = v4 * v704; + float v717 = v4 * v714; + float v776 = v4 * v780; + float v793 = v4 * v790; + int64_t v823 = v2 * 16; + int64_t v830 = v2 * 32; + int64_t v837 = v2 * 8; + int64_t v844 = v2 * 24; + int64_t v860 = v2 * 25; + int64_t v874 = v2 * 17; + int64_t v881 = v2 * 33; + int64_t v888 = v2 * 9; + int64_t v904 = v2 * 10; + int64_t v911 = v2 * 26; + int64_t v918 = v2 * 2; + int64_t v925 = v2 * 18; + int64_t v932 = v2 * 34; + int64_t v948 = v2 * 35; + int64_t v955 = v2 * 11; + int64_t v962 = v2 * 27; + int64_t v969 = v2 * 3; + int64_t v976 = v2 * 19; + int64_t v992 = v2 * 20; + 
int64_t v999 = v2 * 36; + int64_t v1006 = v2 * 12; + int64_t v1013 = v2 * 28; + int64_t v1020 = v2 * 4; + int64_t v1036 = v2 * 5; + int64_t v1043 = v2 * 21; + int64_t v1050 = v2 * 37; + int64_t v1057 = v2 * 13; + int64_t v1064 = v2 * 29; + int64_t v1080 = v2 * 30; + int64_t v1087 = v2 * 6; + int64_t v1094 = v2 * 22; + int64_t v1101 = v2 * 38; + int64_t v1108 = v2 * 14; + int64_t v1124 = v2 * 15; + int64_t v1131 = v2 * 31; + int64_t v1138 = v2 * 7; + int64_t v1145 = v2 * 23; + int64_t v1152 = v2 * 39; + const float32x2_t *v1201 = &v5[0]; + svfloat32_t v1526 = svdup_n_f32(v420); + svfloat32_t v1531 = svdup_n_f32(v473); + svfloat32_t v1534 = svdup_n_f32(v492); + svfloat32_t v1539 = svdup_n_f32(v545); + svfloat32_t v1542 = svdup_n_f32(v564); + svfloat32_t v1548 = svdup_n_f32(v628); + svfloat32_t v1549 = svdup_n_f32(v638); + svfloat32_t v1556 = svdup_n_f32(v704); + svfloat32_t v1557 = svdup_n_f32(v714); + svfloat32_t v1564 = svdup_n_f32(v780); + svfloat32_t v1565 = svdup_n_f32(v790); + float32x2_t *v1574 = &v6[0]; + svfloat32_t v1985 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1417)[0])); + const float32x2_t *v1164 = &v5[v19]; + const float32x2_t *v1173 = &v5[v26]; + const float32x2_t *v1182 = &v5[v35]; + const float32x2_t *v1191 = &v5[v42]; + const float32x2_t *v1210 = &v5[v62]; + const float32x2_t *v1219 = &v5[v69]; + const float32x2_t *v1228 = &v5[v78]; + const float32x2_t *v1237 = &v5[v85]; + const float32x2_t *v1246 = &v5[v97]; + const float32x2_t *v1255 = &v5[v105]; + const float32x2_t *v1264 = &v5[v112]; + const float32x2_t *v1273 = &v5[v121]; + const float32x2_t *v1282 = &v5[v128]; + const float32x2_t *v1291 = &v5[v140]; + const float32x2_t *v1300 = &v5[v148]; + const float32x2_t *v1309 = &v5[v155]; + const float32x2_t *v1318 = &v5[v164]; + const float32x2_t *v1327 = &v5[v171]; + const float32x2_t *v1336 = &v5[v183]; + const float32x2_t *v1345 = &v5[v191]; + const float32x2_t *v1354 = &v5[v198]; + const float32x2_t *v1363 = &v5[v207]; + 
const float32x2_t *v1372 = &v5[v214]; + const float32x2_t *v1381 = &v5[v226]; + const float32x2_t *v1390 = &v5[v234]; + const float32x2_t *v1399 = &v5[v241]; + const float32x2_t *v1408 = &v5[v250]; + const float32x2_t *v1426 = &v5[v269]; + const float32x2_t *v1435 = &v5[v277]; + const float32x2_t *v1444 = &v5[v284]; + const float32x2_t *v1453 = &v5[v293]; + const float32x2_t *v1462 = &v5[v300]; + const float32x2_t *v1471 = &v5[v312]; + const float32x2_t *v1480 = &v5[v320]; + const float32x2_t *v1489 = &v5[v327]; + const float32x2_t *v1498 = &v5[v336]; + const float32x2_t *v1507 = &v5[v343]; + const float32x2_t *v1516 = &v5[v355]; + svfloat32_t v1524 = svdup_n_f32(v409); + svfloat32_t v1525 = svdup_n_f32(v416); + svfloat32_t v1532 = svdup_n_f32(v481); + svfloat32_t v1533 = svdup_n_f32(v488); + svfloat32_t v1540 = svdup_n_f32(v553); + svfloat32_t v1541 = svdup_n_f32(v560); + svfloat32_t v1547 = svdup_n_f32(v624); + svfloat32_t v1550 = svdup_n_f32(v641); + svfloat32_t v1555 = svdup_n_f32(v700); + svfloat32_t v1558 = svdup_n_f32(v717); + svfloat32_t v1563 = svdup_n_f32(v776); + svfloat32_t v1566 = svdup_n_f32(v793); + float32x2_t *v1583 = &v6[v823]; + float32x2_t *v1592 = &v6[v830]; + float32x2_t *v1601 = &v6[v837]; + float32x2_t *v1610 = &v6[v844]; + float32x2_t *v1619 = &v6[v860]; + float32x2_t *v1637 = &v6[v874]; + float32x2_t *v1646 = &v6[v881]; + float32x2_t *v1655 = &v6[v888]; + float32x2_t *v1664 = &v6[v904]; + float32x2_t *v1673 = &v6[v911]; + float32x2_t *v1682 = &v6[v918]; + float32x2_t *v1691 = &v6[v925]; + float32x2_t *v1700 = &v6[v932]; + float32x2_t *v1709 = &v6[v948]; + float32x2_t *v1718 = &v6[v955]; + float32x2_t *v1727 = &v6[v962]; + float32x2_t *v1736 = &v6[v969]; + float32x2_t *v1745 = &v6[v976]; + float32x2_t *v1754 = &v6[v992]; + float32x2_t *v1763 = &v6[v999]; + float32x2_t *v1772 = &v6[v1006]; + float32x2_t *v1781 = &v6[v1013]; + float32x2_t *v1790 = &v6[v1020]; + float32x2_t *v1799 = &v6[v1036]; + float32x2_t *v1808 = &v6[v1043]; + float32x2_t 
*v1817 = &v6[v1050]; + float32x2_t *v1826 = &v6[v1057]; + float32x2_t *v1835 = &v6[v1064]; + float32x2_t *v1844 = &v6[v1080]; + float32x2_t *v1853 = &v6[v1087]; + float32x2_t *v1862 = &v6[v1094]; + float32x2_t *v1871 = &v6[v1101]; + float32x2_t *v1880 = &v6[v1108]; + float32x2_t *v1889 = &v6[v1124]; + float32x2_t *v1898 = &v6[v1131]; + float32x2_t *v1907 = &v6[v1138]; + float32x2_t *v1916 = &v6[v1145]; + float32x2_t *v1925 = &v6[v1152]; + svfloat32_t v1937 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1201)[0])); + svfloat32_t v1929 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1164)[0])); + svfloat32_t v1931 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1173)[0])); + svfloat32_t v1933 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1182)[0])); + svfloat32_t v1935 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1191)[0])); + svfloat32_t v1939 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1210)[0])); + svfloat32_t v1941 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1219)[0])); + svfloat32_t v1943 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1228)[0])); + svfloat32_t v1945 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1237)[0])); + svfloat32_t v1947 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1246)[0])); + svfloat32_t v1949 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1255)[0])); + svfloat32_t v1951 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1264)[0])); + svfloat32_t v1953 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1273)[0])); + svfloat32_t v1955 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1282)[0])); + svfloat32_t v1957 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1291)[0])); + svfloat32_t v1959 = svreinterpret_f32_f64( + svld1_f64(pred_full, 
&((const double *)v1300)[0])); + svfloat32_t v1961 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1309)[0])); + svfloat32_t v1963 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1318)[0])); + svfloat32_t v1965 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1327)[0])); + svfloat32_t v1967 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1336)[0])); + svfloat32_t v1969 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1345)[0])); + svfloat32_t v1971 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1354)[0])); + svfloat32_t v1973 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1363)[0])); + svfloat32_t v1975 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1372)[0])); + svfloat32_t v1977 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1381)[0])); + svfloat32_t v1979 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1390)[0])); + svfloat32_t v1981 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1399)[0])); + svfloat32_t v1983 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1408)[0])); + svfloat32_t v1987 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1426)[0])); + svfloat32_t v1989 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1435)[0])); + svfloat32_t v1991 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1444)[0])); + svfloat32_t v1993 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1453)[0])); + svfloat32_t v1995 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1462)[0])); + svfloat32_t v1997 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1471)[0])); + svfloat32_t v1999 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1480)[0])); + svfloat32_t v2001 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double 
*)v1489)[0])); + svfloat32_t v2003 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1498)[0])); + svfloat32_t v2005 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1507)[0])); + svfloat32_t v2007 = svreinterpret_f32_f64( + svld1_f64(pred_full, &((const double *)v1516)[0])); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1929, v1931); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1929, v1931); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1933, v1935); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1933, v1935); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v1939, v1941); + svfloat32_t v76 = svsub_f32_x(svptrue_b32(), v1939, v1941); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v1943, v1945); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v1943, v1945); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v1949, v1951); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v1949, v1951); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v1953, v1955); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v1953, v1955); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v1959, v1961); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v1959, v1961); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v1963, v1965); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v1963, v1965); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v1969, v1971); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v1969, v1971); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v1973, v1975); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v1973, v1975); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v1979, v1981); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v1979, v1981); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v1983, v1985); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v1983, v1985); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v1989, v1991); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v1989, v1991); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v1993, 
v1995); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v1993, v1995); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v1999, v2001); + svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v1999, v2001); + svfloat32_t v349 = svadd_f32_x(svptrue_b32(), v2003, v2005); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v2003, v2005); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v95 = svadd_f32_x(svptrue_b32(), v76, v92); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v118, v134); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v118, v134); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v119, v135); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v161, v177); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v161, v177); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v162, v178); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v204, v220); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v204, v220); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v205, v221); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v247, v263); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v247, v263); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v248, v264); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v290, v306); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v290, v306); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v291, v307); + svfloat32_t v351 = svadd_f32_x(svptrue_b32(), v333, v349); + svfloat32_t v352 = svsub_f32_x(svptrue_b32(), v333, v349); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v334, v350); + svfloat32_t v578 = svadd_f32_x(svptrue_b32(), v33, v205); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v33, v205); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v119, v291); + svfloat32_t v581 = svsub_f32_x(svptrue_b32(), 
v119, v291); + svfloat32_t v582 = svadd_f32_x(svptrue_b32(), v76, v248); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v76, v248); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v162, v334); + svfloat32_t v585 = svsub_f32_x(svptrue_b32(), v162, v334); + svfloat32_t v730 = svadd_f32_x(svptrue_b32(), v49, v221); + svfloat32_t v731 = svsub_f32_x(svptrue_b32(), v49, v221); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v135, v307); + svfloat32_t v733 = svsub_f32_x(svptrue_b32(), v135, v307); + svfloat32_t v734 = svadd_f32_x(svptrue_b32(), v92, v264); + svfloat32_t v735 = svsub_f32_x(svptrue_b32(), v92, v264); + svfloat32_t v736 = svadd_f32_x(svptrue_b32(), v178, v350); + svfloat32_t v737 = svsub_f32_x(svptrue_b32(), v178, v350); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v50, v1937); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v93, v1947); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v136, v1957); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v179, v1967); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v222, v1977); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v265, v1987); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v308, v1997); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v351, v2007); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v50, v222); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v50, v222); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v136, v308); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v136, v308); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v93, v265); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v93, v265); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v179, v351); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v179, v351); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v51, v223); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v51, v223); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v137, v309); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v137, v309); + svfloat32_t v510 = 
svadd_f32_x(svptrue_b32(), v94, v266); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v94, v266); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v180, v352); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v180, v352); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v578, v580); + svfloat32_t v587 = svsub_f32_x(svptrue_b32(), v578, v580); + svfloat32_t v588 = svadd_f32_x(svptrue_b32(), v582, v584); + svfloat32_t v589 = svsub_f32_x(svptrue_b32(), v582, v584); + svfloat32_t v592 = svadd_f32_x(svptrue_b32(), v583, v585); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v583, v585); + svfloat32_t zero626 = svdup_n_f32(0); + svfloat32_t v626 = svcmla_f32_x(pred_full, zero626, v1547, v579, 90); + svfloat32_t v654 = svadd_f32_x(svptrue_b32(), v52, v224); + svfloat32_t v655 = svsub_f32_x(svptrue_b32(), v52, v224); + svfloat32_t v656 = svadd_f32_x(svptrue_b32(), v138, v310); + svfloat32_t v657 = svsub_f32_x(svptrue_b32(), v138, v310); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v95, v267); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v95, v267); + svfloat32_t v660 = svadd_f32_x(svptrue_b32(), v181, v353); + svfloat32_t v661 = svsub_f32_x(svptrue_b32(), v181, v353); + svfloat32_t v738 = svadd_f32_x(svptrue_b32(), v730, v732); + svfloat32_t v739 = svsub_f32_x(svptrue_b32(), v730, v732); + svfloat32_t v740 = svadd_f32_x(svptrue_b32(), v734, v736); + svfloat32_t v741 = svsub_f32_x(svptrue_b32(), v734, v736); + svfloat32_t v744 = svadd_f32_x(svptrue_b32(), v735, v737); + svfloat32_t v745 = svsub_f32_x(svptrue_b32(), v735, v737); + svfloat32_t zero778 = svdup_n_f32(0); + svfloat32_t v778 = svcmla_f32_x(pred_full, zero778, v1563, v731, 90); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v60, v232); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v60, v232); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v146, v318); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v146, v318); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v103, v275); + svfloat32_t v367 = 
svsub_f32_x(svptrue_b32(), v103, v275); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v189, v361); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v189, v361); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v439, v441); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v439, v441); + svfloat32_t zero483 = svdup_n_f32(0); + svfloat32_t v483 = svcmla_f32_x(pred_full, zero483, v1532, v437, 90); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v506, v508); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v506, v508); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v511, v513); + svfloat32_t zero555 = svdup_n_f32(0); + svfloat32_t v555 = svcmla_f32_x(pred_full, zero555, v1540, v509, 90); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v586, v588); + svfloat32_t v591 = svsub_f32_x(svptrue_b32(), v586, v588); + svfloat32_t zero614 = svdup_n_f32(0); + svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1547, v587, 90); + svfloat32_t v636 = svmul_f32_x(svptrue_b32(), v592, v1549); + svfloat32_t zero643 = svdup_n_f32(0); + svfloat32_t v643 = svcmla_f32_x(pred_full, zero643, v1550, v593, 90); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v654, v656); + svfloat32_t v663 = svsub_f32_x(svptrue_b32(), v654, v656); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v668 = svadd_f32_x(svptrue_b32(), v659, v661); + svfloat32_t v669 = svsub_f32_x(svptrue_b32(), v659, v661); + svfloat32_t zero702 = svdup_n_f32(0); + svfloat32_t v702 = svcmla_f32_x(pred_full, zero702, v1555, v655, 
90); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v738, v740); + svfloat32_t v743 = svsub_f32_x(svptrue_b32(), v738, v740); + svfloat32_t zero766 = svdup_n_f32(0); + svfloat32_t v766 = svcmla_f32_x(pred_full, zero766, v1563, v739, 90); + svfloat32_t v788 = svmul_f32_x(svptrue_b32(), v744, v1565); + svfloat32_t zero795 = svdup_n_f32(0); + svfloat32_t v795 = svcmla_f32_x(pred_full, zero795, v1566, v745, 90); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v362, v364); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v362, v364); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t zero411 = svdup_n_f32(0); + svfloat32_t v411 = svcmla_f32_x(pred_full, zero411, v1524, v365, 90); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t zero471 = svdup_n_f32(0); + svfloat32_t v471 = svcmla_f32_x(pred_full, zero471, v1532, v445, 90); + svfloat32_t zero490 = svdup_n_f32(0); + svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v1533, v448, 90); + svfloat32_t v495 = svmul_f32_x(svptrue_b32(), v449, v1534); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v514, v516); + svfloat32_t zero543 = svdup_n_f32(0); + svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1540, v517, 90); + svfloat32_t zero562 = svdup_n_f32(0); + svfloat32_t v562 = svcmla_f32_x(pred_full, zero562, v1541, v520, 90); + svfloat32_t v567 = svmul_f32_x(svptrue_b32(), v521, v1542); + svfloat32_t zero600 = svdup_n_f32(0); + svfloat32_t v600 = svcmla_f32_x(pred_full, zero600, v1547, v590, 90); + svfloat32_t zero607 = svdup_n_f32(0); + svfloat32_t v607 = svcmla_f32_x(pred_full, zero607, v1547, v591, 90); + svfloat32_t v644 = svmla_f32_x(pred_full, v614, v589, v1548); + 
svfloat32_t v645 = svmls_f32_x(pred_full, v614, v589, v1548); + svfloat32_t v646 = svadd_f32_x(svptrue_b32(), v626, v643); + svfloat32_t v647 = svsub_f32_x(svptrue_b32(), v626, v643); + svfloat32_t v648 = svmla_f32_x(pred_full, v636, v581, v1548); + svfloat32_t v649 = svnmls_f32_x(pred_full, v636, v581, v1548); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v662, v664); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v662, v664); + svfloat32_t zero690 = svdup_n_f32(0); + svfloat32_t v690 = svcmla_f32_x(pred_full, zero690, v1555, v663, 90); + svfloat32_t v712 = svmul_f32_x(svptrue_b32(), v668, v1557); + svfloat32_t zero719 = svdup_n_f32(0); + svfloat32_t v719 = svcmla_f32_x(pred_full, zero719, v1558, v669, 90); + svfloat32_t v796 = svmla_f32_x(pred_full, v766, v741, v1564); + svfloat32_t v797 = svmls_f32_x(pred_full, v766, v741, v1564); + svfloat32_t v798 = svadd_f32_x(svptrue_b32(), v778, v795); + svfloat32_t v799 = svsub_f32_x(svptrue_b32(), v778, v795); + svfloat32_t v800 = svmla_f32_x(pred_full, v788, v733, v1564); + svfloat32_t v801 = svnmls_f32_x(pred_full, v788, v733, v1564); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t zero399 = svdup_n_f32(0); + svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v1524, v373, 90); + svfloat32_t zero418 = svdup_n_f32(0); + svfloat32_t v418 = svcmla_f32_x(pred_full, zero418, v1525, v376, 90); + svfloat32_t v496 = svmla_f32_x(pred_full, v471, v443, v1531); + svfloat32_t v497 = svnmls_f32_x(pred_full, v471, v443, v1531); + svfloat32_t v498 = svmla_f32_x(pred_full, v495, v435, v1531); + svfloat32_t v499 = svnmls_f32_x(pred_full, v495, v435, v1531); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v483, v490); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v483, v490); + svfloat32_t v568 = svmla_f32_x(pred_full, v543, v515, v1539); + svfloat32_t v569 = svnmls_f32_x(pred_full, v543, v515, v1539); + svfloat32_t v570 = svmla_f32_x(pred_full, 
v567, v507, v1539); + svfloat32_t v571 = svnmls_f32_x(pred_full, v567, v507, v1539); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v555, v562); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), v555, v562); + svfloat32_t v650 = svadd_f32_x(svptrue_b32(), v646, v648); + svfloat32_t v651 = svsub_f32_x(svptrue_b32(), v646, v648); + svfloat32_t v652 = svadd_f32_x(svptrue_b32(), v647, v649); + svfloat32_t v653 = svsub_f32_x(svptrue_b32(), v647, v649); + svfloat32_t zero676 = svdup_n_f32(0); + svfloat32_t v676 = svcmla_f32_x(pred_full, zero676, v1555, v666, 90); + svfloat32_t zero683 = svdup_n_f32(0); + svfloat32_t v683 = svcmla_f32_x(pred_full, zero683, v1555, v667, 90); + svfloat32_t v720 = svmla_f32_x(pred_full, v690, v665, v1556); + svfloat32_t v721 = svmls_f32_x(pred_full, v690, v665, v1556); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v702, v719); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v702, v719); + svfloat32_t v724 = svmla_f32_x(pred_full, v712, v657, v1556); + svfloat32_t v725 = svnmls_f32_x(pred_full, v712, v657, v1556); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v798, v800); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v798, v800); + svfloat32_t v804 = svadd_f32_x(svptrue_b32(), v799, v801); + svfloat32_t v805 = svsub_f32_x(svptrue_b32(), v799, v801); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v371, v399); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v371, v399); + svfloat32_t v426 = svmla_f32_x(pred_full, v363, v377, v1526); + svfloat32_t v427 = svmls_f32_x(pred_full, v363, v377, v1526); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v411, v418); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v411, v418); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v498, v500); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v498, v500); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v570, v572); + svfloat32_t v575 = 
svsub_f32_x(svptrue_b32(), v570, v572); + svfloat32_t v576 = svadd_f32_x(svptrue_b32(), v571, v573); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v571, v573); + svfloat32_t v726 = svadd_f32_x(svptrue_b32(), v722, v724); + svfloat32_t v727 = svsub_f32_x(svptrue_b32(), v722, v724); + svfloat32_t v728 = svadd_f32_x(svptrue_b32(), v723, v725); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v723, v725); + svfloat32_t v806 = svmla_f32_x(pred_full, v374, v446, v1531); + svfloat32_t v809 = svsub_f32_x(svptrue_b32(), v600, v676); + svfloat32_t v810 = svcmla_f32_x(pred_full, v676, v1563, v742, 90); + svfloat32_t v897 = svsub_f32_x(svptrue_b32(), v645, v721); + svfloat32_t v898 = svadd_f32_x(svptrue_b32(), v721, v797); + svfloat32_t v982 = svmla_f32_x(pred_full, v375, v447, v1531); + svfloat32_t v985 = svsub_f32_x(svptrue_b32(), v607, v683); + svfloat32_t v986 = svcmla_f32_x(pred_full, v683, v1563, v743, 90); + svfloat32_t v1073 = svsub_f32_x(svptrue_b32(), v644, v720); + svfloat32_t v1074 = svadd_f32_x(svptrue_b32(), v720, v796); + svst1_f64(pred_full, (double *)(v1574), svreinterpret_f64_f32(v374)); + svst1_f64(pred_full, (double *)(v1754), svreinterpret_f64_f32(v375)); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v807 = svmla_f32_x(pred_full, v806, v518, v1539); + svfloat32_t v808 = svmls_f32_x(pred_full, v806, v518, v1539); + svfloat32_t v853 = svsub_f32_x(svptrue_b32(), v651, v727); + svfloat32_t v854 = svadd_f32_x(svptrue_b32(), v727, v803); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v425, v497); + svfloat32_t v941 = svsub_f32_x(svptrue_b32(), v652, v728); + svfloat32_t v942 = svadd_f32_x(svptrue_b32(), v728, v804); + svfloat32_t v983 = svmla_f32_x(pred_full, v982, v519, v1539); + svfloat32_t v984 = svmls_f32_x(pred_full, v982, v519, v1539); + 
svfloat32_t v1029 = svsub_f32_x(svptrue_b32(), v653, v729); + svfloat32_t v1030 = svadd_f32_x(svptrue_b32(), v729, v805); + svfloat32_t v1070 = svadd_f32_x(svptrue_b32(), v424, v496); + svfloat32_t v1117 = svsub_f32_x(svptrue_b32(), v650, v726); + svfloat32_t v1118 = svadd_f32_x(svptrue_b32(), v726, v802); + svst1_f64(pred_full, (double *)(v1664), svreinterpret_f64_f32(v425)); + svst1_f64(pred_full, (double *)(v1844), svreinterpret_f64_f32(v424)); + svfloat32_t v811 = svadd_f32_x(svptrue_b32(), v807, v809); + svfloat32_t v812 = svsub_f32_x(svptrue_b32(), v807, v809); + svfloat32_t v813 = svadd_f32_x(svptrue_b32(), v808, v810); + svfloat32_t v814 = svsub_f32_x(svptrue_b32(), v808, v810); + svfloat32_t v850 = svadd_f32_x(svptrue_b32(), v431, v503); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v894, v569); + svfloat32_t v896 = svsub_f32_x(svptrue_b32(), v894, v569); + svfloat32_t v938 = svadd_f32_x(svptrue_b32(), v432, v504); + svfloat32_t v987 = svadd_f32_x(svptrue_b32(), v983, v985); + svfloat32_t v988 = svsub_f32_x(svptrue_b32(), v983, v985); + svfloat32_t v989 = svadd_f32_x(svptrue_b32(), v984, v986); + svfloat32_t v990 = svsub_f32_x(svptrue_b32(), v984, v986); + svfloat32_t v1026 = svadd_f32_x(svptrue_b32(), v433, v505); + svfloat32_t v1071 = svadd_f32_x(svptrue_b32(), v1070, v568); + svfloat32_t v1072 = svsub_f32_x(svptrue_b32(), v1070, v568); + svfloat32_t v1114 = svadd_f32_x(svptrue_b32(), v430, v502); + svst1_f64(pred_full, (double *)(v1619), svreinterpret_f64_f32(v431)); + svst1_f64(pred_full, (double *)(v1709), svreinterpret_f64_f32(v432)); + svst1_f64(pred_full, (double *)(v1799), svreinterpret_f64_f32(v433)); + svst1_f64(pred_full, (double *)(v1889), svreinterpret_f64_f32(v430)); + svfloat32_t v851 = svadd_f32_x(svptrue_b32(), v850, v575); + svfloat32_t v852 = svsub_f32_x(svptrue_b32(), v850, v575); + svfloat32_t v899 = svadd_f32_x(svptrue_b32(), v895, v897); + svfloat32_t v900 = svsub_f32_x(svptrue_b32(), v895, v897); + svfloat32_t v901 = 
svadd_f32_x(svptrue_b32(), v896, v898); + svfloat32_t v902 = svsub_f32_x(svptrue_b32(), v896, v898); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v938, v576); + svfloat32_t v940 = svsub_f32_x(svptrue_b32(), v938, v576); + svfloat32_t v1027 = svadd_f32_x(svptrue_b32(), v1026, v577); + svfloat32_t v1028 = svsub_f32_x(svptrue_b32(), v1026, v577); + svfloat32_t v1075 = svadd_f32_x(svptrue_b32(), v1071, v1073); + svfloat32_t v1076 = svsub_f32_x(svptrue_b32(), v1071, v1073); + svfloat32_t v1077 = svadd_f32_x(svptrue_b32(), v1072, v1074); + svfloat32_t v1078 = svsub_f32_x(svptrue_b32(), v1072, v1074); + svfloat32_t v1115 = svadd_f32_x(svptrue_b32(), v1114, v574); + svfloat32_t v1116 = svsub_f32_x(svptrue_b32(), v1114, v574); + svst1_f64(pred_full, (double *)(v1583), svreinterpret_f64_f32(v812)); + svst1_f64(pred_full, (double *)(v1592), svreinterpret_f64_f32(v814)); + svst1_f64(pred_full, (double *)(v1601), svreinterpret_f64_f32(v813)); + svst1_f64(pred_full, (double *)(v1610), svreinterpret_f64_f32(v811)); + svst1_f64(pred_full, (double *)(v1763), svreinterpret_f64_f32(v988)); + svst1_f64(pred_full, (double *)(v1772), svreinterpret_f64_f32(v990)); + svst1_f64(pred_full, (double *)(v1781), svreinterpret_f64_f32(v989)); + svst1_f64(pred_full, (double *)(v1790), svreinterpret_f64_f32(v987)); + svfloat32_t v855 = svadd_f32_x(svptrue_b32(), v851, v853); + svfloat32_t v856 = svsub_f32_x(svptrue_b32(), v851, v853); + svfloat32_t v857 = svadd_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v858 = svsub_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v943 = svadd_f32_x(svptrue_b32(), v939, v941); + svfloat32_t v944 = svsub_f32_x(svptrue_b32(), v939, v941); + svfloat32_t v945 = svadd_f32_x(svptrue_b32(), v940, v942); + svfloat32_t v946 = svsub_f32_x(svptrue_b32(), v940, v942); + svfloat32_t v1031 = svadd_f32_x(svptrue_b32(), v1027, v1029); + svfloat32_t v1032 = svsub_f32_x(svptrue_b32(), v1027, v1029); + svfloat32_t v1033 = svadd_f32_x(svptrue_b32(), v1028, v1030); + 
svfloat32_t v1034 = svsub_f32_x(svptrue_b32(), v1028, v1030); + svfloat32_t v1119 = svadd_f32_x(svptrue_b32(), v1115, v1117); + svfloat32_t v1120 = svsub_f32_x(svptrue_b32(), v1115, v1117); + svfloat32_t v1121 = svadd_f32_x(svptrue_b32(), v1116, v1118); + svfloat32_t v1122 = svsub_f32_x(svptrue_b32(), v1116, v1118); + svst1_f64(pred_full, (double *)(v1673), svreinterpret_f64_f32(v900)); + svst1_f64(pred_full, (double *)(v1682), svreinterpret_f64_f32(v902)); + svst1_f64(pred_full, (double *)(v1691), svreinterpret_f64_f32(v901)); + svst1_f64(pred_full, (double *)(v1700), svreinterpret_f64_f32(v899)); + svst1_f64(pred_full, (double *)(v1853), svreinterpret_f64_f32(v1076)); + svst1_f64(pred_full, (double *)(v1862), svreinterpret_f64_f32(v1078)); + svst1_f64(pred_full, (double *)(v1871), svreinterpret_f64_f32(v1077)); + svst1_f64(pred_full, (double *)(v1880), svreinterpret_f64_f32(v1075)); + svst1_f64(pred_full, (double *)(v1628), svreinterpret_f64_f32(v856)); + svst1_f64(pred_full, (double *)(v1637), svreinterpret_f64_f32(v858)); + svst1_f64(pred_full, (double *)(v1646), svreinterpret_f64_f32(v857)); + svst1_f64(pred_full, (double *)(v1655), svreinterpret_f64_f32(v855)); + svst1_f64(pred_full, (double *)(v1718), svreinterpret_f64_f32(v944)); + svst1_f64(pred_full, (double *)(v1727), svreinterpret_f64_f32(v946)); + svst1_f64(pred_full, (double *)(v1736), svreinterpret_f64_f32(v945)); + svst1_f64(pred_full, (double *)(v1745), svreinterpret_f64_f32(v943)); + svst1_f64(pred_full, (double *)(v1808), svreinterpret_f64_f32(v1032)); + svst1_f64(pred_full, (double *)(v1817), svreinterpret_f64_f32(v1034)); + svst1_f64(pred_full, (double *)(v1826), svreinterpret_f64_f32(v1033)); + svst1_f64(pred_full, (double *)(v1835), svreinterpret_f64_f32(v1031)); + svst1_f64(pred_full, (double *)(v1898), svreinterpret_f64_f32(v1120)); + svst1_f64(pred_full, (double *)(v1907), svreinterpret_f64_f32(v1122)); + svst1_f64(pred_full, (double *)(v1916), svreinterpret_f64_f32(v1121)); + 
svst1_f64(pred_full, (double *)(v1925), svreinterpret_f64_f32(v1119)); + v5 += v11; + v6 += v12; + } +} +#endif diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h index dd75a5537ab29d0a024f34b1c9da641ce20afd96..dc3161101aa75c1eceb4bd1327d91759aa86c31f 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -38,7 +40,11 @@ cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu21; cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu22; cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu24; cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu25; +cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu28; +cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu30; cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu32; +cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu36; +cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu40; #ifdef __cplusplus } // extern "C" diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uun.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uun.c new file mode 100644 index 0000000000000000000000000000000000000000..e2287d042f60b599e646dc51ea6fec84e10c8500 --- /dev/null +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uun.c @@ -0,0 +1,10591 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "fft_cf32_cf32_cf32_ac_n_uun.h" + +#include +#ifdef ARMRAL_ARCH_SVE +#include +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun2(const armral_cmplx_f32_t 
#ifdef ARMRAL_ARCH_SVE
/*
  SVE variant of the 2-point kernel.

  Reads the two complex samples x[0] and x[istride] and writes
    y[0]       = x[0] + x[istride]
    y[ostride] = x[0] - x[istride]

  x        input samples (read only)
  y        output samples
  istride  distance, in complex elements, between the two inputs
  ostride  distance, in complex elements, between the two outputs
  howmany  not referenced by this kernel
  dir      not referenced by this kernel
*/
void armral_fft_cf32_cf32_cf32_ac_n_uun2(const armral_cmplx_f32_t *restrict x,
                                         armral_cmplx_f32_t *restrict y,
                                         int istride, int ostride, int howmany,
                                         float dir) {
  // Strides are widened to 64 bits before being used as element offsets.
  int64_t in_step = istride;
  int64_t out_step = ostride;
  const float32x2_t *src = (const float32x2_t *)x;
  float32x2_t *dst = (float32x2_t *)y;

  // Predicate with the first two 32-bit lanes active. It is also passed to
  // the 64-bit loads/stores below.
  // NOTE(review): presumably this makes each access move exactly one
  // (re, im) pair - confirm against the ACLE predicate rules.
  svbool_t pair_pred = svptrue_pat_b32(SV_VL2);

  const double *src_hi = (const double *)&src[in_step];
  const double *src_lo = (const double *)&src[0];
  double *dst_lo = (double *)&dst[0];
  double *dst_hi = (double *)&dst[out_step];

  svfloat32_t in_hi = svreinterpret_f32_f64(svld1_f64(pair_pred, src_hi));
  svfloat32_t in_lo = svreinterpret_f32_f64(svld1_f64(pair_pred, src_lo));

  svfloat32_t total = svadd_f32_x(svptrue_b32(), in_lo, in_hi);
  svfloat32_t delta = svsub_f32_x(svptrue_b32(), in_lo, in_hi);

  svst1_f64(pair_pred, dst_lo, svreinterpret_f64_f32(total));
  svst1_f64(pair_pred, dst_hi, svreinterpret_f64_f32(delta));
}
#endif
#ifdef ARMRAL_ARCH_SVE
/*
  SVE variant of the 3-point kernel.

  Reads x[0], x[istride], x[2 * istride] and writes y[0], y[ostride],
  y[2 * ostride].

  x        input samples (read only)
  y        output samples
  istride  distance, in complex elements, between consecutive inputs
  ostride  distance, in complex elements, between consecutive outputs
  howmany  not referenced by this kernel
  dir      multiplies the constant used for the 90-degree complex
           multiply-accumulate; presumably +1/-1 to select the transform
           direction - confirm with the caller
*/
void armral_fft_cf32_cf32_cf32_ac_n_uun3(const armral_cmplx_f32_t *restrict x,
                                         armral_cmplx_f32_t *restrict y,
                                         int istride, int ostride, int howmany,
                                         float dir) {
  // Strides are widened to 64 bits so the "* 2" offsets are computed in
  // 64-bit arithmetic.
  int64_t in_step = istride;
  int64_t out_step = ostride;
  int64_t in_step2 = in_step * 2;
  int64_t out_step2 = out_step * 2;
  const float32x2_t *src = (const float32x2_t *)x;
  float32x2_t *dst = (float32x2_t *)y;

  // First two 32-bit lanes active; reused for the 64-bit loads and stores.
  svbool_t pair_pred = svptrue_pat_b32(SV_VL2);

  // Scalar constants, then their broadcast forms.
  float sum_scale = -1.4999999999999998e+00F;
  float rot_base = -8.6602540378443871e-01F;
  float rot_scale = dir * rot_base;
  svfloat32_t sum_scale_v = svdup_n_f32(sum_scale);
  svfloat32_t rot_scale_v = svdup_n_f32(rot_scale);

  // Gather the three inputs (all loads happen before any store).
  svfloat32_t in1 = svreinterpret_f32_f64(
      svld1_f64(pair_pred, (const double *)&src[in_step]));
  svfloat32_t in0 =
      svreinterpret_f32_f64(svld1_f64(pair_pred, (const double *)&src[0]));
  svfloat32_t in2 = svreinterpret_f32_f64(
      svld1_f64(pair_pred, (const double *)&src[in_step2]));

  svfloat32_t pair_sum = svadd_f32_x(svptrue_b32(), in1, in2);
  svfloat32_t pair_dif = svsub_f32_x(svptrue_b32(), in1, in2);

  // Output 0 is the plain sum of all three inputs.
  svfloat32_t out0 = svadd_f32_x(svptrue_b32(), pair_sum, in0);

  // Rotated term: complex multiply-accumulate (rotation 90) onto zero.
  svfloat32_t zero_acc = svdup_n_f32(0);
  svfloat32_t rotated =
      svcmla_f32_x(pair_pred, zero_acc, rot_scale_v, pair_dif, 90);

  // Shared term for outputs 1 and 2: out0 + pair_sum * sum_scale.
  svfloat32_t centre = svmla_f32_x(pair_pred, out0, pair_sum, sum_scale_v);

  svfloat32_t centre_plus = svadd_f32_x(svptrue_b32(), centre, rotated);
  svfloat32_t centre_minus = svsub_f32_x(svptrue_b32(), centre, rotated);

  // Store order matches the original: y[0], y[ostride], y[2 * ostride].
  svst1_f64(pair_pred, (double *)&dst[0], svreinterpret_f64_f32(out0));
  svst1_f64(pair_pred, (double *)&dst[out_step],
            svreinterpret_f64_f32(centre_minus));
  svst1_f64(pair_pred, (double *)&dst[out_step2],
            svreinterpret_f64_f32(centre_plus));
}
#endif
v2 * 2; + int64_t v94 = v2 * 3; + const float32x2_t *v107 = &v5[0]; + float32x2_t *v148 = &v6[0]; + svfloat32_t v183 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v125)[0])); + const float32x2_t *v116 = &v5[v22]; + const float32x2_t *v134 = &v5[v38]; + svfloat32_t v140 = svdup_n_f32(v67); + float32x2_t *v166 = &v6[v87]; + float32x2_t *v175 = &v6[v94]; + svfloat32_t v179 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v107)[0])); + svfloat32_t v181 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v116)[0])); + svfloat32_t v185 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v134)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v179, v181); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v179, v181); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v46 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v47 = svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t zero69 = svdup_n_f32(0); + svfloat32_t v69 = svcmla_f32_x(pred_full, zero69, v140, v45, 90); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v29, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v29, v69); + svst1_f64(pred_full, (double *)(v148), svreinterpret_f64_f32(v46)); + svst1_f64(pred_full, (double *)(v166), svreinterpret_f64_f32(v47)); + svst1_f64(pred_full, (double *)(v157), svreinterpret_f64_f32(v71)); + svst1_f64(pred_full, (double *)(v175), svreinterpret_f64_f32(v70)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun5(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v47 = -1.2500000000000000e+00F; + float v51 = 5.5901699437494745e-01F; + float v54 = 1.5388417685876268e+00F; + float v55 = -1.5388417685876268e+00F; + float 
v61 = 5.8778525229247325e-01F; + float v62 = -5.8778525229247325e-01F; + float v68 = 3.6327126400268028e-01F; + float v69 = -3.6327126400268028e-01F; + float32x2_t v13 = v5[istride]; + float32x2_t v40 = v5[0]; + float32x2_t v48 = (float32x2_t){v47, v47}; + float32x2_t v52 = (float32x2_t){v51, v51}; + float32x2_t v56 = (float32x2_t){v54, v55}; + float32x2_t v63 = (float32x2_t){v61, v62}; + float32x2_t v70 = (float32x2_t){v68, v69}; + float32x2_t v71 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 4]; + float32x2_t v25 = v5[istride * 3]; + float32x2_t v30 = v5[istride * 2]; + float32x2_t v58 = vmul_f32(v71, v56); + float32x2_t v65 = vmul_f32(v71, v63); + float32x2_t v72 = vmul_f32(v71, v70); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v33 = vadd_f32(v19, v31); + float32x2_t v34 = vsub_f32(v19, v31); + float32x2_t v35 = vadd_f32(v20, v32); + float32x2_t v59 = vrev64_f32(v20); + float32x2_t v73 = vrev64_f32(v32); + float32x2_t v41 = vadd_f32(v33, v40); + float32x2_t v49 = vmul_f32(v33, v48); + float32x2_t v53 = vmul_f32(v34, v52); + float32x2_t v60 = vmul_f32(v59, v58); + float32x2_t v66 = vrev64_f32(v35); + float32x2_t v74 = vmul_f32(v73, v72); + float32x2_t v67 = vmul_f32(v66, v65); + float32x2_t v75 = vadd_f32(v41, v49); + v6[0] = v41; + float32x2_t v76 = vadd_f32(v75, v53); + float32x2_t v77 = vsub_f32(v75, v53); + float32x2_t v78 = vsub_f32(v60, v67); + float32x2_t v79 = vadd_f32(v67, v74); + float32x2_t v80 = vadd_f32(v76, v78); + float32x2_t v81 = vsub_f32(v76, v78); + float32x2_t v82 = vadd_f32(v77, v79); + float32x2_t v83 = vsub_f32(v77, v79); + v6[ostride] = v81; + v6[ostride * 2] = v83; + v6[ostride * 3] = v82; + v6[ostride * 4] = v80; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun5(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int 
howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v63 = -1.2500000000000000e+00F; + float v68 = 5.5901699437494745e-01F; + float v73 = -1.5388417685876268e+00F; + float v80 = -5.8778525229247325e-01F; + float v87 = -3.6327126400268028e-01F; + const float32x2_t *v143 = &v5[v0]; + float32x2_t *v205 = &v6[v2]; + int64_t v22 = v0 * 4; + int64_t v31 = v0 * 3; + int64_t v38 = v0 * 2; + float v76 = v4 * v73; + float v83 = v4 * v80; + float v90 = v4 * v87; + int64_t v117 = v2 * 2; + int64_t v124 = v2 * 3; + int64_t v131 = v2 * 4; + const float32x2_t *v180 = &v5[0]; + svfloat32_t v184 = svdup_n_f32(v63); + svfloat32_t v185 = svdup_n_f32(v68); + float32x2_t *v196 = &v6[0]; + svfloat32_t v236 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v143)[0])); + const float32x2_t *v152 = &v5[v22]; + const float32x2_t *v161 = &v5[v31]; + const float32x2_t *v170 = &v5[v38]; + svfloat32_t v186 = svdup_n_f32(v76); + svfloat32_t v187 = svdup_n_f32(v83); + svfloat32_t v188 = svdup_n_f32(v90); + float32x2_t *v214 = &v6[v117]; + float32x2_t *v223 = &v6[v124]; + float32x2_t *v232 = &v6[v131]; + svfloat32_t v244 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v180)[0])); + svfloat32_t v238 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v152)[0])); + svfloat32_t v240 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v161)[0])); + svfloat32_t v242 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v170)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v240, v242); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v240, v242); + svfloat32_t v46 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v47 = 
svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v29, v45); + svfloat32_t zero78 = svdup_n_f32(0); + svfloat32_t v78 = svcmla_f32_x(pred_full, zero78, v186, v29, 90); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v46, v244); + svfloat32_t zero85 = svdup_n_f32(0); + svfloat32_t v85 = svcmla_f32_x(pred_full, zero85, v187, v48, 90); + svfloat32_t v93 = svmla_f32_x(pred_full, v56, v46, v184); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v78, v85); + svfloat32_t v97 = svcmla_f32_x(pred_full, v85, v188, v45, 90); + svst1_f64(pred_full, (double *)(v196), svreinterpret_f64_f32(v56)); + svfloat32_t v94 = svmla_f32_x(pred_full, v93, v47, v185); + svfloat32_t v95 = svmls_f32_x(pred_full, v93, v47, v185); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v94, v96); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v94, v96); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v95, v97); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v95, v97); + svst1_f64(pred_full, (double *)(v205), svreinterpret_f64_f32(v99)); + svst1_f64(pred_full, (double *)(v214), svreinterpret_f64_f32(v101)); + svst1_f64(pred_full, (double *)(v223), svreinterpret_f64_f32(v100)); + svst1_f64(pred_full, (double *)(v232), svreinterpret_f64_f32(v98)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun6(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v74 = -1.4999999999999998e+00F; + float v77 = 8.6602540378443871e-01F; + float v78 = -8.6602540378443871e-01F; + float32x2_t v13 = v5[0]; + float32x2_t v42 = v5[istride]; + float32x2_t v75 = (float32x2_t){v74, v74}; + float32x2_t v79 = (float32x2_t){v77, v78}; + float32x2_t v80 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 3]; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v30 = v5[istride * 5]; + 
float32x2_t v37 = v5[istride * 4]; + float32x2_t v81 = vmul_f32(v80, v79); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v45 = vadd_f32(v31, v43); + float32x2_t v46 = vsub_f32(v31, v43); + float32x2_t v66 = vadd_f32(v32, v44); + float32x2_t v67 = vsub_f32(v32, v44); + float32x2_t v47 = vadd_f32(v45, v19); + float32x2_t v55 = vmul_f32(v45, v75); + float32x2_t v61 = vrev64_f32(v46); + float32x2_t v68 = vadd_f32(v66, v20); + float32x2_t v76 = vmul_f32(v66, v75); + float32x2_t v82 = vrev64_f32(v67); + float32x2_t v62 = vmul_f32(v61, v81); + float32x2_t v63 = vadd_f32(v47, v55); + float32x2_t v83 = vmul_f32(v82, v81); + float32x2_t v84 = vadd_f32(v68, v76); + v6[0] = v47; + v6[ostride * 3] = v68; + float32x2_t v64 = vadd_f32(v63, v62); + float32x2_t v65 = vsub_f32(v63, v62); + float32x2_t v85 = vadd_f32(v84, v83); + float32x2_t v86 = vsub_f32(v84, v83); + v6[ostride * 4] = v65; + v6[ostride] = v86; + v6[ostride * 2] = v64; + v6[ostride * 5] = v85; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun6(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v94 = -1.4999999999999998e+00F; + float v99 = -8.6602540378443871e-01F; + const float32x2_t *v202 = &v5[v0]; + float32x2_t *v245 = &v6[v2]; + int64_t v22 = v0 * 3; + int64_t v31 = v0 * 2; + int64_t v38 = v0 * 5; + int64_t v47 = v0 * 4; + float v102 = v4 * v99; + int64_t v116 = v2 * 3; + int64_t v123 = v2 * 4; + int64_t v137 = v2 * 2; + int64_t v144 = v2 * 5; + const float32x2_t *v157 = &v5[0]; + svfloat32_t v209 = 
svdup_n_f32(v94); + float32x2_t *v218 = &v6[0]; + svfloat32_t v277 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v202)[0])); + const float32x2_t *v166 = &v5[v22]; + const float32x2_t *v175 = &v5[v31]; + const float32x2_t *v184 = &v5[v38]; + const float32x2_t *v193 = &v5[v47]; + svfloat32_t v210 = svdup_n_f32(v102); + float32x2_t *v227 = &v6[v116]; + float32x2_t *v236 = &v6[v123]; + float32x2_t *v254 = &v6[v137]; + float32x2_t *v263 = &v6[v144]; + svfloat32_t v267 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v157)[0])); + svfloat32_t v269 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v166)[0])); + svfloat32_t v271 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v175)[0])); + svfloat32_t v273 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v184)[0])); + svfloat32_t v275 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v193)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v267, v269); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v267, v269); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v44, v60); + svfloat32_t v63 = svsub_f32_x(svptrue_b32(), v44, v60); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v45, v61); + svfloat32_t v86 = svsub_f32_x(svptrue_b32(), v45, v61); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v62, v28); + svfloat32_t zero81 = svdup_n_f32(0); + svfloat32_t v81 = svcmla_f32_x(pred_full, zero81, v210, v63, 90); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v85, v29); + svfloat32_t zero104 = svdup_n_f32(0); + svfloat32_t v104 = svcmla_f32_x(pred_full, zero104, v210, v86, 90); + svfloat32_t v82 = svmla_f32_x(pred_full, v64, v62, v209); + svfloat32_t v105 = svmla_f32_x(pred_full, v87, v85, v209); + 
svst1_f64(pred_full, (double *)(v218), svreinterpret_f64_f32(v64)); + svst1_f64(pred_full, (double *)(v227), svreinterpret_f64_f32(v87)); + svfloat32_t v83 = svadd_f32_x(svptrue_b32(), v82, v81); + svfloat32_t v84 = svsub_f32_x(svptrue_b32(), v82, v81); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v105, v104); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v105, v104); + svst1_f64(pred_full, (double *)(v236), svreinterpret_f64_f32(v84)); + svst1_f64(pred_full, (double *)(v245), svreinterpret_f64_f32(v107)); + svst1_f64(pred_full, (double *)(v254), svreinterpret_f64_f32(v83)); + svst1_f64(pred_full, (double *)(v263), svreinterpret_f64_f32(v106)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun7(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v66 = -1.1666666666666665e+00F; + float v70 = 7.9015646852540022e-01F; + float v74 = 5.5854267289647742e-02F; + float v78 = 7.3430220123575241e-01F; + float v81 = 4.4095855184409838e-01F; + float v82 = -4.4095855184409838e-01F; + float v88 = 3.4087293062393137e-01F; + float v89 = -3.4087293062393137e-01F; + float v95 = -5.3396936033772524e-01F; + float v96 = 5.3396936033772524e-01F; + float v102 = 8.7484229096165667e-01F; + float v103 = -8.7484229096165667e-01F; + float32x2_t v13 = v5[istride]; + float32x2_t v51 = v5[0]; + float32x2_t v67 = (float32x2_t){v66, v66}; + float32x2_t v71 = (float32x2_t){v70, v70}; + float32x2_t v75 = (float32x2_t){v74, v74}; + float32x2_t v79 = (float32x2_t){v78, v78}; + float32x2_t v83 = (float32x2_t){v81, v82}; + float32x2_t v90 = (float32x2_t){v88, v89}; + float32x2_t v97 = (float32x2_t){v95, v96}; + float32x2_t v104 = (float32x2_t){v102, v103}; + float32x2_t v105 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 6]; + float32x2_t v25 = v5[istride * 4]; + 
float32x2_t v30 = v5[istride * 3]; + float32x2_t v37 = v5[istride * 2]; + float32x2_t v42 = v5[istride * 5]; + float32x2_t v85 = vmul_f32(v105, v83); + float32x2_t v92 = vmul_f32(v105, v90); + float32x2_t v99 = vmul_f32(v105, v97); + float32x2_t v106 = vmul_f32(v105, v104); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v45 = vadd_f32(v19, v31); + float32x2_t v53 = vsub_f32(v19, v31); + float32x2_t v54 = vsub_f32(v31, v43); + float32x2_t v55 = vsub_f32(v43, v19); + float32x2_t v56 = vadd_f32(v20, v32); + float32x2_t v58 = vsub_f32(v20, v32); + float32x2_t v59 = vsub_f32(v32, v44); + float32x2_t v60 = vsub_f32(v44, v20); + float32x2_t v46 = vadd_f32(v45, v43); + float32x2_t v57 = vadd_f32(v56, v44); + float32x2_t v72 = vmul_f32(v53, v71); + float32x2_t v76 = vmul_f32(v54, v75); + float32x2_t v80 = vmul_f32(v55, v79); + float32x2_t v93 = vrev64_f32(v58); + float32x2_t v100 = vrev64_f32(v59); + float32x2_t v107 = vrev64_f32(v60); + float32x2_t v52 = vadd_f32(v46, v51); + float32x2_t v68 = vmul_f32(v46, v67); + float32x2_t v86 = vrev64_f32(v57); + float32x2_t v94 = vmul_f32(v93, v92); + float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v87 = vmul_f32(v86, v85); + float32x2_t v109 = vadd_f32(v52, v68); + v6[0] = v52; + float32x2_t v110 = vadd_f32(v109, v72); + float32x2_t v112 = vsub_f32(v109, v72); + float32x2_t v114 = vsub_f32(v109, v76); + float32x2_t v116 = vadd_f32(v87, v94); + float32x2_t v118 = vsub_f32(v87, v94); + float32x2_t v120 = vsub_f32(v87, v101); + float32x2_t v111 = vadd_f32(v110, v76); + float32x2_t v113 = vsub_f32(v112, v80); + float32x2_t v115 = vadd_f32(v114, v80); + float32x2_t v117 = vadd_f32(v116, v101); + float32x2_t v119 = vsub_f32(v118, v108); + float32x2_t v121 = vadd_f32(v120, v108); + 
float32x2_t v122 = vadd_f32(v111, v117); + float32x2_t v123 = vsub_f32(v111, v117); + float32x2_t v124 = vadd_f32(v113, v119); + float32x2_t v125 = vsub_f32(v113, v119); + float32x2_t v126 = vadd_f32(v115, v121); + float32x2_t v127 = vsub_f32(v115, v121); + v6[ostride] = v123; + v6[ostride * 2] = v125; + v6[ostride * 3] = v126; + v6[ostride * 4] = v127; + v6[ostride * 5] = v124; + v6[ostride * 6] = v122; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun7(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v86 = -1.1666666666666665e+00F; + float v91 = 7.9015646852540022e-01F; + float v96 = 5.5854267289647742e-02F; + float v101 = 7.3430220123575241e-01F; + float v106 = -4.4095855184409838e-01F; + float v113 = -3.4087293062393137e-01F; + float v120 = 5.3396936033772524e-01F; + float v127 = -8.7484229096165667e-01F; + const float32x2_t *v207 = &v5[v0]; + float32x2_t *v290 = &v6[v2]; + int64_t v22 = v0 * 6; + int64_t v31 = v0 * 4; + int64_t v38 = v0 * 3; + int64_t v47 = v0 * 2; + int64_t v54 = v0 * 5; + float v109 = v4 * v106; + float v116 = v4 * v113; + float v123 = v4 * v120; + float v130 = v4 * v127; + int64_t v167 = v2 * 2; + int64_t v174 = v2 * 3; + int64_t v181 = v2 * 4; + int64_t v188 = v2 * 5; + int64_t v195 = v2 * 6; + const float32x2_t *v262 = &v5[0]; + svfloat32_t v266 = svdup_n_f32(v86); + svfloat32_t v267 = svdup_n_f32(v91); + svfloat32_t v268 = svdup_n_f32(v96); + svfloat32_t v269 = svdup_n_f32(v101); + float32x2_t *v281 = &v6[0]; + svfloat32_t v339 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v207)[0])); + const float32x2_t *v216 = &v5[v22]; + const float32x2_t *v225 = &v5[v31]; + const float32x2_t *v234 = &v5[v38]; + const 
float32x2_t *v243 = &v5[v47]; + const float32x2_t *v252 = &v5[v54]; + svfloat32_t v270 = svdup_n_f32(v109); + svfloat32_t v271 = svdup_n_f32(v116); + svfloat32_t v272 = svdup_n_f32(v123); + svfloat32_t v273 = svdup_n_f32(v130); + float32x2_t *v299 = &v6[v167]; + float32x2_t *v308 = &v6[v174]; + float32x2_t *v317 = &v6[v181]; + float32x2_t *v326 = &v6[v188]; + float32x2_t *v335 = &v6[v195]; + svfloat32_t v351 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v262)[0])); + svfloat32_t v341 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v216)[0])); + svfloat32_t v343 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v225)[0])); + svfloat32_t v345 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v234)[0])); + svfloat32_t v347 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v243)[0])); + svfloat32_t v349 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v252)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v72 = svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v73 = svsub_f32_x(svptrue_b32(), v44, v60); + svfloat32_t v74 = svsub_f32_x(svptrue_b32(), v60, v28); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v29, v45); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v29, v45); + svfloat32_t v78 = svsub_f32_x(svptrue_b32(), v45, v61); + svfloat32_t v79 = svsub_f32_x(svptrue_b32(), v61, v29); + svfloat32_t v63 = svadd_f32_x(svptrue_b32(), v62, v60); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v75, v61); + svfloat32_t zero118 = svdup_n_f32(0); + svfloat32_t v118 = svcmla_f32_x(pred_full, 
zero118, v271, v77, 90); + svfloat32_t zero125 = svdup_n_f32(0); + svfloat32_t v125 = svcmla_f32_x(pred_full, zero125, v272, v78, 90); + svfloat32_t zero132 = svdup_n_f32(0); + svfloat32_t v132 = svcmla_f32_x(pred_full, zero132, v273, v79, 90); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v63, v351); + svfloat32_t zero111 = svdup_n_f32(0); + svfloat32_t v111 = svcmla_f32_x(pred_full, zero111, v270, v76, 90); + svfloat32_t v133 = svmla_f32_x(pred_full, v71, v63, v266); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v111, v118); + svfloat32_t v142 = svsub_f32_x(svptrue_b32(), v111, v118); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v111, v125); + svst1_f64(pred_full, (double *)(v281), svreinterpret_f64_f32(v71)); + svfloat32_t v134 = svmla_f32_x(pred_full, v133, v72, v267); + svfloat32_t v136 = svmls_f32_x(pred_full, v133, v72, v267); + svfloat32_t v138 = svmls_f32_x(pred_full, v133, v73, v268); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v140, v125); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v142, v132); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v144, v132); + svfloat32_t v135 = svmla_f32_x(pred_full, v134, v73, v268); + svfloat32_t v137 = svmls_f32_x(pred_full, v136, v74, v269); + svfloat32_t v139 = svmla_f32_x(pred_full, v138, v74, v269); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v135, v141); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v135, v141); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v137, v143); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v137, v143); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v139, v145); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v139, v145); + svst1_f64(pred_full, (double *)(v290), svreinterpret_f64_f32(v147)); + svst1_f64(pred_full, (double *)(v299), svreinterpret_f64_f32(v149)); + svst1_f64(pred_full, (double *)(v308), svreinterpret_f64_f32(v150)); + svst1_f64(pred_full, (double *)(v317), svreinterpret_f64_f32(v151)); + svst1_f64(pred_full, (double *)(v326), 
svreinterpret_f64_f32(v148)); + svst1_f64(pred_full, (double *)(v335), svreinterpret_f64_f32(v146)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun8(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v88 = 1.0000000000000000e+00F; + float v89 = -1.0000000000000000e+00F; + float v96 = -7.0710678118654746e-01F; + float v103 = 7.0710678118654757e-01F; + float32x2_t v13 = v5[0]; + float32x2_t v37 = v5[istride]; + float32x2_t v90 = (float32x2_t){v88, v89}; + float32x2_t v97 = (float32x2_t){v103, v96}; + float32x2_t v98 = (float32x2_t){v4, v4}; + float32x2_t v104 = (float32x2_t){v103, v103}; + float32x2_t v18 = v5[istride * 4]; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v30 = v5[istride * 6]; + float32x2_t v42 = v5[istride * 5]; + float32x2_t v49 = v5[istride * 3]; + float32x2_t v54 = v5[istride * 7]; + float32x2_t v92 = vmul_f32(v98, v90); + float32x2_t v99 = vmul_f32(v98, v97); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v57 = vadd_f32(v19, v31); + float32x2_t v58 = vsub_f32(v19, v31); + float32x2_t v59 = vadd_f32(v43, v55); + float32x2_t v60 = vsub_f32(v43, v55); + float32x2_t v63 = vadd_f32(v44, v56); + float32x2_t v64 = vsub_f32(v44, v56); + float32x2_t v93 = vrev64_f32(v32); + float32x2_t v61 = vadd_f32(v57, v59); + float32x2_t v62 = vsub_f32(v57, v59); + float32x2_t v82 = vrev64_f32(v60); + float32x2_t v94 = vmul_f32(v93, v92); + float32x2_t v100 = vrev64_f32(v63); + float32x2_t v105 = vmul_f32(v64, v104); + float32x2_t v83 = vmul_f32(v82, v92); + 
float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v108 = vadd_f32(v20, v105); + float32x2_t v109 = vsub_f32(v20, v105); + v6[0] = v61; + v6[ostride * 4] = v62; + float32x2_t v106 = vadd_f32(v58, v83); + float32x2_t v107 = vsub_f32(v58, v83); + float32x2_t v110 = vadd_f32(v94, v101); + float32x2_t v111 = vsub_f32(v94, v101); + float32x2_t v112 = vadd_f32(v108, v110); + float32x2_t v113 = vsub_f32(v108, v110); + float32x2_t v114 = vadd_f32(v109, v111); + float32x2_t v115 = vsub_f32(v109, v111); + v6[ostride * 2] = v107; + v6[ostride * 6] = v106; + v6[ostride] = v113; + v6[ostride * 3] = v114; + v6[ostride * 5] = v115; + v6[ostride * 7] = v112; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun8(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v114 = -1.0000000000000000e+00F; + float v121 = -7.0710678118654746e-01F; + float v128 = 7.0710678118654757e-01F; + const float32x2_t *v241 = &v5[v0]; + float32x2_t *v295 = &v6[v2]; + int64_t v22 = v0 * 4; + int64_t v31 = v0 * 2; + int64_t v38 = v0 * 6; + int64_t v54 = v0 * 5; + int64_t v63 = v0 * 3; + int64_t v70 = v0 * 7; + float v117 = v4 * v114; + float v124 = v4 * v121; + int64_t v157 = v2 * 2; + int64_t v164 = v2 * 3; + int64_t v171 = v2 * 4; + int64_t v178 = v2 * 5; + int64_t v185 = v2 * 6; + int64_t v192 = v2 * 7; + const float32x2_t *v205 = &v5[0]; + svfloat32_t v278 = svdup_n_f32(v128); + float32x2_t *v286 = &v6[0]; + svfloat32_t v361 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v241)[0])); + const float32x2_t *v214 = &v5[v22]; + const float32x2_t *v223 = &v5[v31]; + const float32x2_t *v232 = &v5[v38]; + const float32x2_t *v250 = &v5[v54]; + const float32x2_t *v259 = &v5[v63]; + const 
float32x2_t *v268 = &v5[v70]; + svfloat32_t v276 = svdup_n_f32(v117); + svfloat32_t v277 = svdup_n_f32(v124); + float32x2_t *v304 = &v6[v157]; + float32x2_t *v313 = &v6[v164]; + float32x2_t *v322 = &v6[v171]; + float32x2_t *v331 = &v6[v178]; + float32x2_t *v340 = &v6[v185]; + float32x2_t *v349 = &v6[v192]; + svfloat32_t v353 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v205)[0])); + svfloat32_t v355 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v214)[0])); + svfloat32_t v357 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v223)[0])); + svfloat32_t v359 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v232)[0])); + svfloat32_t v363 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v250)[0])); + svfloat32_t v365 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v259)[0])); + svfloat32_t v367 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v268)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v353, v355); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v353, v355); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v357, v359); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v357, v359); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v78 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v79 = svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v60, v76); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v60, v76); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v61, v77); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v61, v77); + svfloat32_t zero119 = svdup_n_f32(0); + svfloat32_t v119 = svcmla_f32_x(pred_full, zero119, v276, v45, 90); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v78, v80); + svfloat32_t v83 = 
svsub_f32_x(svptrue_b32(), v78, v80); + svfloat32_t zero107 = svdup_n_f32(0); + svfloat32_t v107 = svcmla_f32_x(pred_full, zero107, v276, v81, 90); + svfloat32_t zero126 = svdup_n_f32(0); + svfloat32_t v126 = svcmla_f32_x(pred_full, zero126, v277, v84, 90); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v79, v107); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v79, v107); + svfloat32_t v134 = svmla_f32_x(pred_full, v29, v85, v278); + svfloat32_t v135 = svmls_f32_x(pred_full, v29, v85, v278); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v119, v126); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v119, v126); + svst1_f64(pred_full, (double *)(v286), svreinterpret_f64_f32(v82)); + svst1_f64(pred_full, (double *)(v322), svreinterpret_f64_f32(v83)); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v134, v136); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v134, v136); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v135, v137); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v135, v137); + svst1_f64(pred_full, (double *)(v304), svreinterpret_f64_f32(v133)); + svst1_f64(pred_full, (double *)(v340), svreinterpret_f64_f32(v132)); + svst1_f64(pred_full, (double *)(v295), svreinterpret_f64_f32(v139)); + svst1_f64(pred_full, (double *)(v313), svreinterpret_f64_f32(v140)); + svst1_f64(pred_full, (double *)(v331), svreinterpret_f64_f32(v141)); + svst1_f64(pred_full, (double *)(v349), svreinterpret_f64_f32(v138)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun9(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v79 = -5.0000000000000000e-01F; + float v90 = -1.4999999999999998e+00F; + float v93 = 8.6602540378443871e-01F; + float v94 = -8.6602540378443871e-01F; + float v101 = 7.6604444311897801e-01F; + float v105 = 9.3969262078590832e-01F; + float v109 = 
-1.7364817766693039e-01F; + float v112 = 6.4278760968653925e-01F; + float v113 = -6.4278760968653925e-01F; + float v119 = -3.4202014332566888e-01F; + float v120 = 3.4202014332566888e-01F; + float v126 = 9.8480775301220802e-01F; + float v127 = -9.8480775301220802e-01F; + float32x2_t v13 = v5[istride]; + float32x2_t v64 = v5[0]; + float32x2_t v80 = (float32x2_t){v79, v79}; + float32x2_t v91 = (float32x2_t){v90, v90}; + float32x2_t v95 = (float32x2_t){v93, v94}; + float32x2_t v102 = (float32x2_t){v101, v101}; + float32x2_t v106 = (float32x2_t){v105, v105}; + float32x2_t v110 = (float32x2_t){v109, v109}; + float32x2_t v114 = (float32x2_t){v112, v113}; + float32x2_t v121 = (float32x2_t){v119, v120}; + float32x2_t v128 = (float32x2_t){v126, v127}; + float32x2_t v129 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 8]; + float32x2_t v25 = v5[istride * 7]; + float32x2_t v30 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 3]; + float32x2_t v42 = v5[istride * 6]; + float32x2_t v49 = v5[istride * 4]; + float32x2_t v54 = v5[istride * 5]; + float32x2_t v97 = vmul_f32(v129, v95); + float32x2_t v116 = vmul_f32(v129, v114); + float32x2_t v123 = vmul_f32(v129, v121); + float32x2_t v130 = vmul_f32(v129, v128); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v57 = vadd_f32(v19, v31); + float32x2_t v66 = vadd_f32(v20, v32); + float32x2_t v68 = vsub_f32(v19, v31); + float32x2_t v69 = vsub_f32(v31, v55); + float32x2_t v70 = vsub_f32(v55, v19); + float32x2_t v71 = vsub_f32(v20, v32); + float32x2_t v72 = vsub_f32(v32, v56); + float32x2_t v73 = vsub_f32(v56, v20); + float32x2_t v92 = vmul_f32(v43, v91); + float32x2_t v98 = vrev64_f32(v44); + float32x2_t v58 = vadd_f32(v57, v55); + float32x2_t v67 
= vadd_f32(v66, v56); + float32x2_t v99 = vmul_f32(v98, v97); + float32x2_t v103 = vmul_f32(v68, v102); + float32x2_t v107 = vmul_f32(v69, v106); + float32x2_t v111 = vmul_f32(v70, v110); + float32x2_t v117 = vrev64_f32(v71); + float32x2_t v124 = vrev64_f32(v72); + float32x2_t v131 = vrev64_f32(v73); + float32x2_t v59 = vadd_f32(v58, v43); + float32x2_t v81 = vmul_f32(v58, v80); + float32x2_t v87 = vrev64_f32(v67); + float32x2_t v118 = vmul_f32(v117, v116); + float32x2_t v125 = vmul_f32(v124, v123); + float32x2_t v132 = vmul_f32(v131, v130); + float32x2_t v65 = vadd_f32(v59, v64); + float32x2_t v88 = vmul_f32(v87, v97); + float32x2_t v133 = vadd_f32(v81, v81); + float32x2_t v146 = vadd_f32(v99, v118); + float32x2_t v148 = vsub_f32(v99, v125); + float32x2_t v150 = vsub_f32(v99, v118); + float32x2_t v134 = vadd_f32(v133, v81); + float32x2_t v138 = vadd_f32(v65, v92); + float32x2_t v147 = vadd_f32(v146, v125); + float32x2_t v149 = vadd_f32(v148, v132); + float32x2_t v151 = vsub_f32(v150, v132); + v6[0] = v65; + float32x2_t v135 = vadd_f32(v65, v134); + float32x2_t v139 = vadd_f32(v138, v133); + float32x2_t v136 = vadd_f32(v135, v88); + float32x2_t v137 = vsub_f32(v135, v88); + float32x2_t v140 = vadd_f32(v139, v103); + float32x2_t v142 = vsub_f32(v139, v107); + float32x2_t v144 = vsub_f32(v139, v103); + float32x2_t v141 = vadd_f32(v140, v107); + float32x2_t v143 = vadd_f32(v142, v111); + float32x2_t v145 = vsub_f32(v144, v111); + v6[ostride * 3] = v137; + v6[ostride * 6] = v136; + float32x2_t v152 = vadd_f32(v141, v147); + float32x2_t v153 = vsub_f32(v141, v147); + float32x2_t v154 = vadd_f32(v143, v149); + float32x2_t v155 = vsub_f32(v143, v149); + float32x2_t v156 = vadd_f32(v145, v151); + float32x2_t v157 = vsub_f32(v145, v151); + v6[ostride] = v153; + v6[ostride * 2] = v154; + v6[ostride * 4] = v157; + v6[ostride * 5] = v156; + v6[ostride * 7] = v155; + v6[ostride * 8] = v152; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun9(const 
armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v103 = -5.0000000000000000e-01F; + float v115 = -1.4999999999999998e+00F; + float v120 = -8.6602540378443871e-01F; + float v127 = 7.6604444311897801e-01F; + float v132 = 9.3969262078590832e-01F; + float v137 = -1.7364817766693039e-01F; + float v142 = -6.4278760968653925e-01F; + float v149 = 3.4202014332566888e-01F; + float v156 = -9.8480775301220802e-01F; + const float32x2_t *v256 = &v5[v0]; + float32x2_t *v359 = &v6[v2]; + int64_t v22 = v0 * 8; + int64_t v31 = v0 * 7; + int64_t v38 = v0 * 2; + int64_t v47 = v0 * 3; + int64_t v54 = v0 * 6; + int64_t v63 = v0 * 4; + int64_t v70 = v0 * 5; + float v123 = v4 * v120; + float v145 = v4 * v142; + float v152 = v4 * v149; + float v159 = v4 * v156; + int64_t v202 = v2 * 2; + int64_t v209 = v2 * 3; + int64_t v216 = v2 * 4; + int64_t v223 = v2 * 5; + int64_t v230 = v2 * 6; + int64_t v237 = v2 * 7; + int64_t v244 = v2 * 8; + const float32x2_t *v329 = &v5[0]; + svfloat32_t v333 = svdup_n_f32(v103); + svfloat32_t v335 = svdup_n_f32(v115); + svfloat32_t v337 = svdup_n_f32(v127); + svfloat32_t v338 = svdup_n_f32(v132); + svfloat32_t v339 = svdup_n_f32(v137); + float32x2_t *v350 = &v6[0]; + svfloat32_t v426 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v256)[0])); + const float32x2_t *v265 = &v5[v22]; + const float32x2_t *v274 = &v5[v31]; + const float32x2_t *v283 = &v5[v38]; + const float32x2_t *v292 = &v5[v47]; + const float32x2_t *v301 = &v5[v54]; + const float32x2_t *v310 = &v5[v63]; + const float32x2_t *v319 = &v5[v70]; + svfloat32_t v336 = svdup_n_f32(v123); + svfloat32_t v340 = svdup_n_f32(v145); + svfloat32_t v341 = svdup_n_f32(v152); + svfloat32_t v342 = svdup_n_f32(v159); + 
float32x2_t *v368 = &v6[v202]; + float32x2_t *v377 = &v6[v209]; + float32x2_t *v386 = &v6[v216]; + float32x2_t *v395 = &v6[v223]; + float32x2_t *v404 = &v6[v230]; + float32x2_t *v413 = &v6[v237]; + float32x2_t *v422 = &v6[v244]; + svfloat32_t v442 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v329)[0])); + svfloat32_t v428 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v265)[0])); + svfloat32_t v430 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v274)[0])); + svfloat32_t v432 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v283)[0])); + svfloat32_t v434 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v292)[0])); + svfloat32_t v436 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v301)[0])); + svfloat32_t v438 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v310)[0])); + svfloat32_t v440 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v319)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v434, v436); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v78 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v29, v45); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v44, v76); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v76, v28); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v29, v45); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v45, v77); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v77, v29); + svfloat32_t zero125 = svdup_n_f32(0); + svfloat32_t v125 
= svcmla_f32_x(pred_full, zero125, v336, v61, 90); + svfloat32_t v79 = svadd_f32_x(svptrue_b32(), v78, v76); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v89, v77); + svfloat32_t zero147 = svdup_n_f32(0); + svfloat32_t v147 = svcmla_f32_x(pred_full, zero147, v340, v94, 90); + svfloat32_t zero154 = svdup_n_f32(0); + svfloat32_t v154 = svcmla_f32_x(pred_full, zero154, v341, v95, 90); + svfloat32_t zero161 = svdup_n_f32(0); + svfloat32_t v161 = svcmla_f32_x(pred_full, zero161, v342, v96, 90); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v79, v60); + svfloat32_t v106 = svmul_f32_x(svptrue_b32(), v79, v333); + svfloat32_t zero113 = svdup_n_f32(0); + svfloat32_t v113 = svcmla_f32_x(pred_full, zero113, v336, v90, 90); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v125, v147); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v125, v154); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v125, v147); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v80, v442); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v106, v106); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v175, v154); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v177, v161); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v179, v161); + svfloat32_t v163 = svmla_f32_x(pred_full, v162, v79, v333); + svfloat32_t v167 = svmla_f32_x(pred_full, v88, v60, v335); + svst1_f64(pred_full, (double *)(v350), svreinterpret_f64_f32(v88)); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v88, v163); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v167, v162); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v164, v113); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v164, v113); + svfloat32_t v169 = svmla_f32_x(pred_full, v168, v91, v337); + svfloat32_t v171 = svmls_f32_x(pred_full, v168, v92, v338); + svfloat32_t v173 = svmls_f32_x(pred_full, v168, v91, v337); + svfloat32_t v170 = svmla_f32_x(pred_full, v169, v92, v338); + svfloat32_t v172 = svmla_f32_x(pred_full, v171, v93, v339); + svfloat32_t v174 = 
svmls_f32_x(pred_full, v173, v93, v339); + svst1_f64(pred_full, (double *)(v377), svreinterpret_f64_f32(v166)); + svst1_f64(pred_full, (double *)(v404), svreinterpret_f64_f32(v165)); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v170, v176); + svfloat32_t v182 = svsub_f32_x(svptrue_b32(), v170, v176); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v172, v178); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v172, v178); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v174, v180); + svst1_f64(pred_full, (double *)(v359), svreinterpret_f64_f32(v182)); + svst1_f64(pred_full, (double *)(v368), svreinterpret_f64_f32(v183)); + svst1_f64(pred_full, (double *)(v386), svreinterpret_f64_f32(v186)); + svst1_f64(pred_full, (double *)(v395), svreinterpret_f64_f32(v185)); + svst1_f64(pred_full, (double *)(v413), svreinterpret_f64_f32(v184)); + svst1_f64(pred_full, (double *)(v422), svreinterpret_f64_f32(v181)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun10(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v132 = -1.2500000000000000e+00F; + float v136 = 5.5901699437494745e-01F; + float v139 = 1.5388417685876268e+00F; + float v140 = -1.5388417685876268e+00F; + float v146 = 5.8778525229247325e-01F; + float v147 = -5.8778525229247325e-01F; + float v153 = 3.6327126400268028e-01F; + float v154 = -3.6327126400268028e-01F; + float32x2_t v13 = v5[0]; + float32x2_t v54 = v5[istride]; + float32x2_t v133 = (float32x2_t){v132, v132}; + float32x2_t v137 = (float32x2_t){v136, v136}; + float32x2_t v141 = (float32x2_t){v139, v140}; + float32x2_t v148 = (float32x2_t){v146, v147}; + float32x2_t v155 = (float32x2_t){v153, v154}; + float32x2_t v156 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 5]; + 
float32x2_t v25 = v5[istride * 2]; + float32x2_t v30 = v5[istride * 7]; + float32x2_t v37 = v5[istride * 4]; + float32x2_t v42 = v5[istride * 9]; + float32x2_t v49 = v5[istride * 6]; + float32x2_t v61 = v5[istride * 8]; + float32x2_t v66 = v5[istride * 3]; + float32x2_t v143 = vmul_f32(v156, v141); + float32x2_t v150 = vmul_f32(v156, v148); + float32x2_t v157 = vmul_f32(v156, v155); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v67 = vadd_f32(v61, v66); + float32x2_t v68 = vsub_f32(v61, v66); + float32x2_t v69 = vadd_f32(v31, v67); + float32x2_t v70 = vsub_f32(v31, v67); + float32x2_t v71 = vadd_f32(v55, v43); + float32x2_t v72 = vsub_f32(v55, v43); + float32x2_t v119 = vadd_f32(v32, v68); + float32x2_t v120 = vsub_f32(v32, v68); + float32x2_t v121 = vadd_f32(v56, v44); + float32x2_t v122 = vsub_f32(v56, v44); + float32x2_t v73 = vadd_f32(v69, v71); + float32x2_t v74 = vsub_f32(v69, v71); + float32x2_t v75 = vadd_f32(v70, v72); + float32x2_t v94 = vrev64_f32(v70); + float32x2_t v108 = vrev64_f32(v72); + float32x2_t v123 = vadd_f32(v119, v121); + float32x2_t v124 = vsub_f32(v119, v121); + float32x2_t v125 = vadd_f32(v120, v122); + float32x2_t v144 = vrev64_f32(v120); + float32x2_t v158 = vrev64_f32(v122); + float32x2_t v76 = vadd_f32(v73, v19); + float32x2_t v84 = vmul_f32(v73, v133); + float32x2_t v88 = vmul_f32(v74, v137); + float32x2_t v95 = vmul_f32(v94, v143); + float32x2_t v101 = vrev64_f32(v75); + float32x2_t v109 = vmul_f32(v108, v157); + float32x2_t v126 = vadd_f32(v123, v20); + float32x2_t v134 = vmul_f32(v123, v133); + float32x2_t v138 = vmul_f32(v124, v137); + float32x2_t v145 = vmul_f32(v144, v143); + float32x2_t v151 = vrev64_f32(v125); + float32x2_t v159 = 
vmul_f32(v158, v157); + float32x2_t v102 = vmul_f32(v101, v150); + float32x2_t v110 = vadd_f32(v76, v84); + float32x2_t v152 = vmul_f32(v151, v150); + float32x2_t v160 = vadd_f32(v126, v134); + v6[0] = v76; + v6[ostride * 5] = v126; + float32x2_t v111 = vadd_f32(v110, v88); + float32x2_t v112 = vsub_f32(v110, v88); + float32x2_t v113 = vsub_f32(v95, v102); + float32x2_t v114 = vadd_f32(v102, v109); + float32x2_t v161 = vadd_f32(v160, v138); + float32x2_t v162 = vsub_f32(v160, v138); + float32x2_t v163 = vsub_f32(v145, v152); + float32x2_t v164 = vadd_f32(v152, v159); + float32x2_t v115 = vadd_f32(v111, v113); + float32x2_t v116 = vsub_f32(v111, v113); + float32x2_t v117 = vadd_f32(v112, v114); + float32x2_t v118 = vsub_f32(v112, v114); + float32x2_t v165 = vadd_f32(v161, v163); + float32x2_t v166 = vsub_f32(v161, v163); + float32x2_t v167 = vadd_f32(v162, v164); + float32x2_t v168 = vsub_f32(v162, v164); + v6[ostride * 6] = v116; + v6[ostride] = v166; + v6[ostride * 2] = v118; + v6[ostride * 7] = v168; + v6[ostride * 8] = v117; + v6[ostride * 3] = v167; + v6[ostride * 4] = v115; + v6[ostride * 9] = v165; +} +#endif + +#ifdef ARMRAL_ARCH_SVE /* SVE build of the uun10 kernel. Reads ten complex float inputs from x at element offsets k * istride (k = 0..9) and writes ten outputs to y at offsets k * ostride, using only adds, subtracts and multiplies by the constants declared below. dir scales the three constants that feed the svcmla rotation-90 terms; howmany is not referenced in this body. NOTE(review): name and access pattern suggest a length-10 DFT butterfly with dir selecting the transform direction -- confirm against the kernel generator. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun10(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); /* predicate used by every svld1_f64 / svst1_f64 below; presumably limits each access to one complex value (two floats) -- TODO confirm */ + float v161 = -1.2500000000000000e+00F; + float v166 = 5.5901699437494745e-01F; + float v171 = -1.5388417685876268e+00F; + float v178 = -5.8778525229247325e-01F; + float v185 = -3.6327126400268028e-01F; + const float32x2_t *v340 = &v5[v0]; + float32x2_t *v407 = &v6[v2]; + int64_t v22 = v0 * 5; + int64_t v31 = v0 * 2; + int64_t v38 = v0 * 7; + int64_t v47 = v0 * 4; + int64_t v54 = v0 * 9; + int64_t v63 = v0 * 6; + int64_t v79 = v0 * 8; + int64_t v86 = v0 * 3; + 
float v174 = v4 * v171; + float v181 = v4 * v178; + float v188 = v4 * v185; /* the three dir-scaled constants are broadcast below and used only as svcmla (rotation 90) multipliers */ + int64_t v208 = v2 * 5; + int64_t v215 = v2 * 6; + int64_t v229 = v2 * 2; + int64_t v236 = v2 * 7; + int64_t v243 = v2 * 8; + int64_t v250 = v2 * 3; + int64_t v257 = v2 * 4; + int64_t v264 = v2 * 9; + const float32x2_t *v277 = &v5[0]; + svfloat32_t v368 = svdup_n_f32(v161); + svfloat32_t v369 = svdup_n_f32(v166); + float32x2_t *v380 = &v6[0]; + svfloat32_t v479 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v340)[0])); + const float32x2_t *v286 = &v5[v22]; + const float32x2_t *v295 = &v5[v31]; + const float32x2_t *v304 = &v5[v38]; + const float32x2_t *v313 = &v5[v47]; + const float32x2_t *v322 = &v5[v54]; + const float32x2_t *v331 = &v5[v63]; + const float32x2_t *v349 = &v5[v79]; + const float32x2_t *v358 = &v5[v86]; + svfloat32_t v370 = svdup_n_f32(v174); + svfloat32_t v371 = svdup_n_f32(v181); + svfloat32_t v372 = svdup_n_f32(v188); + float32x2_t *v389 = &v6[v208]; + float32x2_t *v398 = &v6[v215]; + float32x2_t *v416 = &v6[v229]; + float32x2_t *v425 = &v6[v236]; + float32x2_t *v434 = &v6[v243]; + float32x2_t *v443 = &v6[v250]; + float32x2_t *v452 = &v6[v257]; + float32x2_t *v461 = &v6[v264]; + svfloat32_t v465 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v277)[0])); + svfloat32_t v467 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v286)[0])); + svfloat32_t v469 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v295)[0])); + svfloat32_t v471 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v304)[0])); + svfloat32_t v473 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v313)[0])); + svfloat32_t v475 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v322)[0])); + svfloat32_t v477 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v331)[0])); + svfloat32_t v481 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v349)[0])); + svfloat32_t v483 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v358)[0])); /* all ten inputs are now loaded; pairwise sums and differences follow */ + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v465, v467); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v465, v467); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v469, v471); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v469, v471); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v473, v475); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v473, v475); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v481, v483); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v481, v483); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v44, v92); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v44, v92); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v76, v60); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v76, v60); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v45, v93); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v45, v93); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v77, v61); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v77, v61); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v94, v96); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v94, v96); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v95, v97); + svfloat32_t zero123 = svdup_n_f32(0); + svfloat32_t v123 = svcmla_f32_x(pred_full, zero123, v370, v95, 90); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v148, v150); + svfloat32_t zero176 = svdup_n_f32(0); + svfloat32_t v176 = svcmla_f32_x(pred_full, zero176, v370, v148, 90); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v98, v28); + svfloat32_t zero130 = svdup_n_f32(0); + svfloat32_t v130 = svcmla_f32_x(pred_full, zero130, v371, v100, 90); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v151, v29); + svfloat32_t zero183 = svdup_n_f32(0); + 
svfloat32_t v183 = svcmla_f32_x(pred_full, zero183, v371, v153, 90); + svfloat32_t v138 = svmla_f32_x(pred_full, v101, v98, v368); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v123, v130); + svfloat32_t v142 = svcmla_f32_x(pred_full, v130, v372, v97, 90); + svfloat32_t v191 = svmla_f32_x(pred_full, v154, v151, v368); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v176, v183); + svfloat32_t v195 = svcmla_f32_x(pred_full, v183, v372, v150, 90); + svst1_f64(pred_full, (double *)(v380), svreinterpret_f64_f32(v101)); + svst1_f64(pred_full, (double *)(v389), svreinterpret_f64_f32(v154)); + svfloat32_t v139 = svmla_f32_x(pred_full, v138, v99, v369); + svfloat32_t v140 = svmls_f32_x(pred_full, v138, v99, v369); + svfloat32_t v192 = svmla_f32_x(pred_full, v191, v152, v369); + svfloat32_t v193 = svmls_f32_x(pred_full, v191, v152, v369); + svfloat32_t v143 = svadd_f32_x(svptrue_b32(), v139, v141); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v139, v141); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v140, v142); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v140, v142); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v192, v194); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v192, v194); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v193, v195); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v193, v195); + svst1_f64(pred_full, (double *)(v398), svreinterpret_f64_f32(v144)); + svst1_f64(pred_full, (double *)(v407), svreinterpret_f64_f32(v197)); + svst1_f64(pred_full, (double *)(v416), svreinterpret_f64_f32(v146)); + svst1_f64(pred_full, (double *)(v425), svreinterpret_f64_f32(v199)); + svst1_f64(pred_full, (double *)(v434), svreinterpret_f64_f32(v145)); + svst1_f64(pred_full, (double *)(v443), svreinterpret_f64_f32(v198)); + svst1_f64(pred_full, (double *)(v452), svreinterpret_f64_f32(v143)); + svst1_f64(pred_full, (double *)(v461), svreinterpret_f64_f32(v196)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Non-SVE (float32x2_t) build of the uun11 kernel. Reads eleven complex float inputs from x at element offsets k * istride (k = 0..10) and writes eleven outputs to y at offsets k * ostride. Each {c, -c} constant pair is multiplied by dir and then applied to a lane-swapped (vrev64_f32) operand; howmany is not referenced in this body. NOTE(review): name and access pattern suggest a length-11 DFT butterfly with dir selecting the transform direction -- confirm against the kernel generator. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun11(const 
armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v106 = 1.1000000000000001e+00F; + float v109 = 3.3166247903554003e-01F; + float v110 = -3.3166247903554003e-01F; + float v117 = 5.1541501300188641e-01F; + float v121 = 9.4125353283118118e-01F; + float v125 = 1.4143537075597825e+00F; + float v129 = 8.5949297361449750e-01F; + float v133 = 4.2314838273285138e-02F; + float v137 = 3.8639279888589606e-01F; + float v141 = 5.1254589567200015e-01F; + float v145 = 1.0702757469471715e+00F; + float v149 = 5.5486073394528512e-01F; + float v152 = 1.2412944743900585e+00F; + float v153 = -1.2412944743900585e+00F; + float v159 = 2.0897833842005756e-01F; + float v160 = -2.0897833842005756e-01F; + float v166 = 3.7415717312460811e-01F; + float v167 = -3.7415717312460811e-01F; + float v173 = 4.9929922194110327e-02F; + float v174 = -4.9929922194110327e-02F; + float v180 = 6.5815896284539266e-01F; + float v181 = -6.5815896284539266e-01F; + float v187 = 6.3306543373877577e-01F; + float v188 = -6.3306543373877577e-01F; + float v194 = 1.0822460581641109e+00F; + float v195 = -1.0822460581641109e+00F; + float v201 = 8.1720737907134022e-01F; + float v202 = -8.1720737907134022e-01F; + float v208 = 4.2408709531871824e-01F; + float v209 = -4.2408709531871824e-01F; + float32x2_t v13 = v5[istride]; + float32x2_t v79 = v5[0]; + float32x2_t v107 = (float32x2_t){v106, v106}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v118 = (float32x2_t){v117, v117}; + float32x2_t v122 = (float32x2_t){v121, v121}; + float32x2_t v126 = (float32x2_t){v125, v125}; + float32x2_t v130 = (float32x2_t){v129, v129}; + float32x2_t v134 = (float32x2_t){v133, v133}; + float32x2_t v138 = (float32x2_t){v137, v137}; + float32x2_t v142 = (float32x2_t){v141, v141}; + float32x2_t v146 = (float32x2_t){v145, v145}; + float32x2_t v150 = 
(float32x2_t){v149, v149}; + float32x2_t v154 = (float32x2_t){v152, v153}; + float32x2_t v161 = (float32x2_t){v159, v160}; + float32x2_t v168 = (float32x2_t){v166, v167}; + float32x2_t v175 = (float32x2_t){v173, v174}; + float32x2_t v182 = (float32x2_t){v180, v181}; + float32x2_t v189 = (float32x2_t){v187, v188}; + float32x2_t v196 = (float32x2_t){v194, v195}; + float32x2_t v203 = (float32x2_t){v201, v202}; + float32x2_t v210 = (float32x2_t){v208, v209}; + float32x2_t v211 = (float32x2_t){v4, v4}; /* dir in both lanes; multiplies every {c, -c} pair a few statements below */ + float32x2_t v18 = v5[istride * 10]; + float32x2_t v24 = v5[istride * 2]; + float32x2_t v29 = v5[istride * 9]; + float32x2_t v35 = v5[istride * 3]; + float32x2_t v40 = v5[istride * 8]; + float32x2_t v46 = v5[istride * 4]; + float32x2_t v51 = v5[istride * 7]; + float32x2_t v57 = v5[istride * 5]; + float32x2_t v62 = v5[istride * 6]; + float32x2_t v113 = vmul_f32(v211, v111); + float32x2_t v156 = vmul_f32(v211, v154); + float32x2_t v163 = vmul_f32(v211, v161); + float32x2_t v170 = vmul_f32(v211, v168); + float32x2_t v177 = vmul_f32(v211, v175); + float32x2_t v184 = vmul_f32(v211, v182); + float32x2_t v191 = vmul_f32(v211, v189); + float32x2_t v198 = vmul_f32(v211, v196); + float32x2_t v205 = vmul_f32(v211, v203); + float32x2_t v212 = vmul_f32(v211, v210); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v30 = vadd_f32(v24, v29); + float32x2_t v41 = vadd_f32(v35, v40); + float32x2_t v52 = vadd_f32(v46, v51); + float32x2_t v63 = vadd_f32(v57, v62); + float32x2_t v64 = vsub_f32(v13, v18); + float32x2_t v65 = vsub_f32(v24, v29); + float32x2_t v66 = vsub_f32(v35, v40); + float32x2_t v67 = vsub_f32(v46, v51); + float32x2_t v68 = vsub_f32(v57, v62); + float32x2_t v69 = vadd_f32(v19, v30); + float32x2_t v70 = vadd_f32(v41, v63); + float32x2_t v72 = vsub_f32(v65, v66); + float32x2_t v73 = vadd_f32(v64, v68); + float32x2_t v83 = vsub_f32(v30, v52); + float32x2_t v84 = vsub_f32(v19, v52); + float32x2_t v85 = vsub_f32(v30, v19); + float32x2_t v86 = vsub_f32(v63, v52); + 
float32x2_t v87 = vsub_f32(v41, v52); + float32x2_t v88 = vsub_f32(v63, v41); + float32x2_t v89 = vsub_f32(v30, v63); + float32x2_t v90 = vsub_f32(v19, v41); + float32x2_t v92 = vadd_f32(v65, v67); + float32x2_t v93 = vsub_f32(v64, v67); + float32x2_t v94 = vadd_f32(v64, v65); + float32x2_t v95 = vsub_f32(v67, v68); + float32x2_t v96 = vsub_f32(v66, v67); + float32x2_t v97 = vsub_f32(v66, v68); + float32x2_t v98 = vadd_f32(v65, v68); + float32x2_t v99 = vsub_f32(v64, v66); + float32x2_t v71 = vadd_f32(v52, v69); + float32x2_t v81 = vsub_f32(v72, v73); + float32x2_t v91 = vsub_f32(v70, v69); + float32x2_t v100 = vadd_f32(v72, v73); + float32x2_t v119 = vmul_f32(v83, v118); + float32x2_t v123 = vmul_f32(v84, v122); + float32x2_t v127 = vmul_f32(v85, v126); + float32x2_t v131 = vmul_f32(v86, v130); + float32x2_t v135 = vmul_f32(v87, v134); + float32x2_t v139 = vmul_f32(v88, v138); + float32x2_t v143 = vmul_f32(v89, v142); + float32x2_t v147 = vmul_f32(v90, v146); + float32x2_t v157 = vrev64_f32(v92); + float32x2_t v164 = vrev64_f32(v93); + float32x2_t v171 = vrev64_f32(v94); + float32x2_t v178 = vrev64_f32(v95); + float32x2_t v185 = vrev64_f32(v96); + float32x2_t v192 = vrev64_f32(v97); + float32x2_t v199 = vrev64_f32(v98); + float32x2_t v206 = vrev64_f32(v99); + float32x2_t v74 = vadd_f32(v71, v70); + float32x2_t v82 = vsub_f32(v81, v67); + float32x2_t v151 = vmul_f32(v91, v150); + float32x2_t v158 = vmul_f32(v157, v156); + float32x2_t v165 = vmul_f32(v164, v163); + float32x2_t v172 = vmul_f32(v171, v170); + float32x2_t v179 = vmul_f32(v178, v177); + float32x2_t v186 = vmul_f32(v185, v184); + float32x2_t v193 = vmul_f32(v192, v191); + float32x2_t v200 = vmul_f32(v199, v198); + float32x2_t v207 = vmul_f32(v206, v205); + float32x2_t v213 = vrev64_f32(v100); + float32x2_t v216 = vadd_f32(v119, v123); + float32x2_t v217 = vadd_f32(v123, v127); + float32x2_t v218 = vsub_f32(v119, v127); + float32x2_t v219 = vadd_f32(v131, v135); + float32x2_t v220 = vadd_f32(v135, v139); 
+ float32x2_t v221 = vsub_f32(v131, v139); + float32x2_t v80 = vadd_f32(v79, v74); + float32x2_t v108 = vmul_f32(v74, v107); + float32x2_t v114 = vrev64_f32(v82); + float32x2_t v214 = vmul_f32(v213, v212); + float32x2_t v222 = vadd_f32(v147, v151); + float32x2_t v223 = vadd_f32(v143, v151); + float32x2_t v224 = vadd_f32(v165, v172); + float32x2_t v225 = vsub_f32(v158, v172); + float32x2_t v226 = vadd_f32(v186, v193); + float32x2_t v227 = vsub_f32(v179, v193); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v215 = vsub_f32(v80, v108); + float32x2_t v228 = vadd_f32(v207, v214); + float32x2_t v229 = vsub_f32(v200, v214); + float32x2_t v230 = vadd_f32(v220, v222); + float32x2_t v248 = vadd_f32(v224, v225); + v6[0] = v80; /* output 0 is the sum of all eleven inputs */ + float32x2_t v231 = vadd_f32(v230, v215); + float32x2_t v232 = vsub_f32(v215, v217); + float32x2_t v234 = vadd_f32(v215, v221); + float32x2_t v236 = vsub_f32(v215, v218); + float32x2_t v238 = vadd_f32(v215, v216); + float32x2_t v240 = vadd_f32(v115, v226); + float32x2_t v242 = vsub_f32(v228, v224); + float32x2_t v244 = vadd_f32(v115, v229); + float32x2_t v246 = vsub_f32(v229, v225); + float32x2_t v249 = vadd_f32(v248, v226); + float32x2_t v233 = vsub_f32(v232, v222); + float32x2_t v235 = vadd_f32(v234, v223); + float32x2_t v237 = vsub_f32(v236, v223); + float32x2_t v239 = vsub_f32(v238, v219); + float32x2_t v241 = vadd_f32(v240, v228); + float32x2_t v243 = vsub_f32(v242, v115); + float32x2_t v245 = vadd_f32(v244, v227); + float32x2_t v247 = vsub_f32(v246, v115); + float32x2_t v250 = vadd_f32(v249, v227); + float32x2_t v251 = vsub_f32(v250, v115); + float32x2_t v253 = vadd_f32(v231, v241); + float32x2_t v254 = vadd_f32(v233, v243); + float32x2_t v255 = vsub_f32(v235, v245); + float32x2_t v256 = vadd_f32(v237, v247); + float32x2_t v257 = vsub_f32(v237, v247); + float32x2_t v258 = vadd_f32(v235, v245); + float32x2_t v259 = vsub_f32(v233, v243); + float32x2_t v260 = vsub_f32(v231, v241); + float32x2_t v252 = vadd_f32(v239, v251); + float32x2_t 
v261 = vsub_f32(v239, v251); + v6[ostride * 9] = v253; + v6[ostride * 8] = v254; + v6[ostride * 7] = v255; + v6[ostride * 6] = v256; + v6[ostride * 5] = v257; + v6[ostride * 4] = v258; + v6[ostride * 3] = v259; + v6[ostride * 2] = v260; + v6[ostride * 10] = v252; + v6[ostride] = v261; +} +#endif + +#ifdef ARMRAL_ARCH_SVE /* SVE build of the uun11 kernel. Reads eleven complex float inputs from x at element offsets k * istride (k = 0..10) and writes eleven outputs to y at offsets k * ostride. Ten of the constants below are multiplied by dir and used only as svcmla (rotation 90) multipliers; the remaining ones are used in plain multiply / multiply-accumulate steps. howmany is not referenced in this body. NOTE(review): name and access pattern suggest a length-11 DFT butterfly with dir selecting the transform direction -- confirm against the kernel generator. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun11(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); /* predicate used by every svld1_f64 / svst1_f64 below; presumably limits each access to one complex value (two floats) -- TODO confirm */ + float v134 = 1.1000000000000001e+00F; + float v139 = -3.3166247903554003e-01F; + float v146 = 5.1541501300188641e-01F; + float v151 = 9.4125353283118118e-01F; + float v156 = 1.4143537075597825e+00F; + float v161 = 8.5949297361449750e-01F; + float v166 = 4.2314838273285138e-02F; + float v171 = 3.8639279888589606e-01F; + float v176 = 5.1254589567200015e-01F; + float v181 = 1.0702757469471715e+00F; + float v186 = 5.5486073394528512e-01F; + float v191 = -1.2412944743900585e+00F; + float v198 = -2.0897833842005756e-01F; + float v205 = -3.7415717312460811e-01F; + float v212 = -4.9929922194110327e-02F; + float v219 = -6.5815896284539266e-01F; + float v226 = -6.3306543373877577e-01F; + float v233 = -1.0822460581641109e+00F; + float v240 = -8.1720737907134022e-01F; + float v247 = -4.2408709531871824e-01F; + const float32x2_t *v383 = &v5[v0]; + float32x2_t *v595 = &v6[v2]; + int64_t v22 = v0 * 10; + int64_t v30 = v0 * 2; + int64_t v37 = v0 * 9; + int64_t v45 = v0 * 3; + int64_t v52 = v0 * 8; + int64_t v60 = v0 * 4; + int64_t v67 = v0 * 7; + int64_t v75 = v0 * 5; + int64_t v82 = v0 * 6; + float v142 = v4 * v139; + float v194 = v4 * v191; + float v201 = v4 * v198; + float v208 = v4 * v205; + float v215 = v4 * v212; + float v222 = v4 * v219; + float v229 = v4 * v226; + float v236 = v4 * v233; 
+ float v243 = v4 * v240; + float v250 = v4 * v247; /* last of the ten dir-scaled constants */ + int64_t v308 = v2 * 10; + int64_t v315 = v2 * 9; + int64_t v322 = v2 * 8; + int64_t v329 = v2 * 7; + int64_t v336 = v2 * 6; + int64_t v343 = v2 * 5; + int64_t v350 = v2 * 4; + int64_t v357 = v2 * 3; + int64_t v364 = v2 * 2; + const float32x2_t *v474 = &v5[0]; + svfloat32_t v478 = svdup_n_f32(v134); + svfloat32_t v480 = svdup_n_f32(v146); + svfloat32_t v481 = svdup_n_f32(v151); + svfloat32_t v482 = svdup_n_f32(v156); + svfloat32_t v483 = svdup_n_f32(v161); + svfloat32_t v484 = svdup_n_f32(v166); + svfloat32_t v485 = svdup_n_f32(v171); + svfloat32_t v486 = svdup_n_f32(v176); + svfloat32_t v487 = svdup_n_f32(v181); + svfloat32_t v488 = svdup_n_f32(v186); + float32x2_t *v505 = &v6[0]; + svfloat32_t v599 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v383)[0])); + const float32x2_t *v392 = &v5[v22]; + const float32x2_t *v401 = &v5[v30]; + const float32x2_t *v410 = &v5[v37]; + const float32x2_t *v419 = &v5[v45]; + const float32x2_t *v428 = &v5[v52]; + const float32x2_t *v437 = &v5[v60]; + const float32x2_t *v446 = &v5[v67]; + const float32x2_t *v455 = &v5[v75]; + const float32x2_t *v464 = &v5[v82]; + svfloat32_t v479 = svdup_n_f32(v142); + svfloat32_t v489 = svdup_n_f32(v194); + svfloat32_t v490 = svdup_n_f32(v201); + svfloat32_t v491 = svdup_n_f32(v208); + svfloat32_t v492 = svdup_n_f32(v215); + svfloat32_t v493 = svdup_n_f32(v222); + svfloat32_t v494 = svdup_n_f32(v229); + svfloat32_t v495 = svdup_n_f32(v236); + svfloat32_t v496 = svdup_n_f32(v243); + svfloat32_t v497 = svdup_n_f32(v250); + float32x2_t *v514 = &v6[v308]; + float32x2_t *v523 = &v6[v315]; + float32x2_t *v532 = &v6[v322]; + float32x2_t *v541 = &v6[v329]; + float32x2_t *v550 = &v6[v336]; + float32x2_t *v559 = &v6[v343]; + float32x2_t *v568 = &v6[v350]; + float32x2_t *v577 = &v6[v357]; + float32x2_t *v586 = &v6[v364]; + svfloat32_t v619 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v474)[0])); + svfloat32_t v601 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v392)[0])); + svfloat32_t v603 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v401)[0])); + svfloat32_t v605 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v410)[0])); + svfloat32_t v607 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v419)[0])); + svfloat32_t v609 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v428)[0])); + svfloat32_t v611 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v437)[0])); + svfloat32_t v613 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v446)[0])); + svfloat32_t v615 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v455)[0])); + svfloat32_t v617 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v464)[0])); /* all eleven inputs are now loaded; pairwise sums and differences follow */ + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v599, v601); + svfloat32_t v43 = svadd_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v58 = svadd_f32_x(svptrue_b32(), v607, v609); + svfloat32_t v73 = svadd_f32_x(svptrue_b32(), v611, v613); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v615, v617); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v599, v601); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v607, v609); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v611, v613); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v615, v617); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v28, v43); + svfloat32_t v95 = svadd_f32_x(svptrue_b32(), v58, v88); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v90, v91); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v89, v93); + svfloat32_t v110 = svsub_f32_x(svptrue_b32(), v43, v73); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v28, v73); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v43, v28); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v88, v73); + svfloat32_t v114 = svsub_f32_x(svptrue_b32(), v58, v73); + svfloat32_t v115 = 
svsub_f32_x(svptrue_b32(), v88, v58); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v43, v88); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v28, v58); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v90, v92); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v89, v92); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v89, v90); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v92, v93); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v91, v92); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v91, v93); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v90, v93); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v89, v91); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v73, v94); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v97, v98); + svfloat32_t v118 = svsub_f32_x(svptrue_b32(), v95, v94); + svfloat32_t v127 = svadd_f32_x(svptrue_b32(), v97, v98); + svfloat32_t v154 = svmul_f32_x(svptrue_b32(), v111, v481); + svfloat32_t v159 = svmul_f32_x(svptrue_b32(), v112, v482); + svfloat32_t v169 = svmul_f32_x(svptrue_b32(), v114, v484); + svfloat32_t v174 = svmul_f32_x(svptrue_b32(), v115, v485); + svfloat32_t zero196 = svdup_n_f32(0); + svfloat32_t v196 = svcmla_f32_x(pred_full, zero196, v489, v119, 90); + svfloat32_t zero210 = svdup_n_f32(0); + svfloat32_t v210 = svcmla_f32_x(pred_full, zero210, v491, v121, 90); + svfloat32_t zero217 = svdup_n_f32(0); + svfloat32_t v217 = svcmla_f32_x(pred_full, zero217, v492, v122, 90); + svfloat32_t zero231 = svdup_n_f32(0); + svfloat32_t v231 = svcmla_f32_x(pred_full, zero231, v494, v124, 90); + svfloat32_t zero238 = svdup_n_f32(0); + svfloat32_t v238 = svcmla_f32_x(pred_full, zero238, v495, v125, 90); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v96, v95); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v108, v92); + svfloat32_t v189 = svmul_f32_x(svptrue_b32(), v118, v488); + svfloat32_t zero252 = svdup_n_f32(0); + svfloat32_t v252 = svcmla_f32_x(pred_full, zero252, v497, v127, 90); + svfloat32_t v254 = svmla_f32_x(pred_full, 
v154, v110, v480); + svfloat32_t v255 = svmla_f32_x(pred_full, v159, v111, v481); + svfloat32_t v256 = svnmls_f32_x(pred_full, v159, v110, v480); + svfloat32_t v257 = svmla_f32_x(pred_full, v169, v113, v483); + svfloat32_t v258 = svmla_f32_x(pred_full, v174, v114, v484); + svfloat32_t v259 = svnmls_f32_x(pred_full, v174, v113, v483); + svfloat32_t v262 = svcmla_f32_x(pred_full, v210, v490, v120, 90); + svfloat32_t v263 = svsub_f32_x(svptrue_b32(), v196, v210); + svfloat32_t v264 = svcmla_f32_x(pred_full, v231, v493, v123, 90); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v217, v231); + svfloat32_t v107 = svadd_f32_x(svptrue_b32(), v619, v99); + svfloat32_t zero144 = svdup_n_f32(0); + svfloat32_t v144 = svcmla_f32_x(pred_full, zero144, v479, v109, 90); + svfloat32_t v260 = svmla_f32_x(pred_full, v189, v117, v487); + svfloat32_t v261 = svmla_f32_x(pred_full, v189, v116, v486); + svfloat32_t v266 = svcmla_f32_x(pred_full, v252, v496, v126, 90); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v238, v252); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v262, v263); + svfloat32_t v253 = svmls_f32_x(pred_full, v107, v99, v478); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v258, v260); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v144, v264); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v266, v262); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v144, v267); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v267, v263); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v286, v264); + svst1_f64(pred_full, (double *)(v505), svreinterpret_f64_f32(v107)); /* output 0 is the sum of all eleven inputs */ + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v268, v253); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v253, v255); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v253, v259); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v253, v256); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v253, v254); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v278, v266); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v280, 
v144); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v265); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v284, v144); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v265); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v270, v260); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v272, v261); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v274, v261); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v276, v257); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v288, v144); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v269, v279); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v269, v279); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v277, v289); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v271, v281); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v271, v281); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v277, v289); + svst1_f64(pred_full, (double *)(v523), svreinterpret_f64_f32(v291)); + svst1_f64(pred_full, (double *)(v586), svreinterpret_f64_f32(v298)); + svst1_f64(pred_full, (double *)(v514), svreinterpret_f64_f32(v290)); + svst1_f64(pred_full, (double *)(v532), svreinterpret_f64_f32(v292)); + svst1_f64(pred_full, (double *)(v541), svreinterpret_f64_f32(v293)); + svst1_f64(pred_full, (double *)(v550), svreinterpret_f64_f32(v294)); + svst1_f64(pred_full, (double *)(v559), svreinterpret_f64_f32(v295)); + svst1_f64(pred_full, (double *)(v568), svreinterpret_f64_f32(v296)); + svst1_f64(pred_full, (double *)(v577), svreinterpret_f64_f32(v297)); + svst1_f64(pred_full, (double *)(v595), svreinterpret_f64_f32(v299)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Non-SVE (float32x2_t) build of the uun12 kernel. Reads twelve complex float inputs from x at element offsets k * istride (k = 0..11) and writes twelve outputs to y at offsets k * ostride. The sign-paired constants are multiplied by dir and then applied to lane-swapped (vrev64_f32) operands; howmany is not referenced in this body. NOTE(review): name and access pattern suggest a length-12 DFT butterfly with dir selecting the transform direction -- confirm against the kernel generator. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun12(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int 
howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v99 = 1.0000000000000000e+00F; + float v100 = -1.0000000000000000e+00F; + float v126 = -1.4999999999999998e+00F; + float v127 = 1.4999999999999998e+00F; + float v155 = 8.6602540378443871e-01F; /* numerically sqrt(3)/2 */ + float v163 = -8.6602540378443871e-01F; + float32x2_t v25 = v5[0]; + float32x2_t v67 = v5[istride]; + float32x2_t v101 = (float32x2_t){v99, v100}; + float32x2_t v124 = (float32x2_t){v126, v126}; + float32x2_t v128 = (float32x2_t){v126, v127}; + float32x2_t v157 = (float32x2_t){v155, v163}; + float32x2_t v158 = (float32x2_t){v4, v4}; /* dir in both lanes; multiplies the three sign-paired constants below */ + float32x2_t v164 = (float32x2_t){v163, v163}; + float32x2_t v13 = v5[istride * 4]; + float32x2_t v18 = v5[istride * 8]; + float32x2_t v31 = v5[istride * 7]; + float32x2_t v36 = v5[istride * 11]; + float32x2_t v43 = v5[istride * 3]; + float32x2_t v49 = v5[istride * 10]; + float32x2_t v54 = v5[istride * 2]; + float32x2_t v61 = v5[istride * 6]; + float32x2_t v72 = v5[istride * 5]; + float32x2_t v79 = v5[istride * 9]; + float32x2_t v103 = vmul_f32(v158, v101); + float32x2_t v130 = vmul_f32(v158, v128); + float32x2_t v159 = vmul_f32(v158, v157); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v37 = vadd_f32(v31, v36); + float32x2_t v38 = vsub_f32(v31, v36); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v73 = vadd_f32(v67, v72); + float32x2_t v74 = vsub_f32(v67, v72); + float32x2_t v26 = vadd_f32(v19, v25); + float32x2_t v44 = vadd_f32(v37, v43); + float32x2_t v62 = vadd_f32(v55, v61); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v108 = vadd_f32(v19, v55); + float32x2_t v109 = vsub_f32(v19, v55); + float32x2_t v110 = vadd_f32(v37, v73); + float32x2_t v111 = vsub_f32(v37, v73); + float32x2_t v135 = vadd_f32(v20, v56); + float32x2_t v136 = vsub_f32(v20, v56); + float32x2_t v137 = vadd_f32(v38, v74); + float32x2_t 
v138 = vsub_f32(v38, v74); + float32x2_t v81 = vadd_f32(v26, v62); + float32x2_t v82 = vsub_f32(v26, v62); + float32x2_t v83 = vadd_f32(v44, v80); + float32x2_t v84 = vsub_f32(v44, v80); + float32x2_t v112 = vadd_f32(v108, v110); + float32x2_t v113 = vsub_f32(v108, v110); + float32x2_t v125 = vmul_f32(v109, v124); + float32x2_t v131 = vrev64_f32(v111); + float32x2_t v139 = vadd_f32(v135, v137); + float32x2_t v140 = vsub_f32(v135, v137); + float32x2_t v160 = vrev64_f32(v136); + float32x2_t v165 = vmul_f32(v138, v164); + float32x2_t v85 = vadd_f32(v81, v83); + float32x2_t v86 = vsub_f32(v81, v83); + float32x2_t v104 = vrev64_f32(v84); + float32x2_t v117 = vmul_f32(v112, v124); + float32x2_t v121 = vmul_f32(v113, v124); + float32x2_t v132 = vmul_f32(v131, v130); + float32x2_t v146 = vrev64_f32(v139); + float32x2_t v153 = vrev64_f32(v140); + float32x2_t v161 = vmul_f32(v160, v159); + float32x2_t v105 = vmul_f32(v104, v103); + float32x2_t v133 = vadd_f32(v125, v132); + float32x2_t v134 = vsub_f32(v125, v132); + float32x2_t v147 = vmul_f32(v146, v159); + float32x2_t v154 = vmul_f32(v153, v159); + float32x2_t v166 = vadd_f32(v161, v165); + float32x2_t v167 = vsub_f32(v161, v165); + float32x2_t v168 = vadd_f32(v85, v117); + v6[0] = v85; /* output 0 is the sum of all twelve inputs */ + float32x2_t v204 = vadd_f32(v86, v121); + v6[ostride * 6] = v86; + float32x2_t v106 = vadd_f32(v82, v105); + float32x2_t v107 = vsub_f32(v82, v105); + float32x2_t v169 = vadd_f32(v168, v147); + float32x2_t v170 = vsub_f32(v168, v147); + float32x2_t v205 = vadd_f32(v204, v154); + float32x2_t v206 = vsub_f32(v204, v154); + v6[ostride * 4] = v170; + v6[ostride * 8] = v169; + float32x2_t v186 = vadd_f32(v107, v134); + v6[ostride * 9] = v107; + v6[ostride * 10] = v206; + v6[ostride * 2] = v205; + float32x2_t v222 = vadd_f32(v106, v133); + v6[ostride * 3] = v106; + float32x2_t v187 = vadd_f32(v186, v167); + float32x2_t v188 = vsub_f32(v186, v167); + float32x2_t v223 = vadd_f32(v222, v166); + float32x2_t v224 = vsub_f32(v222, v166); + 
v6[ostride] = v188; + v6[ostride * 5] = v187; + v6[ostride * 7] = v224; + v6[ostride * 11] = v223; +} +#endif + +#ifdef ARMRAL_ARCH_SVE /* SVE build of the uun12 kernel. Reads twelve complex float inputs from x at element offsets k * istride (k = 0..11) and writes twelve outputs to y at offsets k * ostride. Three constants are multiplied by dir and used only as svcmla (rotation 90) multipliers; the other two broadcast constants are used in multiply-accumulate steps. howmany is not referenced in this body. NOTE(review): name and access pattern suggest a length-12 DFT butterfly with dir selecting the transform direction -- confirm against the kernel generator. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun12(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); /* predicate used by every svld1_f64 / svst1_f64 below; presumably limits each access to one complex value (two floats) -- TODO confirm */ + float v132 = -1.0000000000000000e+00F; + float v157 = -1.4999999999999998e+00F; + float v162 = 1.4999999999999998e+00F; + float v198 = -8.6602540378443871e-01F; /* numerically -sqrt(3)/2 */ + const float32x2_t *v388 = &v5[v0]; + float32x2_t *v464 = &v6[v2]; + int64_t v15 = v0 * 4; + int64_t v22 = v0 * 8; + int64_t v39 = v0 * 7; + int64_t v46 = v0 * 11; + int64_t v55 = v0 * 3; + int64_t v63 = v0 * 10; + int64_t v70 = v0 * 2; + int64_t v79 = v0 * 6; + int64_t v94 = v0 * 5; + int64_t v103 = v0 * 9; + float v135 = v4 * v132; + float v165 = v4 * v162; + float v194 = v4 * v198; /* the three dir-scaled constants are broadcast below and used only as svcmla (rotation 90) multipliers */ + int64_t v215 = v2 * 4; + int64_t v222 = v2 * 8; + int64_t v232 = v2 * 9; + int64_t v246 = v2 * 5; + int64_t v256 = v2 * 6; + int64_t v263 = v2 * 10; + int64_t v270 = v2 * 2; + int64_t v280 = v2 * 3; + int64_t v287 = v2 * 7; + int64_t v294 = v2 * 11; + const float32x2_t *v325 = &v5[0]; + svfloat32_t v415 = svdup_n_f32(v157); + svfloat32_t v420 = svdup_n_f32(v198); + float32x2_t *v428 = &v6[0]; + svfloat32_t v549 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v388)[0])); + const float32x2_t *v306 = &v5[v15]; + const float32x2_t *v315 = &v5[v22]; + const float32x2_t *v334 = &v5[v39]; + const float32x2_t *v343 = &v5[v46]; + const float32x2_t *v352 = &v5[v55]; + const float32x2_t *v361 = &v5[v63]; + const float32x2_t *v370 = &v5[v70]; + const float32x2_t *v379 = &v5[v79]; + const float32x2_t *v397 = &v5[v94]; + const float32x2_t *v406 = &v5[v103]; + svfloat32_t v412 = svdup_n_f32(v135); + svfloat32_t v416 = 
svdup_n_f32(v165); + svfloat32_t v419 = svdup_n_f32(v194); + float32x2_t *v437 = &v6[v215]; + float32x2_t *v446 = &v6[v222]; + float32x2_t *v455 = &v6[v232]; + float32x2_t *v473 = &v6[v246]; + float32x2_t *v482 = &v6[v256]; + float32x2_t *v491 = &v6[v263]; + float32x2_t *v500 = &v6[v270]; + float32x2_t *v509 = &v6[v280]; + float32x2_t *v518 = &v6[v287]; + float32x2_t *v527 = &v6[v294]; + svfloat32_t v535 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v325)[0])); + svfloat32_t v531 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v306)[0])); + svfloat32_t v533 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v315)[0])); + svfloat32_t v537 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v334)[0])); + svfloat32_t v539 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v343)[0])); + svfloat32_t v541 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v352)[0])); + svfloat32_t v543 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v361)[0])); + svfloat32_t v545 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v370)[0])); + svfloat32_t v547 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v379)[0])); + svfloat32_t v551 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v397)[0])); + svfloat32_t v553 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v406)[0])); /* all twelve inputs are now loaded; pairwise sums and differences follow */ + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v531, v533); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v531, v533); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v537, v539); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v537, v539); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v543, v545); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v543, v545); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v549, v551); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v549, v551); + svfloat32_t v37 = svadd_f32_x(svptrue_b32(), v28, v535); + svfloat32_t 
v61 = svadd_f32_x(svptrue_b32(), v52, v541); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v547); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v100, v553); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v28, v76); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v28, v76); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v52, v100); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v52, v100); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v29, v77); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v29, v77); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v53, v101); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v53, v101); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v37, v85); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v37, v85); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v61, v109); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v61, v109); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v140, v142); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v140, v142); + svfloat32_t zero167 = svdup_n_f32(0); + svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v416, v143, 90); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v170, v172); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v170, v172); + svfloat32_t zero196 = svdup_n_f32(0); + svfloat32_t v196 = svcmla_f32_x(pred_full, zero196, v419, v171, 90); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v110, v112); + svfloat32_t v115 = svsub_f32_x(svptrue_b32(), v110, v112); + svfloat32_t zero137 = svdup_n_f32(0); + svfloat32_t v137 = svcmla_f32_x(pred_full, zero137, v412, v113, 90); + svfloat32_t v168 = svmla_f32_x(pred_full, v167, v141, v415); + svfloat32_t v169 = svnmls_f32_x(pred_full, v167, v141, v415); + svfloat32_t zero182 = svdup_n_f32(0); + svfloat32_t v182 = svcmla_f32_x(pred_full, zero182, v419, v174, 90); + svfloat32_t zero189 = svdup_n_f32(0); + svfloat32_t v189 = svcmla_f32_x(pred_full, zero189, v419, v175, 90); + svfloat32_t v202 = svmla_f32_x(pred_full, v196, v173, v420); + 
svfloat32_t v203 = svmls_f32_x(pred_full, v196, v173, v420); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v111, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v111, v137); + svfloat32_t v204 = svmla_f32_x(pred_full, v114, v144, v415); + svfloat32_t v252 = svmla_f32_x(pred_full, v115, v145, v415); + svst1_f64(pred_full, (double *)(v428), svreinterpret_f64_f32(v114)); /* output 0 is the sum of all twelve inputs */ + svst1_f64(pred_full, (double *)(v482), svreinterpret_f64_f32(v115)); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v204, v182); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v204, v182); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v139, v169); + svfloat32_t v253 = svadd_f32_x(svptrue_b32(), v252, v189); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v252, v189); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v138, v168); + svst1_f64(pred_full, (double *)(v455), svreinterpret_f64_f32(v139)); + svst1_f64(pred_full, (double *)(v509), svreinterpret_f64_f32(v138)); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v228, v203); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v228, v203); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v276, v202); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v276, v202); + svst1_f64(pred_full, (double *)(v437), svreinterpret_f64_f32(v206)); + svst1_f64(pred_full, (double *)(v446), svreinterpret_f64_f32(v205)); + svst1_f64(pred_full, (double *)(v491), svreinterpret_f64_f32(v254)); + svst1_f64(pred_full, (double *)(v500), svreinterpret_f64_f32(v253)); + svst1_f64(pred_full, (double *)(v464), svreinterpret_f64_f32(v230)); + svst1_f64(pred_full, (double *)(v473), svreinterpret_f64_f32(v229)); + svst1_f64(pred_full, (double *)(v518), svreinterpret_f64_f32(v278)); + svst1_f64(pred_full, (double *)(v527), svreinterpret_f64_f32(v277)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun13(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + 
const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v122 = 1.0833333333333333e+00F; + float v126 = -3.0046260628866578e-01F; + float v129 = 7.4927933062613905e-01F; + float v130 = -7.4927933062613905e-01F; + float v136 = 4.0100212832186721e-01F; + float v137 = -4.0100212832186721e-01F; + float v143 = 5.7514072947400308e-01F; + float v144 = -5.7514072947400308e-01F; + float v151 = 5.2422663952658211e-01F; + float v155 = 5.1652078062348972e-01F; + float v159 = 7.7058589030924258e-03F; + float v163 = 4.2763404682656941e-01F; + float v167 = 1.5180597207438440e-01F; + float v171 = 5.7944001890096386e-01F; + float v174 = 1.1543953381323635e+00F; + float v175 = -1.1543953381323635e+00F; + float v181 = 9.0655220171271012e-01F; + float v182 = -9.0655220171271012e-01F; + float v188 = 8.1857027294591811e-01F; + float v189 = -8.1857027294591811e-01F; + float v195 = 1.1971367726043427e+00F; + float v196 = -1.1971367726043427e+00F; + float v202 = 8.6131170741789742e-01F; + float v203 = -8.6131170741789742e-01F; + float v209 = 1.1091548438375507e+00F; + float v210 = -1.1091548438375507e+00F; + float v216 = 4.2741434471979367e-02F; + float v217 = -4.2741434471979367e-02F; + float v223 = -4.5240494294812715e-02F; + float v224 = 4.5240494294812715e-02F; + float v230 = 2.9058457089163264e-01F; + float v231 = -2.9058457089163264e-01F; + float32x2_t v13 = v5[istride]; + float32x2_t v108 = v5[0]; + float32x2_t v123 = (float32x2_t){v122, v122}; + float32x2_t v127 = (float32x2_t){v126, v126}; + float32x2_t v131 = (float32x2_t){v129, v130}; + float32x2_t v138 = (float32x2_t){v136, v137}; + float32x2_t v145 = (float32x2_t){v143, v144}; + float32x2_t v152 = (float32x2_t){v151, v151}; + float32x2_t v156 = (float32x2_t){v155, v155}; + float32x2_t v160 = (float32x2_t){v159, v159}; + float32x2_t v164 = (float32x2_t){v163, v163}; + float32x2_t v168 = (float32x2_t){v167, v167}; + float32x2_t v172 = (float32x2_t){v171, v171}; + float32x2_t v176 = 
(float32x2_t){v174, v175}; + float32x2_t v183 = (float32x2_t){v181, v182}; + float32x2_t v190 = (float32x2_t){v188, v189}; + float32x2_t v197 = (float32x2_t){v195, v196}; + float32x2_t v204 = (float32x2_t){v202, v203}; + float32x2_t v211 = (float32x2_t){v209, v210}; + float32x2_t v218 = (float32x2_t){v216, v217}; + float32x2_t v225 = (float32x2_t){v223, v224}; + float32x2_t v232 = (float32x2_t){v230, v231}; + float32x2_t v233 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 12]; + float32x2_t v24 = v5[istride * 2]; + float32x2_t v29 = v5[istride * 11]; + float32x2_t v35 = v5[istride * 3]; + float32x2_t v40 = v5[istride * 10]; + float32x2_t v46 = v5[istride * 4]; + float32x2_t v51 = v5[istride * 9]; + float32x2_t v57 = v5[istride * 5]; + float32x2_t v62 = v5[istride * 8]; + float32x2_t v68 = v5[istride * 6]; + float32x2_t v73 = v5[istride * 7]; + float32x2_t v133 = vmul_f32(v233, v131); + float32x2_t v140 = vmul_f32(v233, v138); + float32x2_t v147 = vmul_f32(v233, v145); + float32x2_t v178 = vmul_f32(v233, v176); + float32x2_t v185 = vmul_f32(v233, v183); + float32x2_t v192 = vmul_f32(v233, v190); + float32x2_t v199 = vmul_f32(v233, v197); + float32x2_t v206 = vmul_f32(v233, v204); + float32x2_t v213 = vmul_f32(v233, v211); + float32x2_t v220 = vmul_f32(v233, v218); + float32x2_t v227 = vmul_f32(v233, v225); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v30 = vadd_f32(v24, v29); + float32x2_t v41 = vadd_f32(v35, v40); + float32x2_t v52 = vadd_f32(v46, v51); + float32x2_t v63 = vadd_f32(v57, v62); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v13, v18); + float32x2_t v76 = vsub_f32(v24, v29); + float32x2_t v77 = vsub_f32(v35, v40); + float32x2_t v78 = vsub_f32(v46, v51); + float32x2_t v79 = vsub_f32(v57, v62); + float32x2_t v80 = vsub_f32(v68, v73); + float32x2_t v81 = vadd_f32(v30, v63); + float32x2_t v83 = vadd_f32(v19, v41); + float32x2_t v86 = vadd_f32(v76, v79); + float32x2_t v88 
= vadd_f32(v75, v77); + float32x2_t v90 = vsub_f32(v30, v74); + float32x2_t v91 = vsub_f32(v41, v52); + float32x2_t v92 = vsub_f32(v19, v52); + float32x2_t v93 = vsub_f32(v63, v74); + float32x2_t v98 = vsub_f32(v76, v80); + float32x2_t v99 = vsub_f32(v75, v77); + float32x2_t v100 = vsub_f32(v76, v79); + float32x2_t v101 = vadd_f32(v75, v78); + float32x2_t v102 = vsub_f32(v79, v80); + float32x2_t v103 = vadd_f32(v77, v78); + float32x2_t v82 = vadd_f32(v81, v74); + float32x2_t v84 = vadd_f32(v83, v52); + float32x2_t v87 = vadd_f32(v86, v80); + float32x2_t v89 = vsub_f32(v88, v78); + float32x2_t v94 = vsub_f32(v90, v91); + float32x2_t v95 = vsub_f32(v92, v93); + float32x2_t v96 = vadd_f32(v90, v91); + float32x2_t v97 = vadd_f32(v92, v93); + float32x2_t v114 = vadd_f32(v98, v99); + float32x2_t v115 = vadd_f32(v100, v101); + float32x2_t v116 = vsub_f32(v102, v103); + float32x2_t v179 = vrev64_f32(v98); + float32x2_t v186 = vrev64_f32(v99); + float32x2_t v200 = vrev64_f32(v100); + float32x2_t v207 = vrev64_f32(v101); + float32x2_t v221 = vrev64_f32(v102); + float32x2_t v228 = vrev64_f32(v103); + float32x2_t v85 = vadd_f32(v82, v84); + float32x2_t v110 = vsub_f32(v84, v82); + float32x2_t v111 = vadd_f32(v87, v89); + float32x2_t v112 = vadd_f32(v94, v95); + float32x2_t v113 = vsub_f32(v96, v97); + float32x2_t v134 = vrev64_f32(v87); + float32x2_t v141 = vrev64_f32(v89); + float32x2_t v153 = vmul_f32(v94, v152); + float32x2_t v157 = vmul_f32(v95, v156); + float32x2_t v165 = vmul_f32(v96, v164); + float32x2_t v169 = vmul_f32(v97, v168); + float32x2_t v180 = vmul_f32(v179, v178); + float32x2_t v187 = vmul_f32(v186, v185); + float32x2_t v193 = vrev64_f32(v114); + float32x2_t v201 = vmul_f32(v200, v199); + float32x2_t v208 = vmul_f32(v207, v206); + float32x2_t v214 = vrev64_f32(v115); + float32x2_t v222 = vmul_f32(v221, v220); + float32x2_t v229 = vmul_f32(v228, v227); + float32x2_t v235 = vrev64_f32(v116); + float32x2_t v109 = vadd_f32(v108, v85); + float32x2_t v124 = 
vmul_f32(v85, v123); + float32x2_t v128 = vmul_f32(v110, v127); + float32x2_t v135 = vmul_f32(v134, v133); + float32x2_t v142 = vmul_f32(v141, v140); + float32x2_t v148 = vrev64_f32(v111); + float32x2_t v161 = vmul_f32(v112, v160); + float32x2_t v173 = vmul_f32(v113, v172); + float32x2_t v194 = vmul_f32(v193, v192); + float32x2_t v215 = vmul_f32(v214, v213); + float32x2_t v236 = vmul_f32(v235, v234); + float32x2_t v238 = vadd_f32(v157, v153); + float32x2_t v149 = vmul_f32(v148, v147); + float32x2_t v237 = vsub_f32(v109, v124); + float32x2_t v239 = vsub_f32(v238, v128); + float32x2_t v240 = vadd_f32(v157, v161); + float32x2_t v242 = vsub_f32(v161, v153); + float32x2_t v250 = vsub_f32(v180, v194); + float32x2_t v251 = vsub_f32(v187, v194); + float32x2_t v252 = vsub_f32(v201, v215); + float32x2_t v253 = vsub_f32(v208, v215); + float32x2_t v254 = vsub_f32(v222, v236); + float32x2_t v255 = vadd_f32(v229, v236); + v6[0] = v109; + float32x2_t v241 = vadd_f32(v240, v128); + float32x2_t v243 = vsub_f32(v242, v128); + float32x2_t v244 = vadd_f32(v237, v165); + float32x2_t v246 = vsub_f32(v237, v169); + float32x2_t v248 = vsub_f32(v237, v165); + float32x2_t v256 = vsub_f32(v135, v149); + float32x2_t v257 = vsub_f32(v142, v149); + float32x2_t v268 = vadd_f32(v250, v254); + float32x2_t v270 = vadd_f32(v252, v254); + float32x2_t v272 = vsub_f32(v251, v255); + float32x2_t v245 = vadd_f32(v244, v169); + float32x2_t v247 = vsub_f32(v246, v173); + float32x2_t v249 = vadd_f32(v248, v173); + float32x2_t v264 = vsub_f32(v257, v250); + float32x2_t v266 = vsub_f32(v255, v256); + float32x2_t v269 = vadd_f32(v268, v257); + float32x2_t v271 = vsub_f32(v270, v257); + float32x2_t v273 = vsub_f32(v272, v256); + float32x2_t v274 = vadd_f32(v256, v251); + float32x2_t v258 = vadd_f32(v239, v245); + float32x2_t v259 = vadd_f32(v241, v247); + float32x2_t v260 = vsub_f32(v247, v241); + float32x2_t v261 = vadd_f32(v243, v249); + float32x2_t v262 = vsub_f32(v245, v239); + float32x2_t v263 = 
vsub_f32(v249, v243); + float32x2_t v265 = vadd_f32(v264, v252); + float32x2_t v267 = vsub_f32(v266, v253); + float32x2_t v275 = vsub_f32(v274, v253); + float32x2_t v276 = vsub_f32(v258, v265); + float32x2_t v277 = vadd_f32(v259, v267); + float32x2_t v278 = vsub_f32(v260, v269); + float32x2_t v279 = vsub_f32(v261, v271); + float32x2_t v280 = vadd_f32(v262, v273); + float32x2_t v281 = vsub_f32(v263, v275); + float32x2_t v282 = vadd_f32(v263, v275); + float32x2_t v283 = vsub_f32(v262, v273); + float32x2_t v284 = vadd_f32(v261, v271); + float32x2_t v285 = vadd_f32(v260, v269); + float32x2_t v286 = vsub_f32(v259, v267); + float32x2_t v287 = vadd_f32(v258, v265); + v6[ostride * 12] = v276; + v6[ostride * 11] = v277; + v6[ostride * 10] = v278; + v6[ostride * 9] = v279; + v6[ostride * 8] = v280; + v6[ostride * 7] = v281; + v6[ostride * 6] = v282; + v6[ostride * 5] = v283; + v6[ostride * 4] = v284; + v6[ostride * 3] = v285; + v6[ostride * 2] = v286; + v6[ostride] = v287; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun13(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v154 = 1.0833333333333333e+00F; + float v159 = -3.0046260628866578e-01F; + float v164 = -7.4927933062613905e-01F; + float v171 = -4.0100212832186721e-01F; + float v178 = -5.7514072947400308e-01F; + float v185 = 5.2422663952658211e-01F; + float v190 = 5.1652078062348972e-01F; + float v195 = 7.7058589030924258e-03F; + float v200 = 4.2763404682656941e-01F; + float v205 = 1.5180597207438440e-01F; + float v210 = 5.7944001890096386e-01F; + float v215 = -1.1543953381323635e+00F; + float v222 = -9.0655220171271012e-01F; + float v229 = -8.1857027294591811e-01F; + float v236 = -1.1971367726043427e+00F; + 
float v243 = -8.6131170741789742e-01F; + float v250 = -1.1091548438375507e+00F; + float v257 = -4.2741434471979367e-02F; + float v264 = 4.5240494294812715e-02F; + float v271 = -2.9058457089163264e-01F; + const float32x2_t *v425 = &v5[v0]; + float32x2_t *v673 = &v6[v2]; + int64_t v22 = v0 * 12; + int64_t v30 = v0 * 2; + int64_t v37 = v0 * 11; + int64_t v45 = v0 * 3; + int64_t v52 = v0 * 10; + int64_t v60 = v0 * 4; + int64_t v67 = v0 * 9; + int64_t v75 = v0 * 5; + int64_t v82 = v0 * 8; + int64_t v90 = v0 * 6; + int64_t v97 = v0 * 7; + float v167 = v4 * v164; + float v174 = v4 * v171; + float v181 = v4 * v178; + float v218 = v4 * v215; + float v225 = v4 * v222; + float v232 = v4 * v229; + float v239 = v4 * v236; + float v246 = v4 * v243; + float v253 = v4 * v250; + float v260 = v4 * v257; + float v267 = v4 * v264; + float v274 = v4 * v271; + int64_t v336 = v2 * 12; + int64_t v343 = v2 * 11; + int64_t v350 = v2 * 10; + int64_t v357 = v2 * 9; + int64_t v364 = v2 * 8; + int64_t v371 = v2 * 7; + int64_t v378 = v2 * 6; + int64_t v385 = v2 * 5; + int64_t v392 = v2 * 4; + int64_t v399 = v2 * 3; + int64_t v406 = v2 * 2; + const float32x2_t *v534 = &v5[0]; + svfloat32_t v538 = svdup_n_f32(v154); + svfloat32_t v539 = svdup_n_f32(v159); + svfloat32_t v543 = svdup_n_f32(v185); + svfloat32_t v544 = svdup_n_f32(v190); + svfloat32_t v545 = svdup_n_f32(v195); + svfloat32_t v546 = svdup_n_f32(v200); + svfloat32_t v547 = svdup_n_f32(v205); + svfloat32_t v548 = svdup_n_f32(v210); + float32x2_t *v565 = &v6[0]; + svfloat32_t v677 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v425)[0])); + const float32x2_t *v434 = &v5[v22]; + const float32x2_t *v443 = &v5[v30]; + const float32x2_t *v452 = &v5[v37]; + const float32x2_t *v461 = &v5[v45]; + const float32x2_t *v470 = &v5[v52]; + const float32x2_t *v479 = &v5[v60]; + const float32x2_t *v488 = &v5[v67]; + const float32x2_t *v497 = &v5[v75]; + const float32x2_t *v506 = &v5[v82]; + const float32x2_t *v515 = &v5[v90]; + const 
float32x2_t *v524 = &v5[v97]; + svfloat32_t v540 = svdup_n_f32(v167); + svfloat32_t v541 = svdup_n_f32(v174); + svfloat32_t v542 = svdup_n_f32(v181); + svfloat32_t v549 = svdup_n_f32(v218); + svfloat32_t v550 = svdup_n_f32(v225); + svfloat32_t v551 = svdup_n_f32(v232); + svfloat32_t v552 = svdup_n_f32(v239); + svfloat32_t v553 = svdup_n_f32(v246); + svfloat32_t v554 = svdup_n_f32(v253); + svfloat32_t v555 = svdup_n_f32(v260); + svfloat32_t v556 = svdup_n_f32(v267); + svfloat32_t v557 = svdup_n_f32(v274); + float32x2_t *v574 = &v6[v336]; + float32x2_t *v583 = &v6[v343]; + float32x2_t *v592 = &v6[v350]; + float32x2_t *v601 = &v6[v357]; + float32x2_t *v610 = &v6[v364]; + float32x2_t *v619 = &v6[v371]; + float32x2_t *v628 = &v6[v378]; + float32x2_t *v637 = &v6[v385]; + float32x2_t *v646 = &v6[v392]; + float32x2_t *v655 = &v6[v399]; + float32x2_t *v664 = &v6[v406]; + svfloat32_t v701 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v534)[0])); + svfloat32_t v679 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v434)[0])); + svfloat32_t v681 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v443)[0])); + svfloat32_t v683 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v452)[0])); + svfloat32_t v685 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v461)[0])); + svfloat32_t v687 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v470)[0])); + svfloat32_t v689 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v479)[0])); + svfloat32_t v691 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v488)[0])); + svfloat32_t v693 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v497)[0])); + svfloat32_t v695 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v506)[0])); + svfloat32_t v697 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v515)[0])); + svfloat32_t v699 = + svreinterpret_f32_f64(svld1_f64(pred_full, 
&((const double *)v524)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v43 = svadd_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v58 = svadd_f32_x(svptrue_b32(), v685, v687); + svfloat32_t v73 = svadd_f32_x(svptrue_b32(), v689, v691); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v693, v695); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v685, v687); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v689, v691); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v693, v695); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v43, v88); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v28, v58); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v105, v108); + svfloat32_t v117 = svadd_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v43, v103); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v58, v73); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v28, v73); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v88, v103); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v105, v109); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v105, v108); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v104, v107); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v108, v109); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v106, v107); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v110, v103); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v112, v73); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v115, v109); + svfloat32_t v118 = svsub_f32_x(svptrue_b32(), v117, v107); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v119, v120); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v121, v122); + svfloat32_t v125 = 
svadd_f32_x(svptrue_b32(), v119, v120); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v121, v122); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v127, v128); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v129, v130); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v131, v132); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v549, v127, 90); + svfloat32_t zero227 = svdup_n_f32(0); + svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v550, v128, 90); + svfloat32_t zero241 = svdup_n_f32(0); + svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v552, v129, 90); + svfloat32_t zero248 = svdup_n_f32(0); + svfloat32_t v248 = svcmla_f32_x(pred_full, zero248, v553, v130, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v555, v131, 90); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v111, v113); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v113, v111); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v116, v118); + svfloat32_t v143 = svadd_f32_x(svptrue_b32(), v123, v124); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v125, v126); + svfloat32_t zero169 = svdup_n_f32(0); + svfloat32_t v169 = svcmla_f32_x(pred_full, zero169, v540, v116, 90); + svfloat32_t zero176 = svdup_n_f32(0); + svfloat32_t v176 = svcmla_f32_x(pred_full, zero176, v541, v118, 90); + svfloat32_t v188 = svmul_f32_x(svptrue_b32(), v123, v543); + svfloat32_t zero234 = svdup_n_f32(0); + svfloat32_t v234 = svcmla_f32_x(pred_full, zero234, v551, v145, 90); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v554, v146, 90); + svfloat32_t zero276 = svdup_n_f32(0); + svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v557, v147, 90); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v701, v114); + svfloat32_t zero183 = svdup_n_f32(0); + svfloat32_t v183 = svcmla_f32_x(pred_full, zero183, v542, v142, 90); + svfloat32_t v198 = svmul_f32_x(svptrue_b32(), v143, v545); + 
svfloat32_t v278 = svmla_f32_x(pred_full, v188, v124, v544); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v220, v234); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v227, v234); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v241, v255); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v262, v276); + svfloat32_t v295 = svcmla_f32_x(pred_full, v276, v556, v132, 90); + svfloat32_t v277 = svmls_f32_x(pred_full, v140, v114, v538); + svfloat32_t v279 = svmls_f32_x(pred_full, v278, v141, v539); + svfloat32_t v280 = svmla_f32_x(pred_full, v198, v124, v544); + svfloat32_t v282 = svnmls_f32_x(pred_full, v188, v143, v545); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v169, v183); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v176, v183); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v290, v294); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v292, v294); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v291, v295); + svst1_f64(pred_full, (double *)(v565), svreinterpret_f64_f32(v140)); + svfloat32_t v281 = svmla_f32_x(pred_full, v280, v141, v539); + svfloat32_t v283 = svmls_f32_x(pred_full, v282, v141, v539); + svfloat32_t v284 = svmla_f32_x(pred_full, v277, v125, v546); + svfloat32_t v286 = svmls_f32_x(pred_full, v277, v126, v547); + svfloat32_t v288 = svmls_f32_x(pred_full, v277, v125, v546); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v297, v290); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v295, v296); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v297); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v310, v297); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v312, v296); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v296, v291); + svfloat32_t v285 = svmla_f32_x(pred_full, v284, v126, v547); + svfloat32_t v287 = svmls_f32_x(pred_full, v286, v144, v548); + svfloat32_t v289 = svmla_f32_x(pred_full, v288, v144, v548); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v304, v292); + 
svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v306, v293); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v314, v293); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v281, v287); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v287, v281); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v285, v279); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v289, v283); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v298, v305); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v299, v307); + svfloat32_t v318 = svsub_f32_x(svptrue_b32(), v300, v309); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v302, v313); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v303, v315); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v303, v315); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v302, v313); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v300, v309); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v299, v307); + svfloat32_t v327 = svadd_f32_x(svptrue_b32(), v298, v305); + svst1_f64(pred_full, (double *)(v574), svreinterpret_f64_f32(v316)); + svst1_f64(pred_full, (double *)(v583), svreinterpret_f64_f32(v317)); + svst1_f64(pred_full, (double *)(v592), svreinterpret_f64_f32(v318)); + svst1_f64(pred_full, (double *)(v601), svreinterpret_f64_f32(v319)); + svst1_f64(pred_full, (double *)(v610), svreinterpret_f64_f32(v320)); + svst1_f64(pred_full, (double *)(v619), svreinterpret_f64_f32(v321)); + svst1_f64(pred_full, (double *)(v628), svreinterpret_f64_f32(v322)); + svst1_f64(pred_full, (double *)(v637), svreinterpret_f64_f32(v323)); + svst1_f64(pred_full, (double *)(v646), svreinterpret_f64_f32(v324)); + svst1_f64(pred_full, (double *)(v655), svreinterpret_f64_f32(v325)); + svst1_f64(pred_full, (double *)(v664), svreinterpret_f64_f32(v326)); 
+ svst1_f64(pred_full, (double *)(v673), svreinterpret_f64_f32(v327)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun14(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v199 = -1.1666666666666665e+00F; + float v203 = 7.9015646852540022e-01F; + float v207 = 5.5854267289647742e-02F; + float v211 = 7.3430220123575241e-01F; + float v214 = 4.4095855184409838e-01F; + float v215 = -4.4095855184409838e-01F; + float v221 = 3.4087293062393137e-01F; + float v222 = -3.4087293062393137e-01F; + float v228 = -5.3396936033772524e-01F; + float v229 = 5.3396936033772524e-01F; + float v235 = 8.7484229096165667e-01F; + float v236 = -8.7484229096165667e-01F; + float32x2_t v13 = v5[0]; + float32x2_t v66 = v5[istride]; + float32x2_t v200 = (float32x2_t){v199, v199}; + float32x2_t v204 = (float32x2_t){v203, v203}; + float32x2_t v208 = (float32x2_t){v207, v207}; + float32x2_t v212 = (float32x2_t){v211, v211}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v223 = (float32x2_t){v221, v222}; + float32x2_t v230 = (float32x2_t){v228, v229}; + float32x2_t v237 = (float32x2_t){v235, v236}; + float32x2_t v238 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 7]; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v30 = v5[istride * 9]; + float32x2_t v37 = v5[istride * 4]; + float32x2_t v42 = v5[istride * 11]; + float32x2_t v49 = v5[istride * 6]; + float32x2_t v54 = v5[istride * 13]; + float32x2_t v61 = v5[istride * 8]; + float32x2_t v73 = v5[istride * 10]; + float32x2_t v78 = v5[istride * 3]; + float32x2_t v85 = v5[istride * 12]; + float32x2_t v90 = v5[istride * 5]; + float32x2_t v218 = vmul_f32(v238, v216); + float32x2_t v225 = vmul_f32(v238, v223); + float32x2_t v232 = vmul_f32(v238, v230); + float32x2_t v239 = vmul_f32(v238, v237); + float32x2_t v19 = 
vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v67 = vadd_f32(v61, v66); + float32x2_t v68 = vsub_f32(v61, v66); + float32x2_t v79 = vadd_f32(v73, v78); + float32x2_t v80 = vsub_f32(v73, v78); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v93 = vadd_f32(v31, v91); + float32x2_t v94 = vsub_f32(v31, v91); + float32x2_t v95 = vadd_f32(v67, v55); + float32x2_t v96 = vsub_f32(v67, v55); + float32x2_t v97 = vadd_f32(v43, v79); + float32x2_t v98 = vsub_f32(v43, v79); + float32x2_t v177 = vadd_f32(v32, v92); + float32x2_t v178 = vsub_f32(v32, v92); + float32x2_t v179 = vadd_f32(v68, v56); + float32x2_t v180 = vsub_f32(v68, v56); + float32x2_t v181 = vadd_f32(v44, v80); + float32x2_t v182 = vsub_f32(v44, v80); + float32x2_t v99 = vadd_f32(v93, v95); + float32x2_t v102 = vsub_f32(v93, v95); + float32x2_t v103 = vsub_f32(v95, v97); + float32x2_t v104 = vsub_f32(v97, v93); + float32x2_t v105 = vadd_f32(v94, v96); + float32x2_t v107 = vsub_f32(v94, v96); + float32x2_t v108 = vsub_f32(v96, v98); + float32x2_t v109 = vsub_f32(v98, v94); + float32x2_t v183 = vadd_f32(v177, v179); + float32x2_t v186 = vsub_f32(v177, v179); + float32x2_t v187 = vsub_f32(v179, v181); + float32x2_t v188 = vsub_f32(v181, v177); + float32x2_t v189 = vadd_f32(v178, v180); + float32x2_t v191 = vsub_f32(v178, v180); + float32x2_t v192 = vsub_f32(v180, v182); + float32x2_t v193 = vsub_f32(v182, v178); + float32x2_t v100 = vadd_f32(v99, v97); + float32x2_t v106 = vadd_f32(v105, v98); + float32x2_t v121 = vmul_f32(v102, v204); + float32x2_t v125 = vmul_f32(v103, v208); + float32x2_t v129 = vmul_f32(v104, v212); + float32x2_t v142 = vrev64_f32(v107); + float32x2_t v149 = vrev64_f32(v108); + 
float32x2_t v156 = vrev64_f32(v109); + float32x2_t v184 = vadd_f32(v183, v181); + float32x2_t v190 = vadd_f32(v189, v182); + float32x2_t v205 = vmul_f32(v186, v204); + float32x2_t v209 = vmul_f32(v187, v208); + float32x2_t v213 = vmul_f32(v188, v212); + float32x2_t v226 = vrev64_f32(v191); + float32x2_t v233 = vrev64_f32(v192); + float32x2_t v240 = vrev64_f32(v193); + float32x2_t v101 = vadd_f32(v100, v19); + float32x2_t v117 = vmul_f32(v100, v200); + float32x2_t v135 = vrev64_f32(v106); + float32x2_t v143 = vmul_f32(v142, v225); + float32x2_t v150 = vmul_f32(v149, v232); + float32x2_t v157 = vmul_f32(v156, v239); + float32x2_t v185 = vadd_f32(v184, v20); + float32x2_t v201 = vmul_f32(v184, v200); + float32x2_t v219 = vrev64_f32(v190); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v136 = vmul_f32(v135, v218); + float32x2_t v158 = vadd_f32(v101, v117); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v242 = vadd_f32(v185, v201); + v6[0] = v101; + v6[ostride * 7] = v185; + float32x2_t v159 = vadd_f32(v158, v121); + float32x2_t v161 = vsub_f32(v158, v121); + float32x2_t v163 = vsub_f32(v158, v125); + float32x2_t v165 = vadd_f32(v136, v143); + float32x2_t v167 = vsub_f32(v136, v143); + float32x2_t v169 = vsub_f32(v136, v150); + float32x2_t v243 = vadd_f32(v242, v205); + float32x2_t v245 = vsub_f32(v242, v205); + float32x2_t v247 = vsub_f32(v242, v209); + float32x2_t v249 = vadd_f32(v220, v227); + float32x2_t v251 = vsub_f32(v220, v227); + float32x2_t v253 = vsub_f32(v220, v234); + float32x2_t v160 = vadd_f32(v159, v125); + float32x2_t v162 = vsub_f32(v161, v129); + float32x2_t v164 = vadd_f32(v163, v129); + float32x2_t v166 = vadd_f32(v165, v150); + float32x2_t v168 = vsub_f32(v167, v157); + float32x2_t v170 = vadd_f32(v169, v157); + float32x2_t v244 = vadd_f32(v243, v209); + float32x2_t v246 = vsub_f32(v245, v213); + float32x2_t v248 = vadd_f32(v247, v213); + 
float32x2_t v250 = vadd_f32(v249, v234); + float32x2_t v252 = vsub_f32(v251, v241); + float32x2_t v254 = vadd_f32(v253, v241); + float32x2_t v171 = vadd_f32(v160, v166); + float32x2_t v172 = vsub_f32(v160, v166); + float32x2_t v173 = vadd_f32(v162, v168); + float32x2_t v174 = vsub_f32(v162, v168); + float32x2_t v175 = vadd_f32(v164, v170); + float32x2_t v176 = vsub_f32(v164, v170); + float32x2_t v255 = vadd_f32(v244, v250); + float32x2_t v256 = vsub_f32(v244, v250); + float32x2_t v257 = vadd_f32(v246, v252); + float32x2_t v258 = vsub_f32(v246, v252); + float32x2_t v259 = vadd_f32(v248, v254); + float32x2_t v260 = vsub_f32(v248, v254); + v6[ostride * 8] = v172; + v6[ostride] = v256; + v6[ostride * 2] = v174; + v6[ostride * 9] = v258; + v6[ostride * 10] = v175; + v6[ostride * 3] = v259; + v6[ostride * 4] = v176; + v6[ostride * 11] = v260; + v6[ostride * 12] = v173; + v6[ostride * 5] = v257; + v6[ostride * 6] = v171; + v6[ostride * 13] = v255; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun14(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v238 = -1.1666666666666665e+00F; + float v243 = 7.9015646852540022e-01F; + float v248 = 5.5854267289647742e-02F; + float v253 = 7.3430220123575241e-01F; + float v258 = -4.4095855184409838e-01F; + float v265 = -3.4087293062393137e-01F; + float v272 = 5.3396936033772524e-01F; + float v279 = -8.7484229096165667e-01F; + const float32x2_t *v490 = &v5[v0]; + float32x2_t *v581 = &v6[v2]; + int64_t v22 = v0 * 7; + int64_t v31 = v0 * 2; + int64_t v38 = v0 * 9; + int64_t v47 = v0 * 4; + int64_t v54 = v0 * 11; + int64_t v63 = v0 * 6; + int64_t v70 = v0 * 13; + int64_t v79 = v0 * 8; + int64_t v95 = v0 * 10; + int64_t v102 = v0 * 
3; + int64_t v111 = v0 * 12; + int64_t v118 = v0 * 5; + float v261 = v4 * v258; + float v268 = v4 * v265; + float v275 = v4 * v272; + float v282 = v4 * v279; + int64_t v312 = v2 * 7; + int64_t v319 = v2 * 8; + int64_t v333 = v2 * 2; + int64_t v340 = v2 * 9; + int64_t v347 = v2 * 10; + int64_t v354 = v2 * 3; + int64_t v361 = v2 * 4; + int64_t v368 = v2 * 11; + int64_t v375 = v2 * 12; + int64_t v382 = v2 * 5; + int64_t v389 = v2 * 6; + int64_t v396 = v2 * 13; + const float32x2_t *v409 = &v5[0]; + svfloat32_t v539 = svdup_n_f32(v238); + svfloat32_t v540 = svdup_n_f32(v243); + svfloat32_t v541 = svdup_n_f32(v248); + svfloat32_t v542 = svdup_n_f32(v253); + float32x2_t *v554 = &v6[0]; + svfloat32_t v693 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v490)[0])); + const float32x2_t *v418 = &v5[v22]; + const float32x2_t *v427 = &v5[v31]; + const float32x2_t *v436 = &v5[v38]; + const float32x2_t *v445 = &v5[v47]; + const float32x2_t *v454 = &v5[v54]; + const float32x2_t *v463 = &v5[v63]; + const float32x2_t *v472 = &v5[v70]; + const float32x2_t *v481 = &v5[v79]; + const float32x2_t *v499 = &v5[v95]; + const float32x2_t *v508 = &v5[v102]; + const float32x2_t *v517 = &v5[v111]; + const float32x2_t *v526 = &v5[v118]; + svfloat32_t v543 = svdup_n_f32(v261); + svfloat32_t v544 = svdup_n_f32(v268); + svfloat32_t v545 = svdup_n_f32(v275); + svfloat32_t v546 = svdup_n_f32(v282); + float32x2_t *v563 = &v6[v312]; + float32x2_t *v572 = &v6[v319]; + float32x2_t *v590 = &v6[v333]; + float32x2_t *v599 = &v6[v340]; + float32x2_t *v608 = &v6[v347]; + float32x2_t *v617 = &v6[v354]; + float32x2_t *v626 = &v6[v361]; + float32x2_t *v635 = &v6[v368]; + float32x2_t *v644 = &v6[v375]; + float32x2_t *v653 = &v6[v382]; + float32x2_t *v662 = &v6[v389]; + float32x2_t *v671 = &v6[v396]; + svfloat32_t v675 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v409)[0])); + svfloat32_t v677 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v418)[0])); + 
svfloat32_t v679 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v427)[0])); + svfloat32_t v681 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v436)[0])); + svfloat32_t v683 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v445)[0])); + svfloat32_t v685 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v454)[0])); + svfloat32_t v687 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v463)[0])); + svfloat32_t v689 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v472)[0])); + svfloat32_t v691 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v481)[0])); + svfloat32_t v695 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v499)[0])); + svfloat32_t v697 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v508)[0])); + svfloat32_t v699 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v517)[0])); + svfloat32_t v701 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v526)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v675, v677); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v675, v677); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v679, v681); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v679, v681); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v683, v685); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v683, v685); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v687, v689); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v687, v689); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v44, v124); + svfloat32_t v127 = 
svsub_f32_x(svptrue_b32(), v44, v124); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v92, v76); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v92, v76); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v60, v108); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v60, v108); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v45, v125); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v45, v125); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v93, v77); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v93, v77); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v61, v109); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v61, v109); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v126, v128); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v126, v128); + svfloat32_t v136 = svsub_f32_x(svptrue_b32(), v128, v130); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v130, v126); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v127, v129); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v127, v129); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v129, v131); + svfloat32_t v142 = svsub_f32_x(svptrue_b32(), v131, v127); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v215, v217); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v215, v217); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v217, v219); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v219, v215); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v216, v218); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v216, v218); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v220, v216); + svfloat32_t v133 = svadd_f32_x(svptrue_b32(), v132, v130); + svfloat32_t v139 = svadd_f32_x(svptrue_b32(), v138, v131); + svfloat32_t zero181 = svdup_n_f32(0); + svfloat32_t v181 = svcmla_f32_x(pred_full, zero181, v544, v140, 90); + svfloat32_t zero188 = svdup_n_f32(0); + svfloat32_t v188 = svcmla_f32_x(pred_full, zero188, v545, v141, 90); + svfloat32_t zero195 = svdup_n_f32(0); + 
svfloat32_t v195 = svcmla_f32_x(pred_full, zero195, v546, v142, 90); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v227, v220); + svfloat32_t zero270 = svdup_n_f32(0); + svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v544, v229, 90); + svfloat32_t zero277 = svdup_n_f32(0); + svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v545, v230, 90); + svfloat32_t zero284 = svdup_n_f32(0); + svfloat32_t v284 = svcmla_f32_x(pred_full, zero284, v546, v231, 90); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v133, v28); + svfloat32_t zero174 = svdup_n_f32(0); + svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v543, v139, 90); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v222, v29); + svfloat32_t zero263 = svdup_n_f32(0); + svfloat32_t v263 = svcmla_f32_x(pred_full, zero263, v543, v228, 90); + svfloat32_t v196 = svmla_f32_x(pred_full, v134, v133, v539); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v174, v181); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v174, v181); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v174, v188); + svfloat32_t v285 = svmla_f32_x(pred_full, v223, v222, v539); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v263, v270); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v263, v270); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v263, v277); + svst1_f64(pred_full, (double *)(v554), svreinterpret_f64_f32(v134)); + svst1_f64(pred_full, (double *)(v563), svreinterpret_f64_f32(v223)); + svfloat32_t v197 = svmla_f32_x(pred_full, v196, v135, v540); + svfloat32_t v199 = svmls_f32_x(pred_full, v196, v135, v540); + svfloat32_t v201 = svmls_f32_x(pred_full, v196, v136, v541); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v203, v188); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v205, v195); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v207, v195); + svfloat32_t v286 = svmla_f32_x(pred_full, v285, v224, v540); + svfloat32_t v288 = svmls_f32_x(pred_full, v285, v224, v540); + 
svfloat32_t v290 = svmls_f32_x(pred_full, v285, v225, v541); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v292, v277); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v294, v284); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v296, v284); + svfloat32_t v198 = svmla_f32_x(pred_full, v197, v136, v541); + svfloat32_t v200 = svmls_f32_x(pred_full, v199, v137, v542); + svfloat32_t v202 = svmla_f32_x(pred_full, v201, v137, v542); + svfloat32_t v287 = svmla_f32_x(pred_full, v286, v225, v541); + svfloat32_t v289 = svmls_f32_x(pred_full, v288, v226, v542); + svfloat32_t v291 = svmla_f32_x(pred_full, v290, v226, v542); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v198, v204); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v198, v204); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v200, v206); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v200, v206); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v287, v293); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v287, v293); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v289, v295); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v289, v295); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v291, v297); + svst1_f64(pred_full, (double *)(v572), svreinterpret_f64_f32(v210)); + svst1_f64(pred_full, (double *)(v581), svreinterpret_f64_f32(v299)); + svst1_f64(pred_full, (double *)(v590), svreinterpret_f64_f32(v212)); + svst1_f64(pred_full, (double *)(v599), svreinterpret_f64_f32(v301)); + svst1_f64(pred_full, (double *)(v608), svreinterpret_f64_f32(v213)); + svst1_f64(pred_full, (double *)(v617), svreinterpret_f64_f32(v302)); + svst1_f64(pred_full, (double *)(v626), svreinterpret_f64_f32(v214)); + svst1_f64(pred_full, (double *)(v635), svreinterpret_f64_f32(v303)); + svst1_f64(pred_full, (double *)(v644), svreinterpret_f64_f32(v211)); + 
svst1_f64(pred_full, (double *)(v653), svreinterpret_f64_f32(v300)); + svst1_f64(pred_full, (double *)(v662), svreinterpret_f64_f32(v209)); + svst1_f64(pred_full, (double *)(v671), svreinterpret_f64_f32(v298)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun15(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v112 = -1.2500000000000000e+00F; + float v116 = 5.5901699437494745e-01F; + float v119 = 1.5388417685876268e+00F; + float v120 = -1.5388417685876268e+00F; + float v126 = 5.8778525229247325e-01F; + float v127 = -5.8778525229247325e-01F; + float v133 = 3.6327126400268028e-01F; + float v134 = -3.6327126400268028e-01F; + float v158 = -1.4999999999999998e+00F; + float v162 = 1.8749999999999998e+00F; + float v166 = -8.3852549156242107e-01F; + float v169 = -2.3082626528814396e+00F; + float v170 = 2.3082626528814396e+00F; + float v176 = -8.8167787843870971e-01F; + float v177 = 8.8167787843870971e-01F; + float v183 = -5.4490689600402031e-01F; + float v184 = 5.4490689600402031e-01F; + float v207 = 8.6602540378443871e-01F; + float v208 = -8.6602540378443871e-01F; + float v214 = -1.0825317547305484e+00F; + float v215 = 1.0825317547305484e+00F; + float v221 = 4.8412291827592718e-01F; + float v222 = -4.8412291827592718e-01F; + float v229 = -1.3326760640014592e+00F; + float v233 = -5.0903696045512736e-01F; + float v237 = -3.1460214309120460e-01F; + float32x2_t v25 = v5[0]; + float32x2_t v54 = v5[istride]; + float32x2_t v113 = (float32x2_t){v112, v112}; + float32x2_t v117 = (float32x2_t){v116, v116}; + float32x2_t v121 = (float32x2_t){v119, v120}; + float32x2_t v128 = (float32x2_t){v126, v127}; + float32x2_t v135 = (float32x2_t){v133, v134}; + float32x2_t v159 = (float32x2_t){v158, v158}; + float32x2_t v163 = (float32x2_t){v162, v162}; + float32x2_t v167 = 
(float32x2_t){v166, v166}; + float32x2_t v171 = (float32x2_t){v169, v170}; + float32x2_t v178 = (float32x2_t){v176, v177}; + float32x2_t v185 = (float32x2_t){v183, v184}; + float32x2_t v209 = (float32x2_t){v207, v208}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v223 = (float32x2_t){v221, v222}; + float32x2_t v224 = (float32x2_t){v4, v4}; + float32x2_t v230 = (float32x2_t){v229, v229}; + float32x2_t v234 = (float32x2_t){v233, v233}; + float32x2_t v238 = (float32x2_t){v237, v237}; + float32x2_t v13 = v5[istride * 5]; + float32x2_t v18 = v5[istride * 10]; + float32x2_t v31 = v5[istride * 8]; + float32x2_t v36 = v5[istride * 13]; + float32x2_t v43 = v5[istride * 3]; + float32x2_t v49 = v5[istride * 11]; + float32x2_t v61 = v5[istride * 6]; + float32x2_t v67 = v5[istride * 14]; + float32x2_t v72 = v5[istride * 4]; + float32x2_t v79 = v5[istride * 9]; + float32x2_t v85 = v5[istride * 2]; + float32x2_t v90 = v5[istride * 7]; + float32x2_t v97 = v5[istride * 12]; + float32x2_t v123 = vmul_f32(v224, v121); + float32x2_t v130 = vmul_f32(v224, v128); + float32x2_t v137 = vmul_f32(v224, v135); + float32x2_t v173 = vmul_f32(v224, v171); + float32x2_t v180 = vmul_f32(v224, v178); + float32x2_t v187 = vmul_f32(v224, v185); + float32x2_t v211 = vmul_f32(v224, v209); + float32x2_t v218 = vmul_f32(v224, v216); + float32x2_t v225 = vmul_f32(v224, v223); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v37 = vadd_f32(v31, v36); + float32x2_t v38 = vsub_f32(v31, v36); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v73 = vadd_f32(v67, v72); + float32x2_t v74 = vsub_f32(v67, v72); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v26 = vadd_f32(v19, v25); + float32x2_t v44 = vadd_f32(v37, v43); + float32x2_t v62 = vadd_f32(v55, v61); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v149 = 
vadd_f32(v37, v91); + float32x2_t v150 = vsub_f32(v37, v91); + float32x2_t v151 = vadd_f32(v73, v55); + float32x2_t v152 = vsub_f32(v73, v55); + float32x2_t v199 = vadd_f32(v38, v92); + float32x2_t v200 = vsub_f32(v38, v92); + float32x2_t v201 = vadd_f32(v74, v56); + float32x2_t v202 = vsub_f32(v74, v56); + float32x2_t v99 = vadd_f32(v44, v98); + float32x2_t v100 = vsub_f32(v44, v98); + float32x2_t v101 = vadd_f32(v80, v62); + float32x2_t v102 = vsub_f32(v80, v62); + float32x2_t v153 = vadd_f32(v149, v151); + float32x2_t v154 = vsub_f32(v149, v151); + float32x2_t v155 = vadd_f32(v150, v152); + float32x2_t v174 = vrev64_f32(v150); + float32x2_t v188 = vrev64_f32(v152); + float32x2_t v203 = vadd_f32(v199, v201); + float32x2_t v204 = vsub_f32(v199, v201); + float32x2_t v205 = vadd_f32(v200, v202); + float32x2_t v231 = vmul_f32(v200, v230); + float32x2_t v239 = vmul_f32(v202, v238); + float32x2_t v103 = vadd_f32(v99, v101); + float32x2_t v104 = vsub_f32(v99, v101); + float32x2_t v105 = vadd_f32(v100, v102); + float32x2_t v124 = vrev64_f32(v100); + float32x2_t v138 = vrev64_f32(v102); + float32x2_t v156 = vadd_f32(v153, v19); + float32x2_t v164 = vmul_f32(v153, v163); + float32x2_t v168 = vmul_f32(v154, v167); + float32x2_t v175 = vmul_f32(v174, v173); + float32x2_t v181 = vrev64_f32(v155); + float32x2_t v189 = vmul_f32(v188, v187); + float32x2_t v206 = vadd_f32(v203, v20); + float32x2_t v219 = vrev64_f32(v203); + float32x2_t v226 = vrev64_f32(v204); + float32x2_t v235 = vmul_f32(v205, v234); + float32x2_t v106 = vadd_f32(v103, v26); + float32x2_t v114 = vmul_f32(v103, v113); + float32x2_t v118 = vmul_f32(v104, v117); + float32x2_t v125 = vmul_f32(v124, v123); + float32x2_t v131 = vrev64_f32(v105); + float32x2_t v139 = vmul_f32(v138, v137); + float32x2_t v160 = vmul_f32(v156, v159); + float32x2_t v182 = vmul_f32(v181, v180); + float32x2_t v212 = vrev64_f32(v206); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v243 = 
vsub_f32(v231, v235); + float32x2_t v244 = vadd_f32(v235, v239); + float32x2_t v132 = vmul_f32(v131, v130); + float32x2_t v140 = vadd_f32(v106, v114); + float32x2_t v190 = vadd_f32(v160, v164); + float32x2_t v193 = vsub_f32(v175, v182); + float32x2_t v194 = vadd_f32(v182, v189); + float32x2_t v213 = vmul_f32(v212, v211); + float32x2_t v249 = vadd_f32(v106, v160); + v6[0] = v106; + float32x2_t v141 = vadd_f32(v140, v118); + float32x2_t v142 = vsub_f32(v140, v118); + float32x2_t v143 = vsub_f32(v125, v132); + float32x2_t v144 = vadd_f32(v132, v139); + float32x2_t v191 = vadd_f32(v190, v168); + float32x2_t v192 = vsub_f32(v190, v168); + float32x2_t v240 = vadd_f32(v213, v220); + float32x2_t v250 = vadd_f32(v249, v213); + float32x2_t v251 = vsub_f32(v249, v213); + float32x2_t v145 = vadd_f32(v141, v143); + float32x2_t v146 = vsub_f32(v141, v143); + float32x2_t v147 = vadd_f32(v142, v144); + float32x2_t v148 = vsub_f32(v142, v144); + float32x2_t v195 = vadd_f32(v191, v193); + float32x2_t v196 = vsub_f32(v191, v193); + float32x2_t v197 = vadd_f32(v192, v194); + float32x2_t v198 = vsub_f32(v192, v194); + float32x2_t v241 = vadd_f32(v240, v227); + float32x2_t v242 = vsub_f32(v240, v227); + v6[ostride * 10] = v251; + v6[ostride * 5] = v250; + float32x2_t v245 = vadd_f32(v241, v243); + float32x2_t v246 = vsub_f32(v241, v243); + float32x2_t v247 = vadd_f32(v242, v244); + float32x2_t v248 = vsub_f32(v242, v244); + float32x2_t v267 = vadd_f32(v146, v196); + v6[ostride * 6] = v146; + float32x2_t v285 = vadd_f32(v148, v198); + v6[ostride * 12] = v148; + float32x2_t v303 = vadd_f32(v147, v197); + v6[ostride * 3] = v147; + float32x2_t v321 = vadd_f32(v145, v195); + v6[ostride * 9] = v145; + float32x2_t v268 = vadd_f32(v267, v246); + float32x2_t v269 = vsub_f32(v267, v246); + float32x2_t v286 = vadd_f32(v285, v248); + float32x2_t v287 = vsub_f32(v285, v248); + float32x2_t v304 = vadd_f32(v303, v247); + float32x2_t v305 = vsub_f32(v303, v247); + float32x2_t v322 = vadd_f32(v321, 
v245); + float32x2_t v323 = vsub_f32(v321, v245); + v6[ostride] = v269; + v6[ostride * 11] = v268; + v6[ostride * 7] = v287; + v6[ostride * 2] = v286; + v6[ostride * 13] = v305; + v6[ostride * 8] = v304; + v6[ostride * 4] = v323; + v6[ostride * 14] = v322; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun15(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v148 = -1.2500000000000000e+00F; + float v153 = 5.5901699437494745e-01F; + float v158 = -1.5388417685876268e+00F; + float v165 = -5.8778525229247325e-01F; + float v172 = -3.6327126400268028e-01F; + float v196 = -1.4999999999999998e+00F; + float v201 = 1.8749999999999998e+00F; + float v206 = -8.3852549156242107e-01F; + float v211 = 2.3082626528814396e+00F; + float v218 = 8.8167787843870971e-01F; + float v225 = 5.4490689600402031e-01F; + float v249 = -8.6602540378443871e-01F; + float v256 = 1.0825317547305484e+00F; + float v263 = -4.8412291827592718e-01F; + float v270 = -1.3326760640014592e+00F; + float v275 = -5.0903696045512736e-01F; + float v280 = -3.1460214309120460e-01F; + const float32x2_t *v483 = &v5[v0]; + float32x2_t *v610 = &v6[v2]; + int64_t v15 = v0 * 5; + int64_t v22 = v0 * 10; + int64_t v39 = v0 * 8; + int64_t v46 = v0 * 13; + int64_t v55 = v0 * 3; + int64_t v63 = v0 * 11; + int64_t v79 = v0 * 6; + int64_t v87 = v0 * 14; + int64_t v94 = v0 * 4; + int64_t v103 = v0 * 9; + int64_t v111 = v0 * 2; + int64_t v118 = v0 * 7; + int64_t v127 = v0 * 12; + float v161 = v4 * v158; + float v168 = v4 * v165; + float v175 = v4 * v172; + float v214 = v4 * v211; + float v221 = v4 * v218; + float v228 = v4 * v225; + float v252 = v4 * v249; + float v259 = v4 * v256; + float v266 = v4 * v263; + int64_t v304 = 
v2 * 10; + int64_t v311 = v2 * 5; + int64_t v321 = v2 * 6; + int64_t v335 = v2 * 11; + int64_t v345 = v2 * 12; + int64_t v352 = v2 * 7; + int64_t v359 = v2 * 2; + int64_t v369 = v2 * 3; + int64_t v376 = v2 * 13; + int64_t v383 = v2 * 8; + int64_t v393 = v2 * 9; + int64_t v400 = v2 * 4; + int64_t v407 = v2 * 14; + const float32x2_t *v438 = &v5[0]; + svfloat32_t v550 = svdup_n_f32(v148); + svfloat32_t v551 = svdup_n_f32(v153); + svfloat32_t v555 = svdup_n_f32(v196); + svfloat32_t v556 = svdup_n_f32(v201); + svfloat32_t v557 = svdup_n_f32(v206); + svfloat32_t v564 = svdup_n_f32(v270); + svfloat32_t v565 = svdup_n_f32(v275); + svfloat32_t v566 = svdup_n_f32(v280); + float32x2_t *v574 = &v6[0]; + svfloat32_t v718 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v483)[0])); + const float32x2_t *v419 = &v5[v15]; + const float32x2_t *v428 = &v5[v22]; + const float32x2_t *v447 = &v5[v39]; + const float32x2_t *v456 = &v5[v46]; + const float32x2_t *v465 = &v5[v55]; + const float32x2_t *v474 = &v5[v63]; + const float32x2_t *v492 = &v5[v79]; + const float32x2_t *v501 = &v5[v87]; + const float32x2_t *v510 = &v5[v94]; + const float32x2_t *v519 = &v5[v103]; + const float32x2_t *v528 = &v5[v111]; + const float32x2_t *v537 = &v5[v118]; + const float32x2_t *v546 = &v5[v127]; + svfloat32_t v552 = svdup_n_f32(v161); + svfloat32_t v553 = svdup_n_f32(v168); + svfloat32_t v554 = svdup_n_f32(v175); + svfloat32_t v558 = svdup_n_f32(v214); + svfloat32_t v559 = svdup_n_f32(v221); + svfloat32_t v560 = svdup_n_f32(v228); + svfloat32_t v561 = svdup_n_f32(v252); + svfloat32_t v562 = svdup_n_f32(v259); + svfloat32_t v563 = svdup_n_f32(v266); + float32x2_t *v583 = &v6[v304]; + float32x2_t *v592 = &v6[v311]; + float32x2_t *v601 = &v6[v321]; + float32x2_t *v619 = &v6[v335]; + float32x2_t *v628 = &v6[v345]; + float32x2_t *v637 = &v6[v352]; + float32x2_t *v646 = &v6[v359]; + float32x2_t *v655 = &v6[v369]; + float32x2_t *v664 = &v6[v376]; + float32x2_t *v673 = &v6[v383]; + float32x2_t 
*v682 = &v6[v393]; + float32x2_t *v691 = &v6[v400]; + float32x2_t *v700 = &v6[v407]; + svfloat32_t v708 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v438)[0])); + svfloat32_t v704 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v419)[0])); + svfloat32_t v706 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v428)[0])); + svfloat32_t v710 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v447)[0])); + svfloat32_t v712 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v456)[0])); + svfloat32_t v714 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v465)[0])); + svfloat32_t v716 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v474)[0])); + svfloat32_t v720 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v492)[0])); + svfloat32_t v722 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v501)[0])); + svfloat32_t v724 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v510)[0])); + svfloat32_t v726 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v519)[0])); + svfloat32_t v728 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v528)[0])); + svfloat32_t v730 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v537)[0])); + svfloat32_t v732 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v546)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v704, v706); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v704, v706); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v710, v712); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v710, v712); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v716, v718); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v716, v718); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v722, v724); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v722, v724); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v728, v730); + svfloat32_t v125 = 
svsub_f32_x(svptrue_b32(), v728, v730); + svfloat32_t v37 = svadd_f32_x(svptrue_b32(), v28, v708); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v714); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v720); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v100, v726); + svfloat32_t v133 = svadd_f32_x(svptrue_b32(), v124, v732); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v100, v76); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v100, v76); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v53, v125); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v53, v125); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v101, v77); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v101, v77); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v61, v133); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v61, v133); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v109, v85); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v109, v85); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v188, v190); + svfloat32_t zero216 = svdup_n_f32(0); + svfloat32_t v216 = svcmla_f32_x(pred_full, zero216, v558, v188, 90); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v240, v242); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v240, v242); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v241, v243); + svfloat32_t v283 = svmul_f32_x(svptrue_b32(), v243, v566); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v134, v136); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v134, v136); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v135, v137); + svfloat32_t zero163 = svdup_n_f32(0); + svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v552, v135, 90); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v191, v28); + svfloat32_t v204 = svmul_f32_x(svptrue_b32(), 
v191, v556); + svfloat32_t zero223 = svdup_n_f32(0); + svfloat32_t v223 = svcmla_f32_x(pred_full, zero223, v559, v193, 90); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v244, v29); + svfloat32_t zero268 = svdup_n_f32(0); + svfloat32_t v268 = svcmla_f32_x(pred_full, zero268, v563, v245, 90); + svfloat32_t v278 = svmul_f32_x(svptrue_b32(), v246, v565); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v138, v37); + svfloat32_t zero170 = svdup_n_f32(0); + svfloat32_t v170 = svcmla_f32_x(pred_full, zero170, v553, v140, 90); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v216, v223); + svfloat32_t v235 = svcmla_f32_x(pred_full, v223, v560, v190, 90); + svfloat32_t zero254 = svdup_n_f32(0); + svfloat32_t v254 = svcmla_f32_x(pred_full, zero254, v561, v247, 90); + svfloat32_t v287 = svnmls_f32_x(pred_full, v278, v241, v564); + svfloat32_t v288 = svmla_f32_x(pred_full, v283, v246, v565); + svfloat32_t v178 = svmla_f32_x(pred_full, v141, v138, v550); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v163, v170); + svfloat32_t v182 = svcmla_f32_x(pred_full, v170, v554, v137, 90); + svfloat32_t v231 = svmla_f32_x(pred_full, v204, v194, v555); + svfloat32_t v284 = svcmla_f32_x(pred_full, v254, v562, v244, 90); + svfloat32_t v293 = svmla_f32_x(pred_full, v141, v194, v555); + svst1_f64(pred_full, (double *)(v574), svreinterpret_f64_f32(v141)); + svfloat32_t v179 = svmla_f32_x(pred_full, v178, v139, v551); + svfloat32_t v180 = svmls_f32_x(pred_full, v178, v139, v551); + svfloat32_t v232 = svmla_f32_x(pred_full, v231, v192, v557); + svfloat32_t v233 = svmls_f32_x(pred_full, v231, v192, v557); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v268); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v284, v268); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v254); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v293, v254); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v179, v181); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v179, v181); + svfloat32_t v185 = 
svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v232, v234); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v232, v234); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v285, v287); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v285, v287); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v286, v288); + svst1_f64(pred_full, (double *)(v583), svreinterpret_f64_f32(v295)); + svst1_f64(pred_full, (double *)(v592), svreinterpret_f64_f32(v294)); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v184, v237); + svfloat32_t v341 = svadd_f32_x(svptrue_b32(), v186, v239); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v185, v238); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v183, v236); + svst1_f64(pred_full, (double *)(v601), svreinterpret_f64_f32(v184)); + svst1_f64(pred_full, (double *)(v628), svreinterpret_f64_f32(v186)); + svst1_f64(pred_full, (double *)(v655), svreinterpret_f64_f32(v185)); + svst1_f64(pred_full, (double *)(v682), svreinterpret_f64_f32(v183)); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v317, v290); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v317, v290); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v341, v292); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v341, v292); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v365, v291); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v365, v291); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v289); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v389, v289); + svst1_f64(pred_full, (double *)(v610), svreinterpret_f64_f32(v319)); + svst1_f64(pred_full, (double *)(v619), svreinterpret_f64_f32(v318)); + svst1_f64(pred_full, (double *)(v637), svreinterpret_f64_f32(v343)); + svst1_f64(pred_full, (double 
*)(v646), svreinterpret_f64_f32(v342)); + svst1_f64(pred_full, (double *)(v664), svreinterpret_f64_f32(v367)); + svst1_f64(pred_full, (double *)(v673), svreinterpret_f64_f32(v366)); + svst1_f64(pred_full, (double *)(v691), svreinterpret_f64_f32(v391)); + svst1_f64(pred_full, (double *)(v700), svreinterpret_f64_f32(v390)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun16(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v174 = 1.0000000000000000e+00F; + float v175 = -1.0000000000000000e+00F; + float v182 = -7.0710678118654746e-01F; + float v189 = 7.0710678118654757e-01F; + float v192 = 9.2387953251128674e-01F; + float v193 = -9.2387953251128674e-01F; + float v200 = 5.4119610014619690e-01F; + float v207 = -1.3065629648763766e+00F; + float v214 = 3.8268343236508984e-01F; + float v218 = 1.3065629648763766e+00F; + float v222 = -5.4119610014619690e-01F; + float32x2_t v13 = v5[0]; + float32x2_t v61 = v5[istride]; + float32x2_t v176 = (float32x2_t){v174, v175}; + float32x2_t v183 = (float32x2_t){v189, v182}; + float32x2_t v190 = (float32x2_t){v189, v189}; + float32x2_t v194 = (float32x2_t){v192, v193}; + float32x2_t v201 = (float32x2_t){v222, v200}; + float32x2_t v208 = (float32x2_t){v218, v207}; + float32x2_t v209 = (float32x2_t){v4, v4}; + float32x2_t v215 = (float32x2_t){v214, v214}; + float32x2_t v219 = (float32x2_t){v218, v218}; + float32x2_t v223 = (float32x2_t){v222, v222}; + float32x2_t v18 = v5[istride * 8]; + float32x2_t v25 = v5[istride * 4]; + float32x2_t v30 = v5[istride * 12]; + float32x2_t v37 = v5[istride * 2]; + float32x2_t v42 = v5[istride * 10]; + float32x2_t v49 = v5[istride * 6]; + float32x2_t v54 = v5[istride * 14]; + float32x2_t v66 = v5[istride * 9]; + float32x2_t v73 = v5[istride * 5]; + float32x2_t v78 = v5[istride * 13]; + 
float32x2_t v85 = v5[istride * 3]; + float32x2_t v90 = v5[istride * 11]; + float32x2_t v97 = v5[istride * 7]; + float32x2_t v102 = v5[istride * 15]; + float32x2_t v178 = vmul_f32(v209, v176); + float32x2_t v185 = vmul_f32(v209, v183); + float32x2_t v196 = vmul_f32(v209, v194); + float32x2_t v203 = vmul_f32(v209, v201); + float32x2_t v210 = vmul_f32(v209, v208); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v67 = vadd_f32(v61, v66); + float32x2_t v68 = vsub_f32(v61, v66); + float32x2_t v79 = vadd_f32(v73, v78); + float32x2_t v80 = vsub_f32(v73, v78); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v103 = vadd_f32(v97, v102); + float32x2_t v104 = vsub_f32(v97, v102); + float32x2_t v105 = vadd_f32(v19, v31); + float32x2_t v106 = vsub_f32(v19, v31); + float32x2_t v107 = vadd_f32(v43, v55); + float32x2_t v108 = vsub_f32(v43, v55); + float32x2_t v109 = vadd_f32(v67, v79); + float32x2_t v110 = vsub_f32(v67, v79); + float32x2_t v111 = vadd_f32(v91, v103); + float32x2_t v112 = vsub_f32(v91, v103); + float32x2_t v121 = vadd_f32(v44, v56); + float32x2_t v122 = vsub_f32(v44, v56); + float32x2_t v123 = vadd_f32(v68, v104); + float32x2_t v124 = vsub_f32(v68, v104); + float32x2_t v125 = vadd_f32(v80, v92); + float32x2_t v126 = vsub_f32(v80, v92); + float32x2_t v179 = vrev64_f32(v32); + float32x2_t v113 = vadd_f32(v105, v107); + float32x2_t v114 = vsub_f32(v105, v107); + float32x2_t v115 = vadd_f32(v109, v111); + float32x2_t v116 = vsub_f32(v109, v111); + float32x2_t v119 = vadd_f32(v110, v112); + float32x2_t v120 = vsub_f32(v110, v112); + float32x2_t v127 = vadd_f32(v123, v125); + float32x2_t v128 = vadd_f32(v124, v126); + float32x2_t v157 = 
vrev64_f32(v108); + float32x2_t v180 = vmul_f32(v179, v178); + float32x2_t v186 = vrev64_f32(v121); + float32x2_t v191 = vmul_f32(v122, v190); + float32x2_t v204 = vrev64_f32(v123); + float32x2_t v211 = vrev64_f32(v125); + float32x2_t v220 = vmul_f32(v124, v219); + float32x2_t v224 = vmul_f32(v126, v223); + float32x2_t v117 = vadd_f32(v113, v115); + float32x2_t v118 = vsub_f32(v113, v115); + float32x2_t v146 = vrev64_f32(v116); + float32x2_t v158 = vmul_f32(v157, v178); + float32x2_t v164 = vrev64_f32(v119); + float32x2_t v169 = vmul_f32(v120, v190); + float32x2_t v187 = vmul_f32(v186, v185); + float32x2_t v197 = vrev64_f32(v127); + float32x2_t v205 = vmul_f32(v204, v203); + float32x2_t v212 = vmul_f32(v211, v210); + float32x2_t v216 = vmul_f32(v128, v215); + float32x2_t v235 = vadd_f32(v20, v191); + float32x2_t v236 = vsub_f32(v20, v191); + float32x2_t v147 = vmul_f32(v146, v178); + float32x2_t v165 = vmul_f32(v164, v185); + float32x2_t v198 = vmul_f32(v197, v196); + float32x2_t v227 = vadd_f32(v106, v169); + float32x2_t v229 = vsub_f32(v106, v169); + float32x2_t v237 = vadd_f32(v180, v187); + float32x2_t v238 = vsub_f32(v180, v187); + float32x2_t v241 = vsub_f32(v220, v216); + float32x2_t v242 = vsub_f32(v224, v216); + float32x2_t v243 = vsub_f32(v216, v220); + float32x2_t v244 = vsub_f32(v216, v224); + v6[0] = v117; + v6[ostride * 8] = v118; + float32x2_t v225 = vadd_f32(v114, v147); + float32x2_t v226 = vsub_f32(v114, v147); + float32x2_t v228 = vadd_f32(v158, v165); + float32x2_t v230 = vsub_f32(v165, v158); + float32x2_t v239 = vadd_f32(v198, v205); + float32x2_t v240 = vsub_f32(v198, v212); + float32x2_t v245 = vadd_f32(v235, v241); + float32x2_t v246 = vsub_f32(v235, v241); + float32x2_t v247 = vadd_f32(v235, v243); + float32x2_t v248 = vsub_f32(v235, v243); + float32x2_t v249 = vadd_f32(v236, v238); + float32x2_t v250 = vsub_f32(v236, v238); + float32x2_t v251 = vadd_f32(v236, v244); + float32x2_t v252 = vsub_f32(v236, v244); + float32x2_t v231 = 
vadd_f32(v227, v228); + float32x2_t v232 = vadd_f32(v229, v230); + float32x2_t v233 = vsub_f32(v229, v230); + float32x2_t v234 = vsub_f32(v227, v228); + float32x2_t v255 = vadd_f32(v239, v237); + float32x2_t v256 = vsub_f32(v239, v237); + float32x2_t v257 = vadd_f32(v240, v242); + float32x2_t v258 = vsub_f32(v240, v242); + float32x2_t v259 = vadd_f32(v240, v238); + float32x2_t v260 = vsub_f32(v240, v238); + v6[ostride * 4] = v226; + v6[ostride * 12] = v225; + float32x2_t v261 = vadd_f32(v245, v255); + float32x2_t v262 = vadd_f32(v246, v256); + float32x2_t v263 = vsub_f32(v247, v256); + float32x2_t v264 = vsub_f32(v248, v255); + float32x2_t v265 = vadd_f32(v249, v257); + float32x2_t v266 = vadd_f32(v250, v258); + float32x2_t v267 = vsub_f32(v251, v260); + float32x2_t v268 = vsub_f32(v252, v259); + v6[ostride * 2] = v234; + v6[ostride * 6] = v233; + v6[ostride * 10] = v232; + v6[ostride * 14] = v231; + v6[ostride] = v264; + v6[ostride * 3] = v267; + v6[ostride * 5] = v268; + v6[ostride * 7] = v263; + v6[ostride * 9] = v262; + v6[ostride * 11] = v265; + v6[ostride * 13] = v266; + v6[ostride * 15] = v261; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun16(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v218 = -1.0000000000000000e+00F; + float v225 = -7.0710678118654746e-01F; + float v232 = 7.0710678118654757e-01F; + float v237 = -9.2387953251128674e-01F; + float v244 = 5.4119610014619690e-01F; + float v251 = -1.3065629648763766e+00F; + float v258 = 3.8268343236508984e-01F; + float v263 = 1.3065629648763766e+00F; + float v268 = -5.4119610014619690e-01F; + const float32x2_t *v507 = &v5[v0]; + float32x2_t *v607 = &v6[v2]; + int64_t v22 = v0 * 8; + int64_t v31 
= v0 * 4; + int64_t v38 = v0 * 12; + int64_t v47 = v0 * 2; + int64_t v54 = v0 * 10; + int64_t v63 = v0 * 6; + int64_t v70 = v0 * 14; + int64_t v86 = v0 * 9; + int64_t v95 = v0 * 5; + int64_t v102 = v0 * 13; + int64_t v111 = v0 * 3; + int64_t v118 = v0 * 11; + int64_t v127 = v0 * 7; + int64_t v134 = v0 * 15; + float v221 = v4 * v218; + float v228 = v4 * v225; + float v240 = v4 * v237; + float v247 = v4 * v244; + float v254 = v4 * v251; + int64_t v331 = v2 * 2; + int64_t v338 = v2 * 3; + int64_t v345 = v2 * 4; + int64_t v352 = v2 * 5; + int64_t v359 = v2 * 6; + int64_t v366 = v2 * 7; + int64_t v373 = v2 * 8; + int64_t v380 = v2 * 9; + int64_t v387 = v2 * 10; + int64_t v394 = v2 * 11; + int64_t v401 = v2 * 12; + int64_t v408 = v2 * 13; + int64_t v415 = v2 * 14; + int64_t v422 = v2 * 15; + const float32x2_t *v435 = &v5[0]; + svfloat32_t v584 = svdup_n_f32(v232); + svfloat32_t v588 = svdup_n_f32(v258); + svfloat32_t v589 = svdup_n_f32(v263); + svfloat32_t v590 = svdup_n_f32(v268); + float32x2_t *v598 = &v6[0]; + svfloat32_t v753 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v507)[0])); + const float32x2_t *v444 = &v5[v22]; + const float32x2_t *v453 = &v5[v31]; + const float32x2_t *v462 = &v5[v38]; + const float32x2_t *v471 = &v5[v47]; + const float32x2_t *v480 = &v5[v54]; + const float32x2_t *v489 = &v5[v63]; + const float32x2_t *v498 = &v5[v70]; + const float32x2_t *v516 = &v5[v86]; + const float32x2_t *v525 = &v5[v95]; + const float32x2_t *v534 = &v5[v102]; + const float32x2_t *v543 = &v5[v111]; + const float32x2_t *v552 = &v5[v118]; + const float32x2_t *v561 = &v5[v127]; + const float32x2_t *v570 = &v5[v134]; + svfloat32_t v582 = svdup_n_f32(v221); + svfloat32_t v583 = svdup_n_f32(v228); + svfloat32_t v585 = svdup_n_f32(v240); + svfloat32_t v586 = svdup_n_f32(v247); + svfloat32_t v587 = svdup_n_f32(v254); + float32x2_t *v616 = &v6[v331]; + float32x2_t *v625 = &v6[v338]; + float32x2_t *v634 = &v6[v345]; + float32x2_t *v643 = &v6[v352]; + float32x2_t 
*v652 = &v6[v359]; + float32x2_t *v661 = &v6[v366]; + float32x2_t *v670 = &v6[v373]; + float32x2_t *v679 = &v6[v380]; + float32x2_t *v688 = &v6[v387]; + float32x2_t *v697 = &v6[v394]; + float32x2_t *v706 = &v6[v401]; + float32x2_t *v715 = &v6[v408]; + float32x2_t *v724 = &v6[v415]; + float32x2_t *v733 = &v6[v422]; + svfloat32_t v737 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v435)[0])); + svfloat32_t v739 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v444)[0])); + svfloat32_t v741 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v453)[0])); + svfloat32_t v743 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v462)[0])); + svfloat32_t v745 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v471)[0])); + svfloat32_t v747 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v480)[0])); + svfloat32_t v749 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v489)[0])); + svfloat32_t v751 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v498)[0])); + svfloat32_t v755 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v516)[0])); + svfloat32_t v757 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v525)[0])); + svfloat32_t v759 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v534)[0])); + svfloat32_t v761 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v543)[0])); + svfloat32_t v763 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v552)[0])); + svfloat32_t v765 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v561)[0])); + svfloat32_t v767 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v570)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v737, v739); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v737, v739); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v741, v743); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v741, v743); + 
svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v745, v747); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v745, v747); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v749, v751); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v749, v751); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v753, v755); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v753, v755); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v757, v759); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v757, v759); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v761, v763); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v761, v763); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v765, v767); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v765, v767); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v60, v76); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v60, v76); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v92, v108); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v92, v108); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v124, v140); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v124, v140); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v61, v77); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v61, v77); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v93, v141); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v93, v141); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v109, v125); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v109, v125); + svfloat32_t zero223 = svdup_n_f32(0); + svfloat32_t v223 = svcmla_f32_x(pred_full, zero223, v582, v45, 90); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v142, v144); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v142, v144); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v157 = 
svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v160, v162); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v161, v163); + svfloat32_t zero199 = svdup_n_f32(0); + svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v582, v145, 90); + svfloat32_t zero230 = svdup_n_f32(0); + svfloat32_t v230 = svcmla_f32_x(pred_full, zero230, v583, v158, 90); + svfloat32_t zero256 = svdup_n_f32(0); + svfloat32_t v256 = svcmla_f32_x(pred_full, zero256, v587, v162, 90); + svfloat32_t v266 = svmul_f32_x(svptrue_b32(), v161, v589); + svfloat32_t v271 = svmul_f32_x(svptrue_b32(), v163, v590); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v150, v152); + svfloat32_t zero187 = svdup_n_f32(0); + svfloat32_t v187 = svcmla_f32_x(pred_full, zero187, v582, v153, 90); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v583, v156, 90); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v585, v164, 90); + svfloat32_t v261 = svmul_f32_x(svptrue_b32(), v165, v588); + svfloat32_t v282 = svmla_f32_x(pred_full, v29, v159, v584); + svfloat32_t v283 = svmls_f32_x(pred_full, v29, v159, v584); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v223, v230); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v223, v230); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v151, v187); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v151, v187); + svfloat32_t v274 = svmla_f32_x(pred_full, v143, v157, v584); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v276 = svmls_f32_x(pred_full, v143, v157, v584); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v206, v199); + svfloat32_t v286 = svcmla_f32_x(pred_full, v242, v586, v160, 90); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v242, v256); + svfloat32_t v288 = svnmls_f32_x(pred_full, v261, v161, v589); + svfloat32_t v289 = svnmls_f32_x(pred_full, v261, v163, 
v590); + svfloat32_t v290 = svnmls_f32_x(pred_full, v266, v165, v588); + svfloat32_t v291 = svnmls_f32_x(pred_full, v271, v165, v588); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v283, v285); + svst1_f64(pred_full, (double *)(v598), svreinterpret_f64_f32(v154)); + svst1_f64(pred_full, (double *)(v670), svreinterpret_f64_f32(v155)); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v274, v275); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v276, v277); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v276, v277); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v274, v275); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v282, v288); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v282, v288); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v282, v290); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v282, v290); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v283, v291); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v283, v291); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v286, v284); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v286, v284); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v287, v285); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v287, v285); + svst1_f64(pred_full, (double *)(v634), svreinterpret_f64_f32(v273)); + svst1_f64(pred_full, (double *)(v706), svreinterpret_f64_f32(v272)); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v292, v302); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v293, v303); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v295, v302); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v296, v304); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v297, v305); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v298, v307); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v299, 
v306); + svst1_f64(pred_full, (double *)(v616), svreinterpret_f64_f32(v281)); + svst1_f64(pred_full, (double *)(v652), svreinterpret_f64_f32(v280)); + svst1_f64(pred_full, (double *)(v688), svreinterpret_f64_f32(v279)); + svst1_f64(pred_full, (double *)(v724), svreinterpret_f64_f32(v278)); + svst1_f64(pred_full, (double *)(v607), svreinterpret_f64_f32(v311)); + svst1_f64(pred_full, (double *)(v625), svreinterpret_f64_f32(v314)); + svst1_f64(pred_full, (double *)(v643), svreinterpret_f64_f32(v315)); + svst1_f64(pred_full, (double *)(v661), svreinterpret_f64_f32(v310)); + svst1_f64(pred_full, (double *)(v679), svreinterpret_f64_f32(v309)); + svst1_f64(pred_full, (double *)(v697), svreinterpret_f64_f32(v312)); + svst1_f64(pred_full, (double *)(v715), svreinterpret_f64_f32(v313)); + svst1_f64(pred_full, (double *)(v733), svreinterpret_f64_f32(v308)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun17(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v166 = -4.2602849117736000e-02F; + float v170 = 2.0497965023262180e-01F; + float v174 = 1.0451835201736759e+00F; + float v178 = 1.7645848660222969e+00F; + float v182 = -7.2340797728605655e-01F; + float v186 = -8.9055591620606403e-02F; + float v190 = -1.0625000000000000e+00F; + float v194 = 2.5769410160110379e-01F; + float v198 = 7.7980260789483757e-01F; + float v202 = 5.4389318464570580e-01F; + float v206 = 4.2010193497052700e-01F; + float v210 = 1.2810929434228073e+00F; + float v214 = 4.4088907348175338e-01F; + float v218 = 3.1717619283272508e-01F; + float v221 = -9.0138318648016680e-01F; + float v222 = 9.0138318648016680e-01F; + float v228 = -4.3248756360072310e-01F; + float v229 = 4.3248756360072310e-01F; + float v235 = 6.6693537504044498e-01F; + float v236 = -6.6693537504044498e-01F; + float v242 = 
-6.0389004312516970e-01F; + float v243 = 6.0389004312516970e-01F; + float v249 = -3.6924873198582547e-01F; + float v250 = 3.6924873198582547e-01F; + float v256 = 4.8656938755549761e-01F; + float v257 = -4.8656938755549761e-01F; + float v263 = 2.3813712136760609e-01F; + float v264 = -2.3813712136760609e-01F; + float v270 = -1.5573820617422458e+00F; + float v271 = 1.5573820617422458e+00F; + float v277 = 6.5962247018731990e-01F; + float v278 = -6.5962247018731990e-01F; + float v284 = -1.4316961569866241e-01F; + float v285 = 1.4316961569866241e-01F; + float v291 = 2.3903469959860771e-01F; + float v292 = -2.3903469959860771e-01F; + float v298 = -4.7932541949972603e-02F; + float v299 = 4.7932541949972603e-02F; + float v305 = -2.3188014856550065e+00F; + float v306 = 2.3188014856550065e+00F; + float v312 = 7.8914568419206255e-01F; + float v313 = -7.8914568419206255e-01F; + float v319 = 3.8484572871179505e+00F; + float v320 = -3.8484572871179505e+00F; + float v326 = -1.3003804568801376e+00F; + float v327 = 1.3003804568801376e+00F; + float v333 = 4.0814769046889037e+00F; + float v334 = -4.0814769046889037e+00F; + float v340 = -1.4807159909286283e+00F; + float v341 = 1.4807159909286283e+00F; + float v347 = -1.3332470363551400e-02F; + float v348 = 1.3332470363551400e-02F; + float v354 = -3.7139778690557629e-01F; + float v355 = 3.7139778690557629e-01F; + float v361 = 1.9236512863456379e-01F; + float v362 = -1.9236512863456379e-01F; + float32x2_t v13 = v5[istride]; + float32x2_t v159 = v5[0]; + float32x2_t v167 = (float32x2_t){v166, v166}; + float32x2_t v171 = (float32x2_t){v170, v170}; + float32x2_t v175 = (float32x2_t){v174, v174}; + float32x2_t v179 = (float32x2_t){v178, v178}; + float32x2_t v183 = (float32x2_t){v182, v182}; + float32x2_t v187 = (float32x2_t){v186, v186}; + float32x2_t v191 = (float32x2_t){v190, v190}; + float32x2_t v195 = (float32x2_t){v194, v194}; + float32x2_t v199 = (float32x2_t){v198, v198}; + float32x2_t v203 = (float32x2_t){v202, v202}; + float32x2_t 
v207 = (float32x2_t){v206, v206}; + float32x2_t v211 = (float32x2_t){v210, v210}; + float32x2_t v215 = (float32x2_t){v214, v214}; + float32x2_t v219 = (float32x2_t){v218, v218}; + float32x2_t v223 = (float32x2_t){v221, v222}; + float32x2_t v230 = (float32x2_t){v228, v229}; + float32x2_t v237 = (float32x2_t){v235, v236}; + float32x2_t v244 = (float32x2_t){v242, v243}; + float32x2_t v251 = (float32x2_t){v249, v250}; + float32x2_t v258 = (float32x2_t){v256, v257}; + float32x2_t v265 = (float32x2_t){v263, v264}; + float32x2_t v272 = (float32x2_t){v270, v271}; + float32x2_t v279 = (float32x2_t){v277, v278}; + float32x2_t v286 = (float32x2_t){v284, v285}; + float32x2_t v293 = (float32x2_t){v291, v292}; + float32x2_t v300 = (float32x2_t){v298, v299}; + float32x2_t v307 = (float32x2_t){v305, v306}; + float32x2_t v314 = (float32x2_t){v312, v313}; + float32x2_t v321 = (float32x2_t){v319, v320}; + float32x2_t v328 = (float32x2_t){v326, v327}; + float32x2_t v335 = (float32x2_t){v333, v334}; + float32x2_t v342 = (float32x2_t){v340, v341}; + float32x2_t v349 = (float32x2_t){v347, v348}; + float32x2_t v356 = (float32x2_t){v354, v355}; + float32x2_t v363 = (float32x2_t){v361, v362}; + float32x2_t v364 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 16]; + float32x2_t v25 = v5[istride * 3]; + float32x2_t v30 = v5[istride * 14]; + float32x2_t v37 = v5[istride * 9]; + float32x2_t v42 = v5[istride * 8]; + float32x2_t v49 = v5[istride * 10]; + float32x2_t v54 = v5[istride * 7]; + float32x2_t v61 = v5[istride * 13]; + float32x2_t v66 = v5[istride * 4]; + float32x2_t v73 = v5[istride * 5]; + float32x2_t v78 = v5[istride * 12]; + float32x2_t v85 = v5[istride * 15]; + float32x2_t v90 = v5[istride * 2]; + float32x2_t v97 = v5[istride * 11]; + float32x2_t v102 = v5[istride * 6]; + float32x2_t v225 = vmul_f32(v364, v223); + float32x2_t v232 = vmul_f32(v364, v230); + float32x2_t v239 = vmul_f32(v364, v237); + float32x2_t v246 = vmul_f32(v364, v244); + float32x2_t v253 = 
vmul_f32(v364, v251); + float32x2_t v260 = vmul_f32(v364, v258); + float32x2_t v267 = vmul_f32(v364, v265); + float32x2_t v274 = vmul_f32(v364, v272); + float32x2_t v281 = vmul_f32(v364, v279); + float32x2_t v288 = vmul_f32(v364, v286); + float32x2_t v295 = vmul_f32(v364, v293); + float32x2_t v302 = vmul_f32(v364, v300); + float32x2_t v309 = vmul_f32(v364, v307); + float32x2_t v316 = vmul_f32(v364, v314); + float32x2_t v323 = vmul_f32(v364, v321); + float32x2_t v330 = vmul_f32(v364, v328); + float32x2_t v337 = vmul_f32(v364, v335); + float32x2_t v344 = vmul_f32(v364, v342); + float32x2_t v351 = vmul_f32(v364, v349); + float32x2_t v358 = vmul_f32(v364, v356); + float32x2_t v365 = vmul_f32(v364, v363); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v67 = vadd_f32(v61, v66); + float32x2_t v68 = vsub_f32(v61, v66); + float32x2_t v79 = vadd_f32(v73, v78); + float32x2_t v80 = vsub_f32(v73, v78); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v103 = vadd_f32(v97, v102); + float32x2_t v104 = vsub_f32(v97, v102); + float32x2_t v105 = vadd_f32(v19, v67); + float32x2_t v106 = vadd_f32(v31, v79); + float32x2_t v107 = vadd_f32(v43, v91); + float32x2_t v108 = vadd_f32(v55, v103); + float32x2_t v111 = vsub_f32(v19, v67); + float32x2_t v112 = vsub_f32(v31, v79); + float32x2_t v113 = vsub_f32(v43, v91); + float32x2_t v114 = vsub_f32(v55, v103); + float32x2_t v125 = vadd_f32(v20, v44); + float32x2_t v126 = vadd_f32(v32, v56); + float32x2_t v127 = vsub_f32(v20, v44); + float32x2_t v128 = vsub_f32(v104, v80); + float32x2_t v129 = vadd_f32(v68, v92); + float32x2_t v130 = vadd_f32(v80, v104); + float32x2_t v131 = vsub_f32(v68, v92); + float32x2_t v132 = 
vsub_f32(v32, v56); + float32x2_t v145 = vadd_f32(v20, v68); + float32x2_t v146 = vadd_f32(v56, v104); + float32x2_t v317 = vrev64_f32(v20); + float32x2_t v324 = vrev64_f32(v68); + float32x2_t v338 = vrev64_f32(v56); + float32x2_t v345 = vrev64_f32(v104); + float32x2_t v109 = vadd_f32(v105, v107); + float32x2_t v110 = vadd_f32(v106, v108); + float32x2_t v115 = vsub_f32(v105, v107); + float32x2_t v116 = vsub_f32(v106, v108); + float32x2_t v119 = vadd_f32(v112, v114); + float32x2_t v120 = vadd_f32(v111, v113); + float32x2_t v122 = vsub_f32(v113, v114); + float32x2_t v123 = vsub_f32(v111, v112); + float32x2_t v133 = vadd_f32(v125, v126); + float32x2_t v134 = vadd_f32(v129, v130); + float32x2_t v136 = vsub_f32(v125, v126); + float32x2_t v137 = vsub_f32(v129, v130); + float32x2_t v139 = vadd_f32(v127, v128); + float32x2_t v140 = vadd_f32(v131, v132); + float32x2_t v142 = vsub_f32(v127, v128); + float32x2_t v143 = vsub_f32(v131, v132); + float32x2_t v168 = vmul_f32(v111, v167); + float32x2_t v172 = vmul_f32(v112, v171); + float32x2_t v176 = vmul_f32(v113, v175); + float32x2_t v180 = vmul_f32(v114, v179); + float32x2_t v310 = vrev64_f32(v145); + float32x2_t v318 = vmul_f32(v317, v316); + float32x2_t v325 = vmul_f32(v324, v323); + float32x2_t v331 = vrev64_f32(v146); + float32x2_t v339 = vmul_f32(v338, v337); + float32x2_t v346 = vmul_f32(v345, v344); + float32x2_t v117 = vadd_f32(v109, v110); + float32x2_t v118 = vsub_f32(v109, v110); + float32x2_t v121 = vsub_f32(v120, v119); + float32x2_t v124 = vadd_f32(v115, v116); + float32x2_t v135 = vadd_f32(v133, v134); + float32x2_t v138 = vadd_f32(v136, v137); + float32x2_t v141 = vadd_f32(v139, v140); + float32x2_t v144 = vadd_f32(v142, v143); + float32x2_t v147 = vsub_f32(v140, v134); + float32x2_t v150 = vsub_f32(v133, v139); + float32x2_t v184 = vmul_f32(v115, v183); + float32x2_t v188 = vmul_f32(v116, v187); + float32x2_t v200 = vmul_f32(v119, v199); + float32x2_t v204 = vmul_f32(v120, v203); + float32x2_t v212 = 
vmul_f32(v122, v211); + float32x2_t v216 = vmul_f32(v123, v215); + float32x2_t v226 = vrev64_f32(v133); + float32x2_t v233 = vrev64_f32(v134); + float32x2_t v247 = vrev64_f32(v136); + float32x2_t v254 = vrev64_f32(v137); + float32x2_t v268 = vrev64_f32(v139); + float32x2_t v275 = vrev64_f32(v140); + float32x2_t v289 = vrev64_f32(v142); + float32x2_t v296 = vrev64_f32(v143); + float32x2_t v311 = vmul_f32(v310, v309); + float32x2_t v332 = vmul_f32(v331, v330); + float32x2_t v148 = vadd_f32(v147, v20); + float32x2_t v151 = vadd_f32(v150, v56); + float32x2_t v160 = vadd_f32(v159, v117); + float32x2_t v192 = vmul_f32(v117, v191); + float32x2_t v196 = vmul_f32(v118, v195); + float32x2_t v208 = vmul_f32(v121, v207); + float32x2_t v220 = vmul_f32(v124, v219); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v240 = vrev64_f32(v135); + float32x2_t v248 = vmul_f32(v247, v246); + float32x2_t v255 = vmul_f32(v254, v253); + float32x2_t v261 = vrev64_f32(v138); + float32x2_t v269 = vmul_f32(v268, v267); + float32x2_t v276 = vmul_f32(v275, v274); + float32x2_t v282 = vrev64_f32(v141); + float32x2_t v290 = vmul_f32(v289, v288); + float32x2_t v297 = vmul_f32(v296, v295); + float32x2_t v303 = vrev64_f32(v144); + float32x2_t v370 = vadd_f32(v180, v212); + float32x2_t v371 = vsub_f32(v212, v176); + float32x2_t v372 = vadd_f32(v172, v216); + float32x2_t v373 = vsub_f32(v168, v216); + float32x2_t v149 = vsub_f32(v148, v146); + float32x2_t v152 = vadd_f32(v151, v68); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v262 = vmul_f32(v261, v260); + float32x2_t v283 = vmul_f32(v282, v281); + float32x2_t v304 = vmul_f32(v303, v302); + float32x2_t v368 = vadd_f32(v200, v208); + float32x2_t v369 = vsub_f32(v204, v208); + float32x2_t v374 = vsub_f32(v220, v188); + float32x2_t v375 = vadd_f32(v220, v184); + float32x2_t v376 = vadd_f32(v192, v160); + v6[0] = v160; + float32x2_t v153 = vsub_f32(v152, v104); + float32x2_t v352 = 
vrev64_f32(v149); + float32x2_t v377 = vadd_f32(v196, v376); + float32x2_t v378 = vsub_f32(v376, v196); + float32x2_t v379 = vsub_f32(v368, v370); + float32x2_t v381 = vadd_f32(v369, v371); + float32x2_t v383 = vadd_f32(v368, v372); + float32x2_t v385 = vadd_f32(v369, v373); + float32x2_t v395 = vadd_f32(v227, v241); + float32x2_t v396 = vadd_f32(v234, v241); + float32x2_t v397 = vadd_f32(v248, v262); + float32x2_t v398 = vadd_f32(v255, v262); + float32x2_t v399 = vadd_f32(v269, v283); + float32x2_t v400 = vadd_f32(v276, v283); + float32x2_t v401 = vadd_f32(v290, v304); + float32x2_t v402 = vadd_f32(v297, v304); + float32x2_t v154 = vadd_f32(v149, v153); + float32x2_t v353 = vmul_f32(v352, v351); + float32x2_t v359 = vrev64_f32(v153); + float32x2_t v380 = vadd_f32(v374, v377); + float32x2_t v382 = vadd_f32(v375, v378); + float32x2_t v384 = vsub_f32(v377, v374); + float32x2_t v386 = vsub_f32(v378, v375); + float32x2_t v406 = vadd_f32(v395, v397); + float32x2_t v407 = vsub_f32(v395, v397); + float32x2_t v408 = vadd_f32(v396, v398); + float32x2_t v409 = vsub_f32(v396, v398); + float32x2_t v410 = vadd_f32(v399, v401); + float32x2_t v411 = vsub_f32(v401, v399); + float32x2_t v412 = vadd_f32(v400, v402); + float32x2_t v413 = vsub_f32(v402, v400); + float32x2_t v360 = vmul_f32(v359, v358); + float32x2_t v366 = vrev64_f32(v154); + float32x2_t v387 = vadd_f32(v379, v380); + float32x2_t v388 = vadd_f32(v381, v382); + float32x2_t v389 = vadd_f32(v383, v384); + float32x2_t v390 = vadd_f32(v385, v386); + float32x2_t v391 = vsub_f32(v380, v379); + float32x2_t v392 = vsub_f32(v382, v381); + float32x2_t v393 = vsub_f32(v384, v383); + float32x2_t v394 = vsub_f32(v386, v385); + float32x2_t v423 = vadd_f32(v408, v412); + float32x2_t v425 = vadd_f32(v407, v413); + float32x2_t v427 = vsub_f32(v406, v410); + float32x2_t v429 = vsub_f32(v413, v407); + float32x2_t v431 = vadd_f32(v406, v410); + float32x2_t v434 = vsub_f32(v411, v409); + float32x2_t v437 = vsub_f32(v412, v408); + 
float32x2_t v440 = vadd_f32(v409, v411); + float32x2_t v367 = vmul_f32(v366, v365); + float32x2_t v414 = vsub_f32(v353, v360); + float32x2_t v403 = vadd_f32(v367, v360); + float32x2_t v416 = vadd_f32(v414, v414); + float32x2_t v441 = vsub_f32(v440, v414); + float32x2_t v404 = vadd_f32(v311, v403); + float32x2_t v417 = vsub_f32(v332, v416); + float32x2_t v420 = vadd_f32(v403, v403); + float32x2_t v438 = vadd_f32(v437, v416); + float32x2_t v471 = vadd_f32(v394, v441); + float32x2_t v477 = vsub_f32(v394, v441); + float32x2_t v405 = vadd_f32(v404, v318); + float32x2_t v415 = vadd_f32(v404, v325); + float32x2_t v418 = vadd_f32(v417, v339); + float32x2_t v419 = vadd_f32(v417, v346); + float32x2_t v421 = vadd_f32(v420, v420); + float32x2_t v422 = vadd_f32(v414, v420); + float32x2_t v428 = vadd_f32(v427, v420); + float32x2_t v439 = vadd_f32(v438, v420); + v6[ostride * 3] = v471; + v6[ostride * 14] = v477; + float32x2_t v424 = vadd_f32(v423, v415); + float32x2_t v426 = vadd_f32(v425, v418); + float32x2_t v430 = vsub_f32(v429, v422); + float32x2_t v432 = vadd_f32(v431, v405); + float32x2_t v435 = vsub_f32(v434, v419); + float32x2_t v459 = vadd_f32(v389, v428); + float32x2_t v465 = vsub_f32(v389, v428); + float32x2_t v531 = vadd_f32(v393, v439); + float32x2_t v537 = vsub_f32(v393, v439); + float32x2_t v433 = vadd_f32(v432, v414); + float32x2_t v436 = vadd_f32(v435, v421); + float32x2_t v447 = vadd_f32(v387, v424); + float32x2_t v453 = vsub_f32(v387, v424); + v6[ostride * 2] = v459; + v6[ostride * 15] = v465; + float32x2_t v495 = vadd_f32(v390, v430); + float32x2_t v501 = vsub_f32(v390, v430); + float32x2_t v507 = vadd_f32(v388, v426); + float32x2_t v513 = vsub_f32(v388, v426); + v6[ostride * 8] = v531; + v6[ostride * 9] = v537; + v6[ostride] = v447; + v6[ostride * 16] = v453; + float32x2_t v483 = vadd_f32(v391, v433); + float32x2_t v489 = vsub_f32(v391, v433); + v6[ostride * 5] = v495; + v6[ostride * 12] = v501; + v6[ostride * 6] = v507; + v6[ostride * 11] = v513; + 
float32x2_t v519 = vadd_f32(v392, v436); + float32x2_t v525 = vsub_f32(v392, v436); + v6[ostride * 4] = v483; + v6[ostride * 13] = v489; + v6[ostride * 7] = v519; + v6[ostride * 10] = v525; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +/* SVE variant of the uun17 kernel (compiled only when ARMRAL_ARCH_SVE is defined). Reads 17 complex float samples from x at element offsets 0, istride, ..., 16 * istride and writes 17 results to y at element offsets 0, ostride, ..., 16 * ostride. Every sample is moved with svld1_f64/svst1_f64 governed by a predicate built from svptrue_pat_b32(SV_VL2) - presumably one 64-bit re/im pair per access, confirm against the ACLE. dir is multiplied into the constants that feed the rotation-90 svcmla terms. howmany is not referenced in this body. NOTE(review): the name suggests a length-17 FFT butterfly with dir selecting the transform direction - confirm against the kernel generator. */void armral_fft_cf32_cf32_cf32_ac_n_uun17(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + /* scalar constants; each is later either broadcast directly with svdup_n_f32 or first multiplied by dir */float v206 = -4.2602849117736000e-02F; + float v211 = 2.0497965023262180e-01F; + float v216 = 1.0451835201736759e+00F; + float v221 = 1.7645848660222969e+00F; + float v226 = -7.2340797728605655e-01F; + float v231 = -8.9055591620606403e-02F; + float v236 = -1.0625000000000000e+00F; + float v241 = 2.5769410160110379e-01F; + float v246 = 7.7980260789483757e-01F; + float v251 = 5.4389318464570580e-01F; + float v256 = 4.2010193497052700e-01F; + float v261 = 1.2810929434228073e+00F; + float v266 = 4.4088907348175338e-01F; + float v271 = 3.1717619283272508e-01F; + float v276 = 9.0138318648016680e-01F; + float v283 = 4.3248756360072310e-01F; + float v290 = -6.6693537504044498e-01F; + float v297 = 6.0389004312516970e-01F; + float v304 = 3.6924873198582547e-01F; + float v311 = -4.8656938755549761e-01F; + float v318 = -2.3813712136760609e-01F; + float v325 = 1.5573820617422458e+00F; + float v332 = -6.5962247018731990e-01F; + float v339 = 1.4316961569866241e-01F; + float v346 = -2.3903469959860771e-01F; + float v353 = 4.7932541949972603e-02F; + float v360 = 2.3188014856550065e+00F; + float v367 = -7.8914568419206255e-01F; + float v374 = -3.8484572871179505e+00F; + float v381 = 1.3003804568801376e+00F; + float v388 = -4.0814769046889037e+00F; + float v395 = 1.4807159909286283e+00F; + float v402 = 1.3332470363551400e-02F; + float v409 = 3.7139778690557629e-01F; + float v416 =
-1.9236512863456379e-01F; + const float32x2_t *v637 = &v5[v0]; + float32x2_t *v837 = &v6[v2]; + /* element offsets of the remaining inputs, as multiples of istride */int64_t v22 = v0 * 16; + int64_t v31 = v0 * 3; + int64_t v38 = v0 * 14; + int64_t v47 = v0 * 9; + int64_t v54 = v0 * 8; + int64_t v63 = v0 * 10; + int64_t v70 = v0 * 7; + int64_t v79 = v0 * 13; + int64_t v86 = v0 * 4; + int64_t v95 = v0 * 5; + int64_t v102 = v0 * 12; + int64_t v111 = v0 * 15; + int64_t v118 = v0 * 2; + int64_t v127 = v0 * 11; + int64_t v134 = v0 * 6; + /* pre-multiply these constants by dir; the products are the ones passed to svcmla below */float v279 = v4 * v276; + float v286 = v4 * v283; + float v293 = v4 * v290; + float v300 = v4 * v297; + float v307 = v4 * v304; + float v314 = v4 * v311; + float v321 = v4 * v318; + float v328 = v4 * v325; + float v335 = v4 * v332; + float v342 = v4 * v339; + float v349 = v4 * v346; + float v356 = v4 * v353; + float v363 = v4 * v360; + float v370 = v4 * v367; + float v377 = v4 * v374; + float v384 = v4 * v381; + float v391 = v4 * v388; + float v398 = v4 * v395; + float v405 = v4 * v402; + float v412 = v4 * v409; + float v419 = v4 * v416; + /* element offsets of the remaining outputs, as multiples of ostride */int64_t v513 = v2 * 16; + int64_t v521 = v2 * 2; + int64_t v529 = v2 * 15; + int64_t v537 = v2 * 3; + int64_t v545 = v2 * 14; + int64_t v553 = v2 * 4; + int64_t v561 = v2 * 13; + int64_t v569 = v2 * 5; + int64_t v577 = v2 * 12; + int64_t v585 = v2 * 6; + int64_t v593 = v2 * 11; + int64_t v601 = v2 * 7; + int64_t v609 = v2 * 10; + int64_t v617 = v2 * 8; + int64_t v625 = v2 * 9; + const float32x2_t *v782 = &v5[0]; + svfloat32_t v786 = svdup_n_f32(v206); + svfloat32_t v787 = svdup_n_f32(v211); + svfloat32_t v788 = svdup_n_f32(v216); + svfloat32_t v789 = svdup_n_f32(v221); + svfloat32_t v790 = svdup_n_f32(v226); + svfloat32_t v791 = svdup_n_f32(v231); + svfloat32_t v792 = svdup_n_f32(v236); + svfloat32_t v793 = svdup_n_f32(v241); + svfloat32_t v794 = svdup_n_f32(v246); + svfloat32_t v795 = svdup_n_f32(v251); + svfloat32_t v796 = svdup_n_f32(v256); + svfloat32_t v797 = svdup_n_f32(v261); + svfloat32_t v798 = svdup_n_f32(v266); + svfloat32_t v799 = svdup_n_f32(v271); +
float32x2_t *v828 = &v6[0]; + svfloat32_t v976 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v637)[0])); + const float32x2_t *v646 = &v5[v22]; + const float32x2_t *v655 = &v5[v31]; + const float32x2_t *v664 = &v5[v38]; + const float32x2_t *v673 = &v5[v47]; + const float32x2_t *v682 = &v5[v54]; + const float32x2_t *v691 = &v5[v63]; + const float32x2_t *v700 = &v5[v70]; + const float32x2_t *v709 = &v5[v79]; + const float32x2_t *v718 = &v5[v86]; + const float32x2_t *v727 = &v5[v95]; + const float32x2_t *v736 = &v5[v102]; + const float32x2_t *v745 = &v5[v111]; + const float32x2_t *v754 = &v5[v118]; + const float32x2_t *v763 = &v5[v127]; + const float32x2_t *v772 = &v5[v134]; + svfloat32_t v800 = svdup_n_f32(v279); + svfloat32_t v801 = svdup_n_f32(v286); + svfloat32_t v802 = svdup_n_f32(v293); + svfloat32_t v803 = svdup_n_f32(v300); + svfloat32_t v804 = svdup_n_f32(v307); + svfloat32_t v805 = svdup_n_f32(v314); + svfloat32_t v806 = svdup_n_f32(v321); + svfloat32_t v807 = svdup_n_f32(v328); + svfloat32_t v808 = svdup_n_f32(v335); + svfloat32_t v809 = svdup_n_f32(v342); + svfloat32_t v810 = svdup_n_f32(v349); + svfloat32_t v811 = svdup_n_f32(v356); + svfloat32_t v812 = svdup_n_f32(v363); + svfloat32_t v813 = svdup_n_f32(v370); + svfloat32_t v814 = svdup_n_f32(v377); + svfloat32_t v815 = svdup_n_f32(v384); + svfloat32_t v816 = svdup_n_f32(v391); + svfloat32_t v817 = svdup_n_f32(v398); + svfloat32_t v818 = svdup_n_f32(v405); + svfloat32_t v819 = svdup_n_f32(v412); + svfloat32_t v820 = svdup_n_f32(v419); + float32x2_t *v846 = &v6[v513]; + float32x2_t *v855 = &v6[v521]; + float32x2_t *v864 = &v6[v529]; + float32x2_t *v873 = &v6[v537]; + float32x2_t *v882 = &v6[v545]; + float32x2_t *v891 = &v6[v553]; + float32x2_t *v900 = &v6[v561]; + float32x2_t *v909 = &v6[v569]; + float32x2_t *v918 = &v6[v577]; + float32x2_t *v927 = &v6[v585]; + float32x2_t *v936 = &v6[v593]; + float32x2_t *v945 = &v6[v601]; + float32x2_t *v954 = &v6[v609]; + float32x2_t *v963 =
&v6[v617]; + float32x2_t *v972 = &v6[v625]; + svfloat32_t v1008 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v782)[0])); + svfloat32_t v978 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v646)[0])); + svfloat32_t v980 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v655)[0])); + svfloat32_t v982 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v664)[0])); + svfloat32_t v984 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v673)[0])); + svfloat32_t v986 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v682)[0])); + svfloat32_t v988 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v691)[0])); + svfloat32_t v990 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v700)[0])); + svfloat32_t v992 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v709)[0])); + svfloat32_t v994 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v718)[0])); + svfloat32_t v996 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v727)[0])); + svfloat32_t v998 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v736)[0])); + svfloat32_t v1000 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v745)[0])); + svfloat32_t v1002 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v754)[0])); + svfloat32_t v1004 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v763)[0])); + svfloat32_t v1006 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v772)[0])); + /* first stage: pairwise sums and differences of the loaded inputs */svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v976, v978); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v976, v978); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v980, v982); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v980, v982); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v984, v986); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v984, v986); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v988,
v990); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v988, v990); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v992, v994); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v992, v994); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v996, v998); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v996, v998); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v1000, v1002); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v1000, v1002); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v1004, v1006); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v1004, v1006); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v28, v92); + svfloat32_t v143 = svadd_f32_x(svptrue_b32(), v44, v108); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v60, v124); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v76, v140); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v28, v92); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v44, v108); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v60, v124); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v76, v140); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v29, v61); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v45, v77); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v29, v61); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v141, v109); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v93, v125); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v109, v141); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v93, v125); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v45, v77); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v29, v93); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v77, v141); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v142, v144); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v142, v144); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v149, v151); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v148,
v150); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v150, v151); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v148, v149); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v163); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v162, v163); + svfloat32_t v174 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v164, v165); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v164, v165); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v219 = svmul_f32_x(svptrue_b32(), v150, v788); + svfloat32_t zero386 = svdup_n_f32(0); + svfloat32_t v386 = svcmla_f32_x(pred_full, zero386, v815, v183, 90); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v146, v147); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v146, v147); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v157, v156); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v153); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v173, v174); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v176, v177); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v179, v180); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v177, v171); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v170, v176); + svfloat32_t v229 = svmul_f32_x(svptrue_b32(), v152, v790); + svfloat32_t v234 = svmul_f32_x(svptrue_b32(), v153, v791); + svfloat32_t v264 = svmul_f32_x(svptrue_b32(), v159, v797); + svfloat32_t v269 = svmul_f32_x(svptrue_b32(), v160, v798); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v184, v29); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v187, v77); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v1008, v154); + svfloat32_t v259 = svmul_f32_x(svptrue_b32(), v158, v796); + svfloat32_t zero295 = svdup_n_f32(0); + svfloat32_t v295 = svcmla_f32_x(pred_full, zero295, v802, v172, 90); +
svfloat32_t zero316 = svdup_n_f32(0); + svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v805, v175, 90); + svfloat32_t zero337 = svdup_n_f32(0); + svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v808, v178, 90); + svfloat32_t zero358 = svdup_n_f32(0); + svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v811, v181, 90); + svfloat32_t v424 = svmla_f32_x(pred_full, v264, v151, v789); + svfloat32_t v425 = svnmls_f32_x(pred_full, v219, v159, v797); + svfloat32_t v426 = svmla_f32_x(pred_full, v269, v149, v787); + svfloat32_t v427 = svnmls_f32_x(pred_full, v269, v148, v786); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v185, v183); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v93); + svfloat32_t v422 = svmla_f32_x(pred_full, v259, v156, v794); + svfloat32_t v423 = svnmls_f32_x(pred_full, v259, v157, v795); + svfloat32_t v428 = svnmls_f32_x(pred_full, v234, v161, v799); + svfloat32_t v429 = svmla_f32_x(pred_full, v229, v161, v799); + svfloat32_t v430 = svmla_f32_x(pred_full, v199, v154, v792); + svfloat32_t v449 = svcmla_f32_x(pred_full, v295, v800, v170, 90); + svfloat32_t v450 = svcmla_f32_x(pred_full, v295, v801, v171, 90); + svfloat32_t v451 = svcmla_f32_x(pred_full, v316, v803, v173, 90); + svfloat32_t v452 = svcmla_f32_x(pred_full, v316, v804, v174, 90); + svfloat32_t v453 = svcmla_f32_x(pred_full, v337, v806, v176, 90); + svfloat32_t v454 = svcmla_f32_x(pred_full, v337, v807, v177, 90); + svfloat32_t v455 = svcmla_f32_x(pred_full, v358, v809, v179, 90); + svfloat32_t v456 = svcmla_f32_x(pred_full, v358, v810, v180, 90); + /* v199 (the input at offset 0 plus v154) is stored as the output at offset 0 */svst1_f64(pred_full, (double *)(v828), svreinterpret_f64_f32(v199)); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v189, v141); + svfloat32_t zero407 = svdup_n_f32(0); + svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v818, v186, 90); + svfloat32_t v431 = svmla_f32_x(pred_full, v430, v155, v793); + svfloat32_t v432 = svmls_f32_x(pred_full, v430, v155, v793); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v422, v424); +
svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v422, v426); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v423, v427); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v449, v451); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v449, v451); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v450, v452); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v450, v452); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v453, v455); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v455, v453); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v456, v454); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v186, v190); + svfloat32_t zero414 = svdup_n_f32(0); + svfloat32_t v414 = svcmla_f32_x(pred_full, zero414, v819, v190, 90); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v428, v431); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v429, v432); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v431, v428); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v432, v429); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v462, v466); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v461, v467); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v460, v464); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v467, v461); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v460, v464); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v465, v463); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v466, v462); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v463, v465); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v433, v434); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v435, v436); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v437, v438); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v439, v440); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v434, v433); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v436, v435); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v438, v437);
+ svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v440, v439); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v407, v414); + svfloat32_t v457 = svcmla_f32_x(pred_full, v414, v820, v191, 90); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v468, v468); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v494, v468); + svfloat32_t v458 = svcmla_f32_x(pred_full, v457, v812, v182, 90); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v386, v470); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v457, v457); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v491, v470); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v448, v495); + svfloat32_t v543 = svsub_f32_x(svptrue_b32(), v448, v495); + svfloat32_t v459 = svcmla_f32_x(pred_full, v458, v813, v29, 90); + svfloat32_t v469 = svcmla_f32_x(pred_full, v458, v814, v93, 90); + svfloat32_t v472 = svcmla_f32_x(pred_full, v471, v816, v77, 90); + svfloat32_t v473 = svcmla_f32_x(pred_full, v471, v817, v141, 90); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v474, v474); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v468, v474); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v474); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v492, v474); + svst1_f64(pred_full, (double *)(v873), svreinterpret_f64_f32(v535)); + svst1_f64(pred_full, (double *)(v882), svreinterpret_f64_f32(v543)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v477, v469); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v479, v472); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v483, v476); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v485, v459); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v488, v473); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v443, v482); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v443, v482); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v447, v493); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v447, v493); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v486, v468); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(),
v489, v475); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v441, v478); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v441, v478); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v444, v484); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v444, v484); + svfloat32_t v583 = svadd_f32_x(svptrue_b32(), v442, v480); + svfloat32_t v591 = svsub_f32_x(svptrue_b32(), v442, v480); + svst1_f64(pred_full, (double *)(v855), svreinterpret_f64_f32(v519)); + svst1_f64(pred_full, (double *)(v864), svreinterpret_f64_f32(v527)); + svst1_f64(pred_full, (double *)(v963), svreinterpret_f64_f32(v615)); + svst1_f64(pred_full, (double *)(v972), svreinterpret_f64_f32(v623)); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v445, v487); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v445, v487); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v446, v490); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v446, v490); + svst1_f64(pred_full, (double *)(v837), svreinterpret_f64_f32(v503)); + svst1_f64(pred_full, (double *)(v846), svreinterpret_f64_f32(v511)); + svst1_f64(pred_full, (double *)(v909), svreinterpret_f64_f32(v567)); + svst1_f64(pred_full, (double *)(v918), svreinterpret_f64_f32(v575)); + svst1_f64(pred_full, (double *)(v927), svreinterpret_f64_f32(v583)); + svst1_f64(pred_full, (double *)(v936), svreinterpret_f64_f32(v591)); + svst1_f64(pred_full, (double *)(v891), svreinterpret_f64_f32(v551)); + svst1_f64(pred_full, (double *)(v900), svreinterpret_f64_f32(v559)); + svst1_f64(pred_full, (double *)(v945), svreinterpret_f64_f32(v599)); + svst1_f64(pred_full, (double *)(v954), svreinterpret_f64_f32(v607)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +/* Non-SVE variant of the uun18 kernel (compiled only when ARMRAL_ARCH_SVE is not defined). Reads 18 float32x2_t values from x at element offsets 0, istride, ..., 17 * istride and writes 18 float32x2_t results to y at element offsets 0, ostride, ..., 17 * ostride. dir is broadcast to both lanes and multiplied into the two-lane constants whose lanes carry opposite signs; those products are applied to vrev64_f32 lane-swapped values. howmany is not referenced in this body. NOTE(review): the name suggests a length-18 FFT butterfly with dir selecting the transform direction - confirm against the kernel generator. */void armral_fft_cf32_cf32_cf32_ac_n_uun18(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v246 = -5.0000000000000000e-01F; + float v257 =
-1.4999999999999998e+00F; + float v260 = 8.6602540378443871e-01F; + float v261 = -8.6602540378443871e-01F; + float v268 = 7.6604444311897801e-01F; + float v272 = 9.3969262078590832e-01F; + float v276 = -1.7364817766693039e-01F; + float v279 = 6.4278760968653925e-01F; + float v280 = -6.4278760968653925e-01F; + float v286 = -3.4202014332566888e-01F; + float v287 = 3.4202014332566888e-01F; + float v293 = 9.8480775301220802e-01F; + float v294 = -9.8480775301220802e-01F; + /* inputs are read from v5 at multiples of istride; the loads are interleaved with packing the scalar constants into float32x2_t pairs */float32x2_t v13 = v5[0]; + float32x2_t v78 = v5[istride]; + float32x2_t v247 = (float32x2_t){v246, v246}; + float32x2_t v258 = (float32x2_t){v257, v257}; + float32x2_t v262 = (float32x2_t){v260, v261}; + float32x2_t v269 = (float32x2_t){v268, v268}; + float32x2_t v273 = (float32x2_t){v272, v272}; + float32x2_t v277 = (float32x2_t){v276, v276}; + float32x2_t v281 = (float32x2_t){v279, v280}; + float32x2_t v288 = (float32x2_t){v286, v287}; + float32x2_t v295 = (float32x2_t){v293, v294}; + float32x2_t v296 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 9]; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v30 = v5[istride * 11]; + float32x2_t v37 = v5[istride * 4]; + float32x2_t v42 = v5[istride * 13]; + float32x2_t v49 = v5[istride * 6]; + float32x2_t v54 = v5[istride * 15]; + float32x2_t v61 = v5[istride * 8]; + float32x2_t v66 = v5[istride * 17]; + float32x2_t v73 = v5[istride * 10]; + float32x2_t v85 = v5[istride * 12]; + float32x2_t v90 = v5[istride * 3]; + float32x2_t v97 = v5[istride * 14]; + float32x2_t v102 = v5[istride * 5]; + float32x2_t v109 = v5[istride * 16]; + float32x2_t v114 = v5[istride * 7]; + /* scale the opposite-signed constant pairs by dir (v296 holds dir in both lanes) */float32x2_t v264 = vmul_f32(v296, v262); + float32x2_t v283 = vmul_f32(v296, v281); + float32x2_t v290 = vmul_f32(v296, v288); + float32x2_t v297 = vmul_f32(v296, v295); + /* first stage: sums and differences of input pairs */float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44
= vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v67 = vadd_f32(v61, v66); + float32x2_t v68 = vsub_f32(v61, v66); + float32x2_t v79 = vadd_f32(v73, v78); + float32x2_t v80 = vsub_f32(v73, v78); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v103 = vadd_f32(v97, v102); + float32x2_t v104 = vsub_f32(v97, v102); + float32x2_t v115 = vadd_f32(v109, v114); + float32x2_t v116 = vsub_f32(v109, v114); + float32x2_t v117 = vadd_f32(v31, v115); + float32x2_t v118 = vsub_f32(v31, v115); + float32x2_t v119 = vadd_f32(v103, v43); + float32x2_t v120 = vsub_f32(v103, v43); + float32x2_t v121 = vadd_f32(v55, v91); + float32x2_t v122 = vsub_f32(v55, v91); + float32x2_t v123 = vadd_f32(v67, v79); + float32x2_t v124 = vsub_f32(v67, v79); + float32x2_t v221 = vadd_f32(v32, v116); + float32x2_t v222 = vsub_f32(v32, v116); + float32x2_t v223 = vadd_f32(v104, v44); + float32x2_t v224 = vsub_f32(v104, v44); + float32x2_t v225 = vadd_f32(v56, v92); + float32x2_t v226 = vsub_f32(v56, v92); + float32x2_t v227 = vadd_f32(v68, v80); + float32x2_t v228 = vsub_f32(v68, v80); + float32x2_t v125 = vadd_f32(v117, v119); + float32x2_t v129 = vadd_f32(v118, v120); + float32x2_t v131 = vsub_f32(v117, v119); + float32x2_t v132 = vsub_f32(v119, v123); + float32x2_t v133 = vsub_f32(v123, v117); + float32x2_t v134 = vsub_f32(v118, v120); + float32x2_t v135 = vsub_f32(v120, v124); + float32x2_t v136 = vsub_f32(v124, v118); + float32x2_t v155 = vmul_f32(v121, v258); + float32x2_t v161 = vrev64_f32(v122); + float32x2_t v229 = vadd_f32(v221, v223); + float32x2_t v233 = vadd_f32(v222, v224); + float32x2_t v235 = vsub_f32(v221, v223); + float32x2_t v236 = vsub_f32(v223, v227); + float32x2_t v237 = vsub_f32(v227, v221); + float32x2_t v238 = vsub_f32(v222, v224); + float32x2_t v239 = vsub_f32(v224, v228); + float32x2_t v240 = vsub_f32(v228, v222); + float32x2_t v259 = vmul_f32(v225, v258); +
float32x2_t v265 = vrev64_f32(v226); + float32x2_t v126 = vadd_f32(v125, v123); + float32x2_t v130 = vadd_f32(v129, v124); + float32x2_t v162 = vmul_f32(v161, v264); + float32x2_t v166 = vmul_f32(v131, v269); + float32x2_t v170 = vmul_f32(v132, v273); + float32x2_t v174 = vmul_f32(v133, v277); + float32x2_t v180 = vrev64_f32(v134); + float32x2_t v187 = vrev64_f32(v135); + float32x2_t v194 = vrev64_f32(v136); + float32x2_t v230 = vadd_f32(v229, v227); + float32x2_t v234 = vadd_f32(v233, v228); + float32x2_t v266 = vmul_f32(v265, v264); + float32x2_t v270 = vmul_f32(v235, v269); + float32x2_t v274 = vmul_f32(v236, v273); + float32x2_t v278 = vmul_f32(v237, v277); + float32x2_t v284 = vrev64_f32(v238); + float32x2_t v291 = vrev64_f32(v239); + float32x2_t v298 = vrev64_f32(v240); + float32x2_t v127 = vadd_f32(v126, v121); + float32x2_t v144 = vmul_f32(v126, v247); + float32x2_t v150 = vrev64_f32(v130); + float32x2_t v181 = vmul_f32(v180, v283); + float32x2_t v188 = vmul_f32(v187, v290); + float32x2_t v195 = vmul_f32(v194, v297); + float32x2_t v231 = vadd_f32(v230, v225); + float32x2_t v248 = vmul_f32(v230, v247); + float32x2_t v254 = vrev64_f32(v234); + float32x2_t v285 = vmul_f32(v284, v283); + float32x2_t v292 = vmul_f32(v291, v290); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v128 = vadd_f32(v127, v19); + float32x2_t v151 = vmul_f32(v150, v264); + float32x2_t v196 = vadd_f32(v144, v144); + float32x2_t v209 = vadd_f32(v162, v181); + float32x2_t v211 = vsub_f32(v162, v188); + float32x2_t v213 = vsub_f32(v162, v181); + float32x2_t v232 = vadd_f32(v231, v20); + float32x2_t v255 = vmul_f32(v254, v264); + float32x2_t v300 = vadd_f32(v248, v248); + float32x2_t v313 = vadd_f32(v266, v285); + float32x2_t v315 = vsub_f32(v266, v292); + float32x2_t v317 = vsub_f32(v266, v285); + float32x2_t v197 = vadd_f32(v196, v144); + float32x2_t v201 = vadd_f32(v128, v155); + float32x2_t v210 = vadd_f32(v209, v188); + float32x2_t v212 = vadd_f32(v211, v195); + float32x2_t v214
= vsub_f32(v213, v195); + float32x2_t v301 = vadd_f32(v300, v248); + float32x2_t v305 = vadd_f32(v232, v259); + float32x2_t v314 = vadd_f32(v313, v292); + float32x2_t v316 = vadd_f32(v315, v299); + float32x2_t v318 = vsub_f32(v317, v299); + /* results are written to v6 at multiples of ostride, starting with offsets 0 and 9 * ostride */v6[0] = v128; + v6[ostride * 9] = v232; + float32x2_t v198 = vadd_f32(v128, v197); + float32x2_t v202 = vadd_f32(v201, v196); + float32x2_t v302 = vadd_f32(v232, v301); + float32x2_t v306 = vadd_f32(v305, v300); + float32x2_t v199 = vadd_f32(v198, v151); + float32x2_t v200 = vsub_f32(v198, v151); + float32x2_t v203 = vadd_f32(v202, v166); + float32x2_t v205 = vsub_f32(v202, v170); + float32x2_t v207 = vsub_f32(v202, v166); + float32x2_t v303 = vadd_f32(v302, v255); + float32x2_t v304 = vsub_f32(v302, v255); + float32x2_t v307 = vadd_f32(v306, v270); + float32x2_t v309 = vsub_f32(v306, v274); + float32x2_t v311 = vsub_f32(v306, v270); + float32x2_t v204 = vadd_f32(v203, v170); + float32x2_t v206 = vadd_f32(v205, v174); + float32x2_t v208 = vsub_f32(v207, v174); + float32x2_t v308 = vadd_f32(v307, v274); + float32x2_t v310 = vadd_f32(v309, v278); + float32x2_t v312 = vsub_f32(v311, v278); + v6[ostride * 12] = v200; + v6[ostride * 3] = v304; + v6[ostride * 6] = v199; + v6[ostride * 15] = v303; + float32x2_t v215 = vadd_f32(v204, v210); + float32x2_t v216 = vsub_f32(v204, v210); + float32x2_t v217 = vadd_f32(v206, v212); + float32x2_t v218 = vsub_f32(v206, v212); + float32x2_t v219 = vadd_f32(v208, v214); + float32x2_t v220 = vsub_f32(v208, v214); + float32x2_t v319 = vadd_f32(v308, v314); + float32x2_t v320 = vsub_f32(v308, v314); + float32x2_t v321 = vadd_f32(v310, v316); + float32x2_t v322 = vsub_f32(v310, v316); + float32x2_t v323 = vadd_f32(v312, v318); + float32x2_t v324 = vsub_f32(v312, v318); + v6[ostride * 10] = v216; + v6[ostride] = v320; + v6[ostride * 2] = v217; + v6[ostride * 11] = v321; + v6[ostride * 4] = v220; + v6[ostride * 13] = v324; + v6[ostride * 14] = v219; + v6[ostride * 5] = v323; + v6[ostride * 16] = v218; +
v6[ostride * 7] = v322; + v6[ostride * 8] = v215; + v6[ostride * 17] = v319; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun18(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v294 = -5.0000000000000000e-01F; + float v306 = -1.4999999999999998e+00F; + float v311 = -8.6602540378443871e-01F; + float v318 = 7.6604444311897801e-01F; + float v323 = 9.3969262078590832e-01F; + float v328 = -1.7364817766693039e-01F; + float v333 = -6.4278760968653925e-01F; + float v340 = 3.4202014332566888e-01F; + float v347 = -9.8480775301220802e-01F; + const float32x2_t *v610 = &v5[v0]; + float32x2_t *v723 = &v6[v2]; + int64_t v22 = v0 * 9; + int64_t v31 = v0 * 2; + int64_t v38 = v0 * 11; + int64_t v47 = v0 * 4; + int64_t v54 = v0 * 13; + int64_t v63 = v0 * 6; + int64_t v70 = v0 * 15; + int64_t v79 = v0 * 8; + int64_t v86 = v0 * 17; + int64_t v95 = v0 * 10; + int64_t v111 = v0 * 12; + int64_t v118 = v0 * 3; + int64_t v127 = v0 * 14; + int64_t v134 = v0 * 5; + int64_t v143 = v0 * 16; + int64_t v150 = v0 * 7; + float v314 = v4 * v311; + float v336 = v4 * v333; + float v343 = v4 * v340; + float v350 = v4 * v347; + int64_t v386 = v2 * 9; + int64_t v393 = v2 * 10; + int64_t v407 = v2 * 2; + int64_t v414 = v2 * 11; + int64_t v421 = v2 * 12; + int64_t v428 = v2 * 3; + int64_t v435 = v2 * 4; + int64_t v442 = v2 * 13; + int64_t v449 = v2 * 14; + int64_t v456 = v2 * 5; + int64_t v463 = v2 * 6; + int64_t v470 = v2 * 15; + int64_t v477 = v2 * 16; + int64_t v484 = v2 * 7; + int64_t v491 = v2 * 8; + int64_t v498 = v2 * 17; + const float32x2_t *v511 = &v5[0]; + svfloat32_t v679 = svdup_n_f32(v294); + svfloat32_t v681 = svdup_n_f32(v306); + svfloat32_t v683 = svdup_n_f32(v318); +
svfloat32_t v684 = svdup_n_f32(v323); + svfloat32_t v685 = svdup_n_f32(v328); + float32x2_t *v696 = &v6[0]; + svfloat32_t v875 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v610)[0])); + const float32x2_t *v520 = &v5[v22]; + const float32x2_t *v529 = &v5[v31]; + const float32x2_t *v538 = &v5[v38]; + const float32x2_t *v547 = &v5[v47]; + const float32x2_t *v556 = &v5[v54]; + const float32x2_t *v565 = &v5[v63]; + const float32x2_t *v574 = &v5[v70]; + const float32x2_t *v583 = &v5[v79]; + const float32x2_t *v592 = &v5[v86]; + const float32x2_t *v601 = &v5[v95]; + const float32x2_t *v619 = &v5[v111]; + const float32x2_t *v628 = &v5[v118]; + const float32x2_t *v637 = &v5[v127]; + const float32x2_t *v646 = &v5[v134]; + const float32x2_t *v655 = &v5[v143]; + const float32x2_t *v664 = &v5[v150]; + svfloat32_t v682 = svdup_n_f32(v314); + svfloat32_t v686 = svdup_n_f32(v336); + svfloat32_t v687 = svdup_n_f32(v343); + svfloat32_t v688 = svdup_n_f32(v350); + float32x2_t *v705 = &v6[v386]; + float32x2_t *v714 = &v6[v393]; + float32x2_t *v732 = &v6[v407]; + float32x2_t *v741 = &v6[v414]; + float32x2_t *v750 = &v6[v421]; + float32x2_t *v759 = &v6[v428]; + float32x2_t *v768 = &v6[v435]; + float32x2_t *v777 = &v6[v442]; + float32x2_t *v786 = &v6[v449]; + float32x2_t *v795 = &v6[v456]; + float32x2_t *v804 = &v6[v463]; + float32x2_t *v813 = &v6[v470]; + float32x2_t *v822 = &v6[v477]; + float32x2_t *v831 = &v6[v484]; + float32x2_t *v840 = &v6[v491]; + float32x2_t *v849 = &v6[v498]; + svfloat32_t v853 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v511)[0])); + svfloat32_t v855 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v520)[0])); + svfloat32_t v857 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v529)[0])); + svfloat32_t v859 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v538)[0])); + svfloat32_t v861 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v547)[0])); + svfloat32_t 
v863 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v556)[0])); + svfloat32_t v865 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v565)[0])); + svfloat32_t v867 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v574)[0])); + svfloat32_t v869 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v583)[0])); + svfloat32_t v871 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v592)[0])); + svfloat32_t v873 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v601)[0])); + svfloat32_t v877 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v619)[0])); + svfloat32_t v879 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v628)[0])); + svfloat32_t v881 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v637)[0])); + svfloat32_t v883 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v646)[0])); + svfloat32_t v885 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v655)[0])); + svfloat32_t v887 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v664)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v853, v855); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v853, v855); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v857, v859); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v857, v859); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v861, v863); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v861, v863); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v865, v867); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v865, v867); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v869, v871); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v869, v871); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v873, v875); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v873, v875); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v877, v879); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v877, v879); + svfloat32_t 
v140 = svadd_f32_x(svptrue_b32(), v881, v883); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v881, v883); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v885, v887); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v885, v887); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v44, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v44, v156); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v140, v60); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v140, v60); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v76, v124); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v76, v124); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v92, v108); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v92, v108); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v45, v157); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v45, v157); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v141, v61); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v141, v61); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v77, v125); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v77, v125); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v93, v109); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v93, v109); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v158, v160); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v159, v161); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v158, v160); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v160, v164); + svfloat32_t v174 = svsub_f32_x(svptrue_b32(), v164, v158); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v159, v161); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v161, v165); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v165, v159); + svfloat32_t zero206 = svdup_n_f32(0); + svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v682, v163, 90); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v268, v270); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v268, v270); + svfloat32_t v283 = 
svsub_f32_x(svptrue_b32(), v270, v274); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v274, v268); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v271, v275); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v275, v269); + svfloat32_t zero316 = svdup_n_f32(0); + svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v682, v273, 90); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v166, v164); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v170, v165); + svfloat32_t zero228 = svdup_n_f32(0); + svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v686, v175, 90); + svfloat32_t zero235 = svdup_n_f32(0); + svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v687, v176, 90); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v688, v177, 90); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v276, v274); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v275); + svfloat32_t zero338 = svdup_n_f32(0); + svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v686, v285, 90); + svfloat32_t zero345 = svdup_n_f32(0); + svfloat32_t v345 = svcmla_f32_x(pred_full, zero345, v687, v286, 90); + svfloat32_t zero352 = svdup_n_f32(0); + svfloat32_t v352 = svcmla_f32_x(pred_full, zero352, v688, v287, 90); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v167, v162); + svfloat32_t v187 = svmul_f32_x(svptrue_b32(), v167, v679); + svfloat32_t zero194 = svdup_n_f32(0); + svfloat32_t v194 = svcmla_f32_x(pred_full, zero194, v682, v171, 90); + svfloat32_t v256 = svadd_f32_x(svptrue_b32(), v206, v228); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v206, v235); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v206, v228); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v277, v272); + svfloat32_t v297 = svmul_f32_x(svptrue_b32(), v277, v679); + svfloat32_t zero304 = svdup_n_f32(0); + svfloat32_t v304 = svcmla_f32_x(pred_full, zero304, v682, v281, 90); + svfloat32_t v366 = 
svadd_f32_x(svptrue_b32(), v316, v338); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v316, v345); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v316, v338); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v28); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v187, v187); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v256, v235); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v258, v242); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v260, v242); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v278, v29); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v297, v297); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v366, v345); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v368, v352); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v370, v352); + svfloat32_t v244 = svmla_f32_x(pred_full, v243, v167, v679); + svfloat32_t v248 = svmla_f32_x(pred_full, v169, v162, v681); + svfloat32_t v354 = svmla_f32_x(pred_full, v353, v277, v679); + svfloat32_t v358 = svmla_f32_x(pred_full, v279, v272, v681); + svst1_f64(pred_full, (double *)(v696), svreinterpret_f64_f32(v169)); + svst1_f64(pred_full, (double *)(v705), svreinterpret_f64_f32(v279)); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v169, v244); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v248, v243); + svfloat32_t v355 = svadd_f32_x(svptrue_b32(), v279, v354); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v358, v353); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v245, v194); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v245, v194); + svfloat32_t v250 = svmla_f32_x(pred_full, v249, v172, v683); + svfloat32_t v252 = svmls_f32_x(pred_full, v249, v173, v684); + svfloat32_t v254 = svmls_f32_x(pred_full, v249, v172, v683); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v355, v304); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v355, v304); + svfloat32_t v360 = svmla_f32_x(pred_full, v359, v282, v683); + svfloat32_t v362 = svmls_f32_x(pred_full, v359, v283, v684); + svfloat32_t v364 = 
svmls_f32_x(pred_full, v359, v282, v683); + svfloat32_t v251 = svmla_f32_x(pred_full, v250, v173, v684); + svfloat32_t v253 = svmla_f32_x(pred_full, v252, v174, v685); + svfloat32_t v255 = svmls_f32_x(pred_full, v254, v174, v685); + svfloat32_t v361 = svmla_f32_x(pred_full, v360, v283, v684); + svfloat32_t v363 = svmla_f32_x(pred_full, v362, v284, v685); + svfloat32_t v365 = svmls_f32_x(pred_full, v364, v284, v685); + svst1_f64(pred_full, (double *)(v750), svreinterpret_f64_f32(v247)); + svst1_f64(pred_full, (double *)(v759), svreinterpret_f64_f32(v357)); + svst1_f64(pred_full, (double *)(v804), svreinterpret_f64_f32(v246)); + svst1_f64(pred_full, (double *)(v813), svreinterpret_f64_f32(v356)); + svfloat32_t v262 = svadd_f32_x(svptrue_b32(), v251, v257); + svfloat32_t v263 = svsub_f32_x(svptrue_b32(), v251, v257); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v253, v259); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v253, v259); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v361, v367); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v361, v367); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v363, v369); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v363, v369); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v365, v371); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v365, v371); + svst1_f64(pred_full, (double *)(v714), svreinterpret_f64_f32(v263)); + svst1_f64(pred_full, (double *)(v723), svreinterpret_f64_f32(v373)); + svst1_f64(pred_full, (double *)(v732), svreinterpret_f64_f32(v264)); + svst1_f64(pred_full, (double *)(v741), svreinterpret_f64_f32(v374)); + svst1_f64(pred_full, (double *)(v768), svreinterpret_f64_f32(v267)); + svst1_f64(pred_full, (double *)(v777), svreinterpret_f64_f32(v377)); + svst1_f64(pred_full, (double *)(v786), svreinterpret_f64_f32(v266)); + svst1_f64(pred_full, (double *)(v795), 
svreinterpret_f64_f32(v376)); + svst1_f64(pred_full, (double *)(v822), svreinterpret_f64_f32(v265)); + svst1_f64(pred_full, (double *)(v831), svreinterpret_f64_f32(v375)); + svst1_f64(pred_full, (double *)(v840), svreinterpret_f64_f32(v262)); + svst1_f64(pred_full, (double *)(v849), svreinterpret_f64_f32(v372)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Variant of armral_fft_cf32_cf32_cf32_ac_n_uun19 built when ARMRAL_ARCH_SVE is not defined; it uses 64-bit Neon float32x2_t vectors, with every armral_cmplx_f32_t element of x and y accessed as one float32x2_t. Reads the 19 inputs x[0], x[istride], ..., x[18 * istride] and writes the 19 outputs y[0], y[ostride], ..., y[18 * ostride] (presumably a length-19 DFT, judging by the name and the 19 inputs and outputs; confirm). dir is broadcast to both lanes and multiplied into every constant that is later applied to a lane-swapped (vrev64_f32) operand; presumably it is the sign that selects the transform direction (confirm against the caller). NOTE(review): howmany is never read in this body, so each call handles exactly one set of 19 points. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun19(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v192 = -1.0555555555555556e+00F; + float v196 = 1.7752228513927079e-01F; + float v200 = -1.2820077502191529e-01F; + float v204 = 4.9321510117355499e-02F; + float v208 = 5.7611011491005903e-01F; + float v212 = -7.4996449655536279e-01F; + float v216 = -1.7385438164530381e-01F; + float v220 = -2.1729997561977314e+00F; + float v224 = -1.7021211726914738e+00F; + float v228 = 4.7087858350625778e-01F; + float v232 = -2.0239400846888440e+00F; + float v236 = 1.0551641201664090e-01F; + float v240 = 2.1294564967054850e+00F; + float v244 = -7.5087543897371167e-01F; + float v248 = 1.4812817695157160e-01F; + float v252 = 8.9900361592528333e-01F; + float v256 = -6.2148246772602778e-01F; + float v260 = -7.9869352098712687e-01F; + float v264 = -4.7339199623771833e-01F; /* The scalars above are each duplicated into both lanes of a vector below. The remaining scalars come in (a, -a) pairs; each pair is packed into one float32x2_t and multiplied by the broadcast dir before use. */ + float v267 = -2.4216105241892630e-01F; + float v268 = 2.4216105241892630e-01F; + float v274 = -5.9368607967505101e-02F; + float v275 = 5.9368607967505101e-02F; + float v281 = 1.2578688255176201e-02F; + float v282 = -1.2578688255176201e-02F; + float v288 = -4.6789919712328903e-02F; + float v289 = 4.6789919712328903e-02F; + float v295 = -9.3750121913782358e-01F; + float v296 = 9.3750121913782358e-01F; + float v302 = -5.0111537043352902e-02F; + float v303 = 5.0111537043352902e-02F; + float v309 = -9.8761275618117661e-01F; + float v310 = 9.8761275618117661e-01F; + float v316 = 
-1.1745786501205959e+00F; + float v317 = 1.1745786501205959e+00F; + float v323 = 1.1114482296234993e+00F; + float v324 = -1.1114482296234993e+00F; + float v330 = 2.2860268797440955e+00F; + float v331 = -2.2860268797440955e+00F; + float v337 = 2.6420523257930939e-01F; + float v338 = -2.6420523257930939e-01F; + float v344 = 2.1981792779352136e+00F; + float v345 = -2.1981792779352136e+00F; + float v351 = 1.9339740453559042e+00F; + float v352 = -1.9339740453559042e+00F; + float v358 = -7.4825847091254893e-01F; + float v359 = 7.4825847091254893e-01F; + float v365 = -4.7820835642768872e-01F; + float v366 = 4.7820835642768872e-01F; + float v372 = 2.7005011448486022e-01F; + float v373 = -2.7005011448486022e-01F; + float v379 = -3.4642356159542270e-01F; + float v380 = 3.4642356159542270e-01F; + float v386 = -8.3485429360688279e-01F; + float v387 = 8.3485429360688279e-01F; + float v393 = -3.9375928506743518e-01F; + float v394 = 3.9375928506743518e-01F; + float32x2_t v13 = v5[istride]; + float32x2_t v137 = v5[0]; + float32x2_t v193 = (float32x2_t){v192, v192}; + float32x2_t v197 = (float32x2_t){v196, v196}; + float32x2_t v201 = (float32x2_t){v200, v200}; + float32x2_t v205 = (float32x2_t){v204, v204}; + float32x2_t v209 = (float32x2_t){v208, v208}; + float32x2_t v213 = (float32x2_t){v212, v212}; + float32x2_t v217 = (float32x2_t){v216, v216}; + float32x2_t v221 = (float32x2_t){v220, v220}; + float32x2_t v225 = (float32x2_t){v224, v224}; + float32x2_t v229 = (float32x2_t){v228, v228}; + float32x2_t v233 = (float32x2_t){v232, v232}; + float32x2_t v237 = (float32x2_t){v236, v236}; + float32x2_t v241 = (float32x2_t){v240, v240}; + float32x2_t v245 = (float32x2_t){v244, v244}; + float32x2_t v249 = (float32x2_t){v248, v248}; + float32x2_t v253 = (float32x2_t){v252, v252}; + float32x2_t v257 = (float32x2_t){v256, v256}; + float32x2_t v261 = (float32x2_t){v260, v260}; + float32x2_t v265 = (float32x2_t){v264, v264}; + float32x2_t v269 = (float32x2_t){v267, v268}; + float32x2_t v276 = 
(float32x2_t){v274, v275}; + float32x2_t v283 = (float32x2_t){v281, v282}; + float32x2_t v290 = (float32x2_t){v288, v289}; + float32x2_t v297 = (float32x2_t){v295, v296}; + float32x2_t v304 = (float32x2_t){v302, v303}; + float32x2_t v311 = (float32x2_t){v309, v310}; + float32x2_t v318 = (float32x2_t){v316, v317}; + float32x2_t v325 = (float32x2_t){v323, v324}; + float32x2_t v332 = (float32x2_t){v330, v331}; + float32x2_t v339 = (float32x2_t){v337, v338}; + float32x2_t v346 = (float32x2_t){v344, v345}; + float32x2_t v353 = (float32x2_t){v351, v352}; + float32x2_t v360 = (float32x2_t){v358, v359}; + float32x2_t v367 = (float32x2_t){v365, v366}; + float32x2_t v374 = (float32x2_t){v372, v373}; + float32x2_t v381 = (float32x2_t){v379, v380}; + float32x2_t v388 = (float32x2_t){v386, v387}; + float32x2_t v395 = (float32x2_t){v393, v394}; + float32x2_t v396 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 18]; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v30 = v5[istride * 17]; + float32x2_t v37 = v5[istride * 4]; + float32x2_t v42 = v5[istride * 15]; + float32x2_t v49 = v5[istride * 8]; + float32x2_t v54 = v5[istride * 11]; + float32x2_t v61 = v5[istride * 16]; + float32x2_t v66 = v5[istride * 3]; + float32x2_t v73 = v5[istride * 13]; + float32x2_t v78 = v5[istride * 6]; + float32x2_t v85 = v5[istride * 7]; + float32x2_t v90 = v5[istride * 12]; + float32x2_t v97 = v5[istride * 14]; + float32x2_t v102 = v5[istride * 5]; + float32x2_t v109 = v5[istride * 9]; + float32x2_t v114 = v5[istride * 10]; + float32x2_t v271 = vmul_f32(v396, v269); + float32x2_t v278 = vmul_f32(v396, v276); + float32x2_t v285 = vmul_f32(v396, v283); + float32x2_t v292 = vmul_f32(v396, v290); + float32x2_t v299 = vmul_f32(v396, v297); + float32x2_t v306 = vmul_f32(v396, v304); + float32x2_t v313 = vmul_f32(v396, v311); + float32x2_t v320 = vmul_f32(v396, v318); + float32x2_t v327 = vmul_f32(v396, v325); + float32x2_t v334 = vmul_f32(v396, v332); + float32x2_t v341 = vmul_f32(v396, 
v339); + float32x2_t v348 = vmul_f32(v396, v346); + float32x2_t v355 = vmul_f32(v396, v353); + float32x2_t v362 = vmul_f32(v396, v360); + float32x2_t v369 = vmul_f32(v396, v367); + float32x2_t v376 = vmul_f32(v396, v374); + float32x2_t v383 = vmul_f32(v396, v381); + float32x2_t v390 = vmul_f32(v396, v388); + float32x2_t v397 = vmul_f32(v396, v395); /* First stage: the sum and the difference of each input pair whose indices add up to 19, i.e. (1,18), (2,17), (4,15), (8,11), (16,3), (13,6), (7,12), (14,5), (9,10); the operand order of the subtraction varies from pair to pair. */ + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v30, v25); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v54, v49); + float32x2_t v67 = vadd_f32(v61, v66); + float32x2_t v68 = vsub_f32(v61, v66); + float32x2_t v79 = vadd_f32(v73, v78); + float32x2_t v80 = vsub_f32(v78, v73); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v103 = vadd_f32(v97, v102); + float32x2_t v104 = vsub_f32(v102, v97); + float32x2_t v115 = vadd_f32(v109, v114); + float32x2_t v116 = vsub_f32(v109, v114); + float32x2_t v117 = vsub_f32(v19, v91); + float32x2_t v118 = vsub_f32(v31, v103); + float32x2_t v119 = vsub_f32(v43, v115); + float32x2_t v120 = vsub_f32(v55, v91); + float32x2_t v121 = vsub_f32(v67, v103); + float32x2_t v122 = vsub_f32(v79, v115); + float32x2_t v123 = vadd_f32(v19, v55); + float32x2_t v125 = vadd_f32(v31, v67); + float32x2_t v127 = vadd_f32(v43, v79); + float32x2_t v155 = vsub_f32(v20, v92); + float32x2_t v156 = vsub_f32(v32, v104); + float32x2_t v157 = vsub_f32(v44, v116); + float32x2_t v158 = vsub_f32(v56, v92); + float32x2_t v159 = vsub_f32(v68, v104); + float32x2_t v160 = vsub_f32(v80, v116); + float32x2_t v161 = vadd_f32(v20, v56); + float32x2_t v163 = vadd_f32(v32, v68); + float32x2_t v165 = vadd_f32(v44, v80); + float32x2_t v124 = vadd_f32(v123, v91); + float32x2_t v126 = vadd_f32(v125, v103); + float32x2_t v128 = vadd_f32(v127, v115); + float32x2_t v129 = vadd_f32(v117, v119); + 
float32x2_t v130 = vadd_f32(v120, v122); + float32x2_t v145 = vsub_f32(v117, v120); + float32x2_t v146 = vsub_f32(v119, v122); + float32x2_t v162 = vadd_f32(v161, v92); + float32x2_t v164 = vadd_f32(v163, v104); + float32x2_t v166 = vadd_f32(v165, v116); + float32x2_t v167 = vadd_f32(v155, v157); + float32x2_t v168 = vadd_f32(v158, v160); + float32x2_t v177 = vsub_f32(v155, v158); + float32x2_t v178 = vsub_f32(v157, v160); + float32x2_t v222 = vmul_f32(v120, v221); + float32x2_t v234 = vmul_f32(v122, v233); + float32x2_t v242 = vmul_f32(v119, v241); + float32x2_t v321 = vrev64_f32(v158); + float32x2_t v335 = vrev64_f32(v155); + float32x2_t v342 = vrev64_f32(v160); + float32x2_t v356 = vrev64_f32(v157); + float32x2_t v131 = vadd_f32(v124, v126); + float32x2_t v139 = vadd_f32(v130, v121); + float32x2_t v140 = vadd_f32(v129, v118); + float32x2_t v142 = vsub_f32(v130, v121); + float32x2_t v143 = vsub_f32(v129, v118); + float32x2_t v147 = vsub_f32(v117, v146); + float32x2_t v149 = vadd_f32(v145, v122); + float32x2_t v152 = vsub_f32(v124, v128); + float32x2_t v153 = vsub_f32(v126, v128); + float32x2_t v169 = vadd_f32(v162, v164); + float32x2_t v171 = vadd_f32(v168, v159); + float32x2_t v172 = vadd_f32(v167, v156); + float32x2_t v174 = vsub_f32(v168, v159); + float32x2_t v175 = vsub_f32(v167, v156); + float32x2_t v179 = vsub_f32(v155, v178); + float32x2_t v181 = vadd_f32(v177, v160); + float32x2_t v184 = vsub_f32(v162, v166); + float32x2_t v185 = vsub_f32(v164, v166); + float32x2_t v226 = vmul_f32(v145, v225); + float32x2_t v238 = vmul_f32(v146, v237); + float32x2_t v322 = vmul_f32(v321, v320); + float32x2_t v328 = vrev64_f32(v177); + float32x2_t v343 = vmul_f32(v342, v341); + float32x2_t v349 = vrev64_f32(v178); + float32x2_t v357 = vmul_f32(v356, v355); + float32x2_t v132 = vadd_f32(v131, v128); + float32x2_t v141 = vsub_f32(v140, v139); + float32x2_t v144 = vsub_f32(v143, v142); + float32x2_t v148 = vsub_f32(v147, v121); + float32x2_t v150 = vsub_f32(v149, v118); + 
float32x2_t v154 = vadd_f32(v152, v153); + float32x2_t v170 = vadd_f32(v169, v166); + float32x2_t v173 = vsub_f32(v172, v171); + float32x2_t v176 = vsub_f32(v175, v174); + float32x2_t v180 = vsub_f32(v179, v159); + float32x2_t v182 = vsub_f32(v181, v156); + float32x2_t v186 = vadd_f32(v184, v185); + float32x2_t v198 = vmul_f32(v139, v197); + float32x2_t v202 = vmul_f32(v140, v201); + float32x2_t v210 = vmul_f32(v142, v209); + float32x2_t v214 = vmul_f32(v143, v213); + float32x2_t v258 = vmul_f32(v152, v257); + float32x2_t v262 = vmul_f32(v153, v261); + float32x2_t v279 = vrev64_f32(v171); + float32x2_t v286 = vrev64_f32(v172); + float32x2_t v300 = vrev64_f32(v174); + float32x2_t v307 = vrev64_f32(v175); + float32x2_t v329 = vmul_f32(v328, v327); + float32x2_t v350 = vmul_f32(v349, v348); + float32x2_t v384 = vrev64_f32(v184); + float32x2_t v391 = vrev64_f32(v185); + float32x2_t v138 = vadd_f32(v137, v132); + float32x2_t v151 = vsub_f32(v148, v150); + float32x2_t v183 = vsub_f32(v180, v182); + float32x2_t v194 = vmul_f32(v132, v193); + float32x2_t v206 = vmul_f32(v141, v205); + float32x2_t v218 = vmul_f32(v144, v217); + float32x2_t v246 = vmul_f32(v148, v245); + float32x2_t v250 = vmul_f32(v150, v249); + float32x2_t v266 = vmul_f32(v154, v265); + float32x2_t v272 = vrev64_f32(v170); + float32x2_t v280 = vmul_f32(v279, v278); + float32x2_t v287 = vmul_f32(v286, v285); + float32x2_t v293 = vrev64_f32(v173); + float32x2_t v301 = vmul_f32(v300, v299); + float32x2_t v308 = vmul_f32(v307, v306); + float32x2_t v314 = vrev64_f32(v176); + float32x2_t v363 = vrev64_f32(v180); + float32x2_t v370 = vrev64_f32(v182); + float32x2_t v385 = vmul_f32(v384, v383); + float32x2_t v392 = vmul_f32(v391, v390); + float32x2_t v398 = vrev64_f32(v186); + float32x2_t v400 = vadd_f32(v198, v202); + float32x2_t v401 = vadd_f32(v210, v214); + float32x2_t v254 = vmul_f32(v151, v253); + float32x2_t v273 = vmul_f32(v272, v271); + float32x2_t v294 = vmul_f32(v293, v292); + float32x2_t v315 = 
vmul_f32(v314, v313); + float32x2_t v364 = vmul_f32(v363, v362); + float32x2_t v371 = vmul_f32(v370, v369); + float32x2_t v377 = vrev64_f32(v183); + float32x2_t v399 = vmul_f32(v398, v397); + float32x2_t v403 = vadd_f32(v400, v401); + float32x2_t v404 = vadd_f32(v198, v206); + float32x2_t v405 = vadd_f32(v210, v218); + float32x2_t v422 = vsub_f32(v400, v401); + float32x2_t v424 = vsub_f32(v258, v266); + float32x2_t v425 = vsub_f32(v262, v266); + float32x2_t v426 = vadd_f32(v194, v138); + float32x2_t v431 = vadd_f32(v280, v287); + float32x2_t v432 = vadd_f32(v301, v308); + v6[0] = v138; /* y[0] is the plain sum of all 19 inputs: x[0] plus the nine pair sums from the first stage. */ + float32x2_t v378 = vmul_f32(v377, v376); + float32x2_t v402 = vadd_f32(v250, v254); + float32x2_t v406 = vadd_f32(v246, v254); + float32x2_t v407 = vsub_f32(v222, v403); + float32x2_t v408 = vadd_f32(v404, v405); + float32x2_t v414 = vsub_f32(v404, v405); + float32x2_t v419 = vadd_f32(v403, v242); + float32x2_t v427 = vadd_f32(v426, v424); + float32x2_t v428 = vsub_f32(v426, v424); + float32x2_t v430 = vadd_f32(v426, v425); + float32x2_t v434 = vadd_f32(v431, v432); + float32x2_t v435 = vadd_f32(v280, v294); + float32x2_t v436 = vadd_f32(v301, v315); + float32x2_t v453 = vsub_f32(v431, v432); + float32x2_t v455 = vsub_f32(v385, v399); + float32x2_t v456 = vsub_f32(v392, v399); + float32x2_t v409 = vsub_f32(v234, v406); + float32x2_t v410 = vadd_f32(v226, v402); + float32x2_t v412 = vadd_f32(v408, v238); + float32x2_t v415 = vadd_f32(v414, v402); + float32x2_t v416 = vadd_f32(v407, v408); + float32x2_t v423 = vadd_f32(v422, v406); + float32x2_t v429 = vsub_f32(v428, v425); + float32x2_t v433 = vadd_f32(v371, v378); + float32x2_t v437 = vadd_f32(v364, v378); + float32x2_t v438 = vsub_f32(v322, v434); + float32x2_t v439 = vadd_f32(v435, v436); + float32x2_t v445 = vsub_f32(v435, v436); + float32x2_t v450 = vadd_f32(v434, v357); + float32x2_t v457 = vadd_f32(v273, v455); + float32x2_t v458 = vsub_f32(v273, v455); + float32x2_t v460 = vadd_f32(v273, v456); + float32x2_t v411 = 
vadd_f32(v410, v407); + float32x2_t v413 = vadd_f32(v412, v409); + float32x2_t v417 = vfma_f32(v416, v117, v229); + float32x2_t v420 = vadd_f32(v419, v409); + float32x2_t v440 = vsub_f32(v343, v437); + float32x2_t v441 = vadd_f32(v329, v433); + float32x2_t v443 = vadd_f32(v439, v350); + float32x2_t v446 = vadd_f32(v445, v433); + float32x2_t v447 = vadd_f32(v438, v439); + float32x2_t v454 = vadd_f32(v453, v437); + float32x2_t v459 = vsub_f32(v458, v456); + float32x2_t v465 = vsub_f32(v423, v415); + float32x2_t v469 = vsub_f32(v430, v423); + float32x2_t v472 = vadd_f32(v415, v430); + float32x2_t v418 = vadd_f32(v417, v406); + float32x2_t v421 = vadd_f32(v420, v402); + float32x2_t v442 = vadd_f32(v441, v438); + float32x2_t v444 = vadd_f32(v443, v440); + float32x2_t v448 = vfma_f32(v447, v335, v334); + float32x2_t v451 = vadd_f32(v450, v440); + float32x2_t v466 = vadd_f32(v465, v430); + float32x2_t v470 = vadd_f32(v411, v427); + float32x2_t v471 = vadd_f32(v413, v429); + float32x2_t v477 = vsub_f32(v454, v446); + float32x2_t v481 = vsub_f32(v454, v460); + float32x2_t v484 = vadd_f32(v446, v460); + float32x2_t v449 = vadd_f32(v448, v437); + float32x2_t v452 = vadd_f32(v451, v433); + float32x2_t v461 = vsub_f32(v418, v411); + float32x2_t v463 = vsub_f32(v421, v413); + float32x2_t v467 = vsub_f32(v427, v418); + float32x2_t v468 = vsub_f32(v429, v421); + float32x2_t v478 = vadd_f32(v477, v460); + float32x2_t v482 = vadd_f32(v442, v457); + float32x2_t v483 = vadd_f32(v444, v459); + float32x2_t v502 = vsub_f32(v472, v484); + float32x2_t v508 = vadd_f32(v472, v484); + float32x2_t v514 = vadd_f32(v469, v481); + float32x2_t v520 = vsub_f32(v469, v481); + float32x2_t v462 = vadd_f32(v461, v427); + float32x2_t v464 = vadd_f32(v463, v429); + float32x2_t v473 = vsub_f32(v449, v442); + float32x2_t v475 = vsub_f32(v452, v444); + float32x2_t v479 = vsub_f32(v457, v449); + float32x2_t v480 = vsub_f32(v459, v452); + v6[ostride * 2] = v502; + v6[ostride * 17] = v508; + v6[ostride * 3] = 
v514; + v6[ostride * 16] = v520; + float32x2_t v526 = vadd_f32(v471, v483); + float32x2_t v532 = vsub_f32(v471, v483); + float32x2_t v538 = vadd_f32(v466, v478); + float32x2_t v544 = vsub_f32(v466, v478); + float32x2_t v574 = vsub_f32(v470, v482); + float32x2_t v580 = vadd_f32(v470, v482); + float32x2_t v474 = vadd_f32(v473, v457); + float32x2_t v476 = vadd_f32(v475, v459); + v6[ostride * 4] = v526; + v6[ostride * 15] = v532; + v6[ostride * 5] = v538; + v6[ostride * 14] = v544; + float32x2_t v550 = vadd_f32(v468, v480); + float32x2_t v556 = vsub_f32(v468, v480); + float32x2_t v562 = vadd_f32(v467, v479); + float32x2_t v568 = vsub_f32(v467, v479); + v6[ostride * 8] = v574; + v6[ostride * 11] = v580; + float32x2_t v490 = vadd_f32(v462, v474); + float32x2_t v496 = vsub_f32(v462, v474); + v6[ostride * 6] = v550; + v6[ostride * 13] = v556; + v6[ostride * 7] = v562; + v6[ostride * 12] = v568; + float32x2_t v586 = vadd_f32(v464, v476); + float32x2_t v592 = vsub_f32(v464, v476); + v6[ostride] = v490; + v6[ostride * 18] = v496; + v6[ostride * 9] = v586; + v6[ostride * 10] = v592; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun19(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v236 = -1.0555555555555556e+00F; + float v241 = 1.7752228513927079e-01F; + float v246 = -1.2820077502191529e-01F; + float v251 = 4.9321510117355499e-02F; + float v256 = 5.7611011491005903e-01F; + float v261 = -7.4996449655536279e-01F; + float v266 = -1.7385438164530381e-01F; + float v271 = -2.1729997561977314e+00F; + float v276 = -1.7021211726914738e+00F; + float v281 = 4.7087858350625778e-01F; + float v286 = -2.0239400846888440e+00F; + float v291 = 1.0551641201664090e-01F; + float v296 = 
2.1294564967054850e+00F; + float v301 = -7.5087543897371167e-01F; + float v306 = 1.4812817695157160e-01F; + float v311 = 8.9900361592528333e-01F; + float v316 = -6.2148246772602778e-01F; + float v321 = -7.9869352098712687e-01F; + float v326 = -4.7339199623771833e-01F; + float v331 = 2.4216105241892630e-01F; + float v338 = 5.9368607967505101e-02F; + float v345 = -1.2578688255176201e-02F; + float v352 = 4.6789919712328903e-02F; + float v359 = 9.3750121913782358e-01F; + float v366 = 5.0111537043352902e-02F; + float v373 = 9.8761275618117661e-01F; + float v380 = 1.1745786501205959e+00F; + float v387 = -1.1114482296234993e+00F; + float v394 = -2.2860268797440955e+00F; + float v401 = -2.6420523257930939e-01F; + float v408 = -2.1981792779352136e+00F; + float v415 = -1.9339740453559042e+00F; + float v422 = 7.4825847091254893e-01F; + float v429 = 4.7820835642768872e-01F; + float v436 = -2.7005011448486022e-01F; + float v443 = 3.4642356159542270e-01F; + float v450 = 8.3485429360688279e-01F; + float v457 = 3.9375928506743518e-01F; + const float32x2_t *v705 = &v5[v0]; + float32x2_t *v926 = &v6[v2]; + int64_t v22 = v0 * 18; + int64_t v31 = v0 * 2; + int64_t v38 = v0 * 17; + int64_t v47 = v0 * 4; + int64_t v54 = v0 * 15; + int64_t v63 = v0 * 8; + int64_t v70 = v0 * 11; + int64_t v79 = v0 * 16; + int64_t v86 = v0 * 3; + int64_t v95 = v0 * 13; + int64_t v102 = v0 * 6; + int64_t v111 = v0 * 7; + int64_t v118 = v0 * 12; + int64_t v127 = v0 * 14; + int64_t v134 = v0 * 5; + int64_t v143 = v0 * 9; + int64_t v150 = v0 * 10; + float v334 = v4 * v331; + float v341 = v4 * v338; + float v348 = v4 * v345; + float v355 = v4 * v352; + float v362 = v4 * v359; + float v369 = v4 * v366; + float v376 = v4 * v373; + float v383 = v4 * v380; + float v390 = v4 * v387; + float v397 = v4 * v394; + float v404 = v4 * v401; + float v411 = v4 * v408; + float v418 = v4 * v415; + float v425 = v4 * v422; + float v432 = v4 * v429; + float v439 = v4 * v436; + float v446 = v4 * v443; + float v453 = v4 * v450; + 
float v460 = v4 * v457; + int64_t v565 = v2 * 18; + int64_t v573 = v2 * 2; + int64_t v581 = v2 * 17; + int64_t v589 = v2 * 3; + int64_t v597 = v2 * 16; + int64_t v605 = v2 * 4; + int64_t v613 = v2 * 15; + int64_t v621 = v2 * 5; + int64_t v629 = v2 * 14; + int64_t v637 = v2 * 6; + int64_t v645 = v2 * 13; + int64_t v653 = v2 * 7; + int64_t v661 = v2 * 12; + int64_t v669 = v2 * 8; + int64_t v677 = v2 * 11; + int64_t v685 = v2 * 9; + int64_t v693 = v2 * 10; + const float32x2_t *v868 = &v5[0]; + svfloat32_t v872 = svdup_n_f32(v236); + svfloat32_t v873 = svdup_n_f32(v241); + svfloat32_t v874 = svdup_n_f32(v246); + svfloat32_t v875 = svdup_n_f32(v251); + svfloat32_t v876 = svdup_n_f32(v256); + svfloat32_t v877 = svdup_n_f32(v261); + svfloat32_t v878 = svdup_n_f32(v266); + svfloat32_t v879 = svdup_n_f32(v271); + svfloat32_t v880 = svdup_n_f32(v276); + svfloat32_t v881 = svdup_n_f32(v281); + svfloat32_t v882 = svdup_n_f32(v286); + svfloat32_t v883 = svdup_n_f32(v291); + svfloat32_t v884 = svdup_n_f32(v296); + svfloat32_t v885 = svdup_n_f32(v301); + svfloat32_t v886 = svdup_n_f32(v306); + svfloat32_t v887 = svdup_n_f32(v311); + svfloat32_t v888 = svdup_n_f32(v316); + svfloat32_t v889 = svdup_n_f32(v321); + svfloat32_t v890 = svdup_n_f32(v326); + float32x2_t *v917 = &v6[0]; + svfloat32_t v1083 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v705)[0])); + const float32x2_t *v714 = &v5[v22]; + const float32x2_t *v723 = &v5[v31]; + const float32x2_t *v732 = &v5[v38]; + const float32x2_t *v741 = &v5[v47]; + const float32x2_t *v750 = &v5[v54]; + const float32x2_t *v759 = &v5[v63]; + const float32x2_t *v768 = &v5[v70]; + const float32x2_t *v777 = &v5[v79]; + const float32x2_t *v786 = &v5[v86]; + const float32x2_t *v795 = &v5[v95]; + const float32x2_t *v804 = &v5[v102]; + const float32x2_t *v813 = &v5[v111]; + const float32x2_t *v822 = &v5[v118]; + const float32x2_t *v831 = &v5[v127]; + const float32x2_t *v840 = &v5[v134]; + const float32x2_t *v849 = &v5[v143]; + 
const float32x2_t *v858 = &v5[v150]; + svfloat32_t v891 = svdup_n_f32(v334); + svfloat32_t v892 = svdup_n_f32(v341); + svfloat32_t v893 = svdup_n_f32(v348); + svfloat32_t v894 = svdup_n_f32(v355); + svfloat32_t v895 = svdup_n_f32(v362); + svfloat32_t v896 = svdup_n_f32(v369); + svfloat32_t v897 = svdup_n_f32(v376); + svfloat32_t v898 = svdup_n_f32(v383); + svfloat32_t v899 = svdup_n_f32(v390); + svfloat32_t v900 = svdup_n_f32(v397); + svfloat32_t v901 = svdup_n_f32(v404); + svfloat32_t v902 = svdup_n_f32(v411); + svfloat32_t v903 = svdup_n_f32(v418); + svfloat32_t v904 = svdup_n_f32(v425); + svfloat32_t v905 = svdup_n_f32(v432); + svfloat32_t v906 = svdup_n_f32(v439); + svfloat32_t v907 = svdup_n_f32(v446); + svfloat32_t v908 = svdup_n_f32(v453); + svfloat32_t v909 = svdup_n_f32(v460); + float32x2_t *v935 = &v6[v565]; + float32x2_t *v944 = &v6[v573]; + float32x2_t *v953 = &v6[v581]; + float32x2_t *v962 = &v6[v589]; + float32x2_t *v971 = &v6[v597]; + float32x2_t *v980 = &v6[v605]; + float32x2_t *v989 = &v6[v613]; + float32x2_t *v998 = &v6[v621]; + float32x2_t *v1007 = &v6[v629]; + float32x2_t *v1016 = &v6[v637]; + float32x2_t *v1025 = &v6[v645]; + float32x2_t *v1034 = &v6[v653]; + float32x2_t *v1043 = &v6[v661]; + float32x2_t *v1052 = &v6[v669]; + float32x2_t *v1061 = &v6[v677]; + float32x2_t *v1070 = &v6[v685]; + float32x2_t *v1079 = &v6[v693]; + svfloat32_t v1119 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v868)[0])); + svfloat32_t v1085 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v714)[0])); + svfloat32_t v1087 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v723)[0])); + svfloat32_t v1089 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v732)[0])); + svfloat32_t v1091 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v741)[0])); + svfloat32_t v1093 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v750)[0])); + svfloat32_t v1095 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v759)[0])); + svfloat32_t v1097 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v768)[0])); + svfloat32_t v1099 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v777)[0])); + svfloat32_t v1101 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v786)[0])); + svfloat32_t v1103 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v795)[0])); + svfloat32_t v1105 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v804)[0])); + svfloat32_t v1107 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v813)[0])); + svfloat32_t v1109 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v822)[0])); + svfloat32_t v1111 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v831)[0])); + svfloat32_t v1113 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v840)[0])); + svfloat32_t v1115 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v849)[0])); + svfloat32_t v1117 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v858)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v1083, v1085); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v1083, v1085); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v1087, v1089); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v1089, v1087); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v1091, v1093); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v1091, v1093); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v1095, v1097); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v1097, v1095); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v1099, v1101); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v1099, v1101); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v1103, v1105); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v1105, v1103); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v1107, v1109); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), 
v1107, v1109); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v1113, v1111); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v1115, v1117); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v1115, v1117); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v28, v124); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v44, v140); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v60, v156); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v76, v124); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v92, v140); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v108, v156); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v28, v76); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v44, v92); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v60, v108); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v29, v125); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v45, v141); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v61, v157); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v77, v125); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v93, v141); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v109, v157); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v29, v77); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v45, v93); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v61, v109); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v164, v124); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v166, v140); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v156); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v158, v160); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v161, v163); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v158, v161); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v160, v163); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v204, v125); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v206, v141); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v157); + svfloat32_t v210 = 
svadd_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v201, v203); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v198, v201); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v200, v203); + svfloat32_t zero385 = svdup_n_f32(0); + svfloat32_t v385 = svcmla_f32_x(pred_full, zero385, v898, v201, 90); + svfloat32_t zero406 = svdup_n_f32(0); + svfloat32_t v406 = svcmla_f32_x(pred_full, zero406, v901, v203, 90); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v171, v162); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v170, v159); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v171, v162); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v170, v159); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v158, v189); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v188, v163); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v165, v169); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v167, v169); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v205, v207); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v211, v202); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v210, v199); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v211, v202); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v210, v199); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v198, v221); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v220, v203); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v205, v209); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v172, v169); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v183, v182); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v186, v185); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v190, v162); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v192, v159); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v195, v196); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v212, v209); + svfloat32_t v216 = 
svsub_f32_x(svptrue_b32(), v215, v214); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v218, v217); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v222, v202); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v224, v199); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v227, v228); + svfloat32_t v249 = svmul_f32_x(svptrue_b32(), v183, v874); + svfloat32_t v264 = svmul_f32_x(svptrue_b32(), v186, v877); + svfloat32_t zero343 = svdup_n_f32(0); + svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v892, v214, 90); + svfloat32_t zero364 = svdup_n_f32(0); + svfloat32_t v364 = svcmla_f32_x(pred_full, zero364, v895, v217, 90); + svfloat32_t zero448 = svdup_n_f32(0); + svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v907, v227, 90); + svfloat32_t zero455 = svdup_n_f32(0); + svfloat32_t v455 = svcmla_f32_x(pred_full, zero455, v908, v228, 90); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v1119, v173); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v223, v225); + svfloat32_t v254 = svmul_f32_x(svptrue_b32(), v184, v875); + svfloat32_t v269 = svmul_f32_x(svptrue_b32(), v187, v878); + svfloat32_t v329 = svmul_f32_x(svptrue_b32(), v197, v890); + svfloat32_t zero336 = svdup_n_f32(0); + svfloat32_t v336 = svcmla_f32_x(pred_full, zero336, v891, v213, 90); + svfloat32_t zero462 = svdup_n_f32(0); + svfloat32_t v462 = svcmla_f32_x(pred_full, zero462, v909, v229, 90); + svfloat32_t v463 = svmla_f32_x(pred_full, v249, v182, v873); + svfloat32_t v464 = svmla_f32_x(pred_full, v264, v185, v876); + svfloat32_t v494 = svcmla_f32_x(pred_full, v343, v893, v215, 90); + svfloat32_t v495 = svcmla_f32_x(pred_full, v364, v896, v218, 90); + svfloat32_t v314 = svmul_f32_x(svptrue_b32(), v194, v887); + svfloat32_t zero441 = svdup_n_f32(0); + svfloat32_t v441 = svcmla_f32_x(pred_full, zero441, v906, v226, 90); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v463, v464); + svfloat32_t v467 = svmla_f32_x(pred_full, v254, v182, v873); + 
svfloat32_t v468 = svmla_f32_x(pred_full, v269, v185, v876); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v463, v464); + svfloat32_t v487 = svnmls_f32_x(pred_full, v329, v195, v888); + svfloat32_t v488 = svnmls_f32_x(pred_full, v329, v196, v889); + svfloat32_t v489 = svmla_f32_x(pred_full, v181, v173, v872); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v494, v495); + svfloat32_t v498 = svcmla_f32_x(pred_full, v343, v894, v216, 90); + svfloat32_t v499 = svcmla_f32_x(pred_full, v364, v897, v219, 90); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v494, v495); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v448, v462); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v455, v462); + svst1_f64(pred_full, (double *)(v917), svreinterpret_f64_f32(v181)); + svfloat32_t v465 = svmla_f32_x(pred_full, v314, v193, v886); + svfloat32_t v469 = svmla_f32_x(pred_full, v314, v191, v885); + svfloat32_t v470 = svnmls_f32_x(pred_full, v466, v161, v879); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v467, v468); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v467, v468); + svfloat32_t v482 = svmla_f32_x(pred_full, v466, v160, v884); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v487); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v489, v487); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v489, v488); + svfloat32_t v496 = svcmla_f32_x(pred_full, v441, v905, v225, 90); + svfloat32_t v500 = svcmla_f32_x(pred_full, v441, v904, v223, 90); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v385, v497); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v498, v499); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v498, v499); + svfloat32_t v513 = svcmla_f32_x(pred_full, v497, v903, v200, 90); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v336, v518); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v336, v518); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v336, v519); + svfloat32_t v472 = svnmls_f32_x(pred_full, v469, v163, v882); + svfloat32_t v473 = svmla_f32_x(pred_full, 
v465, v188, v880); + svfloat32_t v475 = svmla_f32_x(pred_full, v471, v189, v883); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v477, v465); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v470, v471); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v485, v469); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v491, v488); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v406, v500); + svfloat32_t v504 = svcmla_f32_x(pred_full, v496, v899, v220, 90); + svfloat32_t v506 = svcmla_f32_x(pred_full, v502, v902, v221, 90); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v496); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v516, v500); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v521, v519); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v473, v470); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v475, v472); + svfloat32_t v480 = svmla_f32_x(pred_full, v479, v158, v881); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v482, v472); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v504, v501); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v503); + svfloat32_t v511 = svcmla_f32_x(pred_full, v510, v900, v198, 90); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v513, v503); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v486, v478); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v493, v486); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v478, v493); + svfloat32_t v540 = svsub_f32_x(svptrue_b32(), v517, v509); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v517, v523); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v509, v523); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v480, v469); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v483, v465); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v511, v500); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v514, v496); + svfloat32_t v529 = svadd_f32_x(svptrue_b32(), v528, v493); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v474, v490); + 
svfloat32_t v534 = svadd_f32_x(svptrue_b32(), v476, v492); + svfloat32_t v541 = svadd_f32_x(svptrue_b32(), v540, v523); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v505, v520); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v507, v522); + svfloat32_t v571 = svsub_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v579 = svadd_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v532, v544); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v532, v544); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v481, v474); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v484, v476); + svfloat32_t v530 = svsub_f32_x(svptrue_b32(), v490, v481); + svfloat32_t v531 = svsub_f32_x(svptrue_b32(), v492, v484); + svfloat32_t v536 = svsub_f32_x(svptrue_b32(), v512, v505); + svfloat32_t v538 = svsub_f32_x(svptrue_b32(), v515, v507); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v520, v512); + svfloat32_t v543 = svsub_f32_x(svptrue_b32(), v522, v515); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v534, v546); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v534, v546); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v529, v541); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v529, v541); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v533, v545); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v533, v545); + svst1_f64(pred_full, (double *)(v944), svreinterpret_f64_f32(v571)); + svst1_f64(pred_full, (double *)(v953), svreinterpret_f64_f32(v579)); + svst1_f64(pred_full, (double *)(v962), svreinterpret_f64_f32(v587)); + svst1_f64(pred_full, (double *)(v971), svreinterpret_f64_f32(v595)); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v524, v490); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v526, v492); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v536, v520); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v538, v522); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v531, v543); + svfloat32_t v643 = svsub_f32_x(svptrue_b32(), v531, v543); + 
svfloat32_t v651 = svadd_f32_x(svptrue_b32(), v530, v542); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v530, v542); + svst1_f64(pred_full, (double *)(v980), svreinterpret_f64_f32(v603)); + svst1_f64(pred_full, (double *)(v989), svreinterpret_f64_f32(v611)); + svst1_f64(pred_full, (double *)(v998), svreinterpret_f64_f32(v619)); + svst1_f64(pred_full, (double *)(v1007), svreinterpret_f64_f32(v627)); + svst1_f64(pred_full, (double *)(v1052), svreinterpret_f64_f32(v667)); + svst1_f64(pred_full, (double *)(v1061), svreinterpret_f64_f32(v675)); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v525, v537); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v525, v537); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v527, v539); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v527, v539); + svst1_f64(pred_full, (double *)(v1016), svreinterpret_f64_f32(v635)); + svst1_f64(pred_full, (double *)(v1025), svreinterpret_f64_f32(v643)); + svst1_f64(pred_full, (double *)(v1034), svreinterpret_f64_f32(v651)); + svst1_f64(pred_full, (double *)(v1043), svreinterpret_f64_f32(v659)); + svst1_f64(pred_full, (double *)(v926), svreinterpret_f64_f32(v555)); + svst1_f64(pred_full, (double *)(v935), svreinterpret_f64_f32(v563)); + svst1_f64(pred_full, (double *)(v1070), svreinterpret_f64_f32(v683)); + svst1_f64(pred_full, (double *)(v1079), svreinterpret_f64_f32(v691)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun20(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { /* NEON variant of the 20-element kernel, compiled when ARMRAL_ARCH_SVE is not defined. Reads the 20 complex inputs x[k * istride] for k = 0..19 and writes 20 complex outputs y[k * ostride] for k = 0..19, handling each complex value as one float32x2_t. dir is broadcast into v314 and multiplied into the sign-alternating constant pairs. howmany is not referenced anywhere in this body. NOTE(review): judging by the function name and the FFT source directory this is presumably a length-20 DFT butterfly; confirm against the kernel generator. */ + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v259 = 1.5388417685876268e+00F; + float v266 = 5.8778525229247325e-01F; + float v273 = 3.6327126400268028e-01F; + float v297 = 1.0000000000000000e+00F; + float v298 = -1.0000000000000000e+00F; + float v304 = -1.2500000000000000e+00F; + float v305 = 1.2500000000000000e+00F; + float v311 =
5.5901699437494745e-01F; + float v312 = -5.5901699437494745e-01F; + float v319 = -1.5388417685876268e+00F; + float v323 = -5.8778525229247325e-01F; + float v327 = -3.6327126400268028e-01F; + float32x2_t v13 = v5[0]; + float32x2_t v129 = v5[istride]; + float32x2_t v253 = (float32x2_t){v304, v304}; + float32x2_t v257 = (float32x2_t){v311, v311}; + float32x2_t v261 = (float32x2_t){v259, v319}; + float32x2_t v268 = (float32x2_t){v266, v323}; + float32x2_t v275 = (float32x2_t){v273, v327}; + float32x2_t v299 = (float32x2_t){v297, v298}; + float32x2_t v306 = (float32x2_t){v304, v305}; + float32x2_t v313 = (float32x2_t){v311, v312}; + float32x2_t v314 = (float32x2_t){v4, v4}; + float32x2_t v320 = (float32x2_t){v319, v319}; + float32x2_t v324 = (float32x2_t){v323, v323}; + float32x2_t v328 = (float32x2_t){v327, v327}; + float32x2_t v18 = v5[istride * 10]; + float32x2_t v25 = v5[istride * 5]; + float32x2_t v30 = v5[istride * 15]; + float32x2_t v39 = v5[istride * 4]; + float32x2_t v44 = v5[istride * 14]; + float32x2_t v51 = v5[istride * 9]; + float32x2_t v56 = v5[istride * 19]; + float32x2_t v65 = v5[istride * 8]; + float32x2_t v70 = v5[istride * 18]; + float32x2_t v77 = v5[istride * 13]; + float32x2_t v82 = v5[istride * 3]; + float32x2_t v91 = v5[istride * 12]; + float32x2_t v96 = v5[istride * 2]; + float32x2_t v103 = v5[istride * 17]; + float32x2_t v108 = v5[istride * 7]; + float32x2_t v117 = v5[istride * 16]; + float32x2_t v122 = v5[istride * 6]; + float32x2_t v134 = v5[istride * 11]; /* The (c, -c) constant pairs are scaled by the broadcast dir here. They are later multiplied with vrev64_f32 lane-swapped values, so an input (re, im) becomes (c*dir*im, -c*dir*re). */ + float32x2_t v263 = vmul_f32(v314, v261); + float32x2_t v270 = vmul_f32(v314, v268); + float32x2_t v277 = vmul_f32(v314, v275); + float32x2_t v301 = vmul_f32(v314, v299); + float32x2_t v308 = vmul_f32(v314, v306); + float32x2_t v315 = vmul_f32(v314, v313); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v45 = vadd_f32(v39, v44); + float32x2_t v46 = vsub_f32(v39,
v44); + float32x2_t v57 = vadd_f32(v51, v56); + float32x2_t v58 = vsub_f32(v51, v56); + float32x2_t v71 = vadd_f32(v65, v70); + float32x2_t v72 = vsub_f32(v65, v70); + float32x2_t v83 = vadd_f32(v77, v82); + float32x2_t v84 = vsub_f32(v77, v82); + float32x2_t v97 = vadd_f32(v91, v96); + float32x2_t v98 = vsub_f32(v91, v96); + float32x2_t v109 = vadd_f32(v103, v108); + float32x2_t v110 = vsub_f32(v103, v108); + float32x2_t v123 = vadd_f32(v117, v122); + float32x2_t v124 = vsub_f32(v117, v122); + float32x2_t v135 = vadd_f32(v129, v134); + float32x2_t v136 = vsub_f32(v129, v134); + float32x2_t v33 = vadd_f32(v19, v31); + float32x2_t v34 = vsub_f32(v19, v31); + float32x2_t v59 = vadd_f32(v45, v57); + float32x2_t v60 = vsub_f32(v45, v57); + float32x2_t v85 = vadd_f32(v71, v83); + float32x2_t v86 = vsub_f32(v71, v83); + float32x2_t v111 = vadd_f32(v97, v109); + float32x2_t v112 = vsub_f32(v97, v109); + float32x2_t v137 = vadd_f32(v123, v135); + float32x2_t v138 = vsub_f32(v123, v135); + float32x2_t v239 = vadd_f32(v46, v124); + float32x2_t v240 = vsub_f32(v46, v124); + float32x2_t v241 = vadd_f32(v98, v72); + float32x2_t v242 = vsub_f32(v98, v72); + float32x2_t v289 = vadd_f32(v58, v136); + float32x2_t v290 = vsub_f32(v58, v136); + float32x2_t v291 = vadd_f32(v110, v84); + float32x2_t v292 = vsub_f32(v110, v84); + float32x2_t v139 = vadd_f32(v59, v137); + float32x2_t v140 = vsub_f32(v59, v137); + float32x2_t v141 = vadd_f32(v111, v85); + float32x2_t v142 = vsub_f32(v111, v85); + float32x2_t v189 = vadd_f32(v60, v138); + float32x2_t v190 = vsub_f32(v60, v138); + float32x2_t v191 = vadd_f32(v112, v86); + float32x2_t v192 = vsub_f32(v112, v86); + float32x2_t v243 = vadd_f32(v239, v241); + float32x2_t v244 = vsub_f32(v239, v241); + float32x2_t v245 = vadd_f32(v240, v242); + float32x2_t v264 = vrev64_f32(v240); + float32x2_t v278 = vrev64_f32(v242); + float32x2_t v293 = vadd_f32(v289, v291); + float32x2_t v294 = vsub_f32(v289, v291); + float32x2_t v295 = vadd_f32(v290, v292);
+ float32x2_t v321 = vmul_f32(v290, v320); + float32x2_t v329 = vmul_f32(v292, v328); + float32x2_t v143 = vadd_f32(v139, v141); + float32x2_t v144 = vsub_f32(v139, v141); + float32x2_t v145 = vadd_f32(v140, v142); + float32x2_t v164 = vrev64_f32(v140); + float32x2_t v178 = vrev64_f32(v142); + float32x2_t v193 = vadd_f32(v189, v191); + float32x2_t v194 = vsub_f32(v189, v191); + float32x2_t v195 = vadd_f32(v190, v192); + float32x2_t v214 = vrev64_f32(v190); + float32x2_t v228 = vrev64_f32(v192); + float32x2_t v246 = vadd_f32(v243, v20); + float32x2_t v254 = vmul_f32(v243, v253); + float32x2_t v258 = vmul_f32(v244, v257); + float32x2_t v265 = vmul_f32(v264, v263); + float32x2_t v271 = vrev64_f32(v245); + float32x2_t v279 = vmul_f32(v278, v277); + float32x2_t v296 = vadd_f32(v293, v32); + float32x2_t v309 = vrev64_f32(v293); + float32x2_t v316 = vrev64_f32(v294); + float32x2_t v325 = vmul_f32(v295, v324); + float32x2_t v146 = vadd_f32(v143, v33); + float32x2_t v154 = vmul_f32(v143, v253); + float32x2_t v158 = vmul_f32(v144, v257); + float32x2_t v165 = vmul_f32(v164, v263); + float32x2_t v171 = vrev64_f32(v145); + float32x2_t v179 = vmul_f32(v178, v277); + float32x2_t v196 = vadd_f32(v193, v34); + float32x2_t v204 = vmul_f32(v193, v253); + float32x2_t v208 = vmul_f32(v194, v257); + float32x2_t v215 = vmul_f32(v214, v263); + float32x2_t v221 = vrev64_f32(v195); + float32x2_t v229 = vmul_f32(v228, v277); + float32x2_t v272 = vmul_f32(v271, v270); + float32x2_t v280 = vadd_f32(v246, v254); + float32x2_t v302 = vrev64_f32(v296); + float32x2_t v310 = vmul_f32(v309, v308); + float32x2_t v317 = vmul_f32(v316, v315); + float32x2_t v333 = vsub_f32(v321, v325); + float32x2_t v334 = vadd_f32(v325, v329); + float32x2_t v172 = vmul_f32(v171, v270); + float32x2_t v180 = vadd_f32(v146, v154); + float32x2_t v222 = vmul_f32(v221, v270); + float32x2_t v230 = vadd_f32(v196, v204); + float32x2_t v281 = vadd_f32(v280, v258); + float32x2_t v282 = vsub_f32(v280, v258); + float32x2_t v283 =
vsub_f32(v265, v272); + float32x2_t v284 = vadd_f32(v272, v279); + float32x2_t v303 = vmul_f32(v302, v301); + v6[0] = v146; + v6[ostride * 10] = v196; + float32x2_t v181 = vadd_f32(v180, v158); + float32x2_t v182 = vsub_f32(v180, v158); + float32x2_t v183 = vsub_f32(v165, v172); + float32x2_t v184 = vadd_f32(v172, v179); + float32x2_t v231 = vadd_f32(v230, v208); + float32x2_t v232 = vsub_f32(v230, v208); + float32x2_t v233 = vsub_f32(v215, v222); + float32x2_t v234 = vadd_f32(v222, v229); + float32x2_t v285 = vadd_f32(v281, v283); + float32x2_t v286 = vsub_f32(v281, v283); + float32x2_t v287 = vadd_f32(v282, v284); + float32x2_t v288 = vsub_f32(v282, v284); + float32x2_t v330 = vadd_f32(v303, v310); + float32x2_t v339 = vadd_f32(v246, v303); + float32x2_t v340 = vsub_f32(v246, v303); + float32x2_t v185 = vadd_f32(v181, v183); + float32x2_t v186 = vsub_f32(v181, v183); + float32x2_t v187 = vadd_f32(v182, v184); + float32x2_t v188 = vsub_f32(v182, v184); + float32x2_t v235 = vadd_f32(v231, v233); + float32x2_t v236 = vsub_f32(v231, v233); + float32x2_t v237 = vadd_f32(v232, v234); + float32x2_t v238 = vsub_f32(v232, v234); + float32x2_t v331 = vadd_f32(v330, v317); + float32x2_t v332 = vsub_f32(v330, v317); + v6[ostride * 5] = v340; + v6[ostride * 15] = v339; + float32x2_t v335 = vadd_f32(v331, v333); + float32x2_t v336 = vsub_f32(v331, v333); + float32x2_t v337 = vadd_f32(v332, v334); + float32x2_t v338 = vsub_f32(v332, v334); + v6[ostride * 16] = v186; + v6[ostride * 6] = v236; + v6[ostride * 12] = v188; + v6[ostride * 2] = v238; + v6[ostride * 8] = v187; + v6[ostride * 18] = v237; + v6[ostride * 4] = v185; + v6[ostride * 14] = v235; + float32x2_t v361 = vadd_f32(v286, v336); + float32x2_t v362 = vsub_f32(v286, v336); + float32x2_t v383 = vadd_f32(v288, v338); + float32x2_t v384 = vsub_f32(v288, v338); + float32x2_t v405 = vadd_f32(v287, v337); + float32x2_t v406 = vsub_f32(v287, v337); + float32x2_t v427 = vadd_f32(v285, v335); + float32x2_t v428 = vsub_f32(v285,
v335); + v6[ostride] = v362; + v6[ostride * 11] = v361; + v6[ostride * 17] = v384; + v6[ostride * 7] = v383; + v6[ostride * 13] = v406; + v6[ostride * 3] = v405; + v6[ostride * 9] = v428; + v6[ostride * 19] = v427; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun20(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { /* SVE variant of the same 20-element kernel, compiled when ARMRAL_ARCH_SVE is defined. It performs 20 loads from x at multiples of istride and 20 stores to y at multiples of ostride, with both strides widened to int64_t (v0, v2) before the index arithmetic. Loads and stores go through svld1_f64 and svst1_f64 under pred_full = svptrue_pat_b32(SV_VL2), reinterpreting between f64 and f32, while plain adds, subtracts and multiplies use svptrue_b32(). dir is folded into the rotation constants as scalars before they are broadcast. howmany is not referenced anywhere in this body. */ + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v304 = -1.2500000000000000e+00F; + float v309 = 5.5901699437494745e-01F; + float v352 = -1.0000000000000000e+00F; + float v359 = 1.2500000000000000e+00F; + float v366 = -5.5901699437494745e-01F; + float v373 = -1.5388417685876268e+00F; + float v378 = -5.8778525229247325e-01F; + float v383 = -3.6327126400268028e-01F; + const float32x2_t *v715 = &v5[v0]; + float32x2_t *v803 = &v6[v2]; + int64_t v22 = v0 * 10; + int64_t v31 = v0 * 5; + int64_t v38 = v0 * 15; + int64_t v49 = v0 * 4; + int64_t v56 = v0 * 14; + int64_t v65 = v0 * 9; + int64_t v72 = v0 * 19; + int64_t v83 = v0 * 8; + int64_t v90 = v0 * 18; + int64_t v99 = v0 * 13; + int64_t v106 = v0 * 3; + int64_t v117 = v0 * 12; + int64_t v124 = v0 * 2; + int64_t v133 = v0 * 17; + int64_t v140 = v0 * 7; + int64_t v151 = v0 * 16; + int64_t v158 = v0 * 6; + int64_t v174 = v0 * 11; + float v317 = v4 * v373; + float v324 = v4 * v378; + float v331 = v4 * v383; + float v355 = v4 * v352; + float v362 = v4 * v359; + float v369 = v4 * v366; + int64_t v406 = v2 * 5; + int64_t v413 = v2 * 10; + int64_t v420 = v2 * 15; + int64_t v429 = v2 * 16; + int64_t v443 = v2 * 6; + int64_t v450 = v2 * 11; + int64_t v459 = v2 * 12; + int64_t v466 = v2 * 17; + int64_t v473 = v2 * 2; + int64_t v480 = v2 * 7; + int64_t v489 = v2 * 8; + int64_t v496 = v2 * 13; + int64_t v503 = v2 * 18; + int64_t v510 = v2 * 3; + int64_t v519 =
v2 * 4; + int64_t v526 = v2 * 9; + int64_t v533 = v2 * 14; + int64_t v540 = v2 * 19; + const float32x2_t *v553 = &v5[0]; + svfloat32_t v740 = svdup_n_f32(v304); + svfloat32_t v741 = svdup_n_f32(v309); + svfloat32_t v748 = svdup_n_f32(v373); + svfloat32_t v749 = svdup_n_f32(v378); + svfloat32_t v750 = svdup_n_f32(v383); + float32x2_t *v758 = &v6[0]; + svfloat32_t v969 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v715)[0])); + const float32x2_t *v562 = &v5[v22]; + const float32x2_t *v571 = &v5[v31]; + const float32x2_t *v580 = &v5[v38]; + const float32x2_t *v589 = &v5[v49]; + const float32x2_t *v598 = &v5[v56]; + const float32x2_t *v607 = &v5[v65]; + const float32x2_t *v616 = &v5[v72]; + const float32x2_t *v625 = &v5[v83]; + const float32x2_t *v634 = &v5[v90]; + const float32x2_t *v643 = &v5[v99]; + const float32x2_t *v652 = &v5[v106]; + const float32x2_t *v661 = &v5[v117]; + const float32x2_t *v670 = &v5[v124]; + const float32x2_t *v679 = &v5[v133]; + const float32x2_t *v688 = &v5[v140]; + const float32x2_t *v697 = &v5[v151]; + const float32x2_t *v706 = &v5[v158]; + const float32x2_t *v724 = &v5[v174]; + svfloat32_t v742 = svdup_n_f32(v317); + svfloat32_t v743 = svdup_n_f32(v324); + svfloat32_t v744 = svdup_n_f32(v331); + svfloat32_t v745 = svdup_n_f32(v355); + svfloat32_t v746 = svdup_n_f32(v362); + svfloat32_t v747 = svdup_n_f32(v369); + float32x2_t *v767 = &v6[v406]; + float32x2_t *v776 = &v6[v413]; + float32x2_t *v785 = &v6[v420]; + float32x2_t *v794 = &v6[v429]; + float32x2_t *v812 = &v6[v443]; + float32x2_t *v821 = &v6[v450]; + float32x2_t *v830 = &v6[v459]; + float32x2_t *v839 = &v6[v466]; + float32x2_t *v848 = &v6[v473]; + float32x2_t *v857 = &v6[v480]; + float32x2_t *v866 = &v6[v489]; + float32x2_t *v875 = &v6[v496]; + float32x2_t *v884 = &v6[v503]; + float32x2_t *v893 = &v6[v510]; + float32x2_t *v902 = &v6[v519]; + float32x2_t *v911 = &v6[v526]; + float32x2_t *v920 = &v6[v533]; + float32x2_t *v929 = &v6[v540]; + svfloat32_t v933 = +
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v553)[0])); + svfloat32_t v935 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v562)[0])); + svfloat32_t v937 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v571)[0])); + svfloat32_t v939 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v580)[0])); + svfloat32_t v941 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v589)[0])); + svfloat32_t v943 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v598)[0])); + svfloat32_t v945 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v607)[0])); + svfloat32_t v947 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v616)[0])); + svfloat32_t v949 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v625)[0])); + svfloat32_t v951 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v634)[0])); + svfloat32_t v953 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v643)[0])); + svfloat32_t v955 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v652)[0])); + svfloat32_t v957 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v661)[0])); + svfloat32_t v959 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v670)[0])); + svfloat32_t v961 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v679)[0])); + svfloat32_t v963 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v688)[0])); + svfloat32_t v965 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v697)[0])); + svfloat32_t v967 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v706)[0])); + svfloat32_t v971 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v724)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v933, v935); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v933, v935); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v937, v939); + svfloat32_t v45
= svsub_f32_x(svptrue_b32(), v937, v939); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v941, v943); + svfloat32_t v63 = svsub_f32_x(svptrue_b32(), v941, v943); + svfloat32_t v78 = svadd_f32_x(svptrue_b32(), v945, v947); + svfloat32_t v79 = svsub_f32_x(svptrue_b32(), v945, v947); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v949, v951); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v949, v951); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v953, v955); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v953, v955); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v957, v959); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v957, v959); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v961, v963); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v961, v963); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v965, v967); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v965, v967); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v969, v971); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v969, v971); + svfloat32_t v46 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v47 = svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v62, v78); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v62, v78); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v115 = svsub_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v130, v146); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v130, v146); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v164, v180); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v164, v180); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v63, v165); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v63, v165); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v131, v97); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v131, v97); + svfloat32_t v343 = svadd_f32_x(svptrue_b32(), v79, v181); + svfloat32_t v344 = svsub_f32_x(svptrue_b32(), v79, v181); + svfloat32_t v345 =
svadd_f32_x(svptrue_b32(), v147, v113); + svfloat32_t v346 = svsub_f32_x(svptrue_b32(), v147, v113); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v80, v182); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v80, v182); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v148, v114); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v148, v114); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v81, v183); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v81, v183); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v149, v115); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v149, v115); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v291, v293); /* svcmla_f32_x with rotation 90 is applied to a zeroed accumulator and a broadcast scalar c. NOTE(review): per my reading of the ACLE complex multiply-add rotation semantics this yields (-c*im, c*re) for an input (re, im), which matches the lane-swap and sign-pair multiply of the NEON variant; confirm against the ACLE specification. */ + svfloat32_t zero319 = svdup_n_f32(0); + svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v742, v291, 90); + svfloat32_t v347 = svadd_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v348 = svsub_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v349 = svadd_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v386 = svmul_f32_x(svptrue_b32(), v346, v750); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v185, v187); + svfloat32_t zero213 = svdup_n_f32(0); + svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v742, v185, 90); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v238, v240); + svfloat32_t zero266 = svdup_n_f32(0); + svfloat32_t v266 = svcmla_f32_x(pred_full, zero266, v742, v238, 90); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v294, v29); + svfloat32_t zero326 = svdup_n_f32(0); + svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v743, v296, 90); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v347, v45); + svfloat32_t zero371 = svdup_n_f32(0); + svfloat32_t
v371 = svcmla_f32_x(pred_full, zero371, v747, v348, 90); + svfloat32_t v381 = svmul_f32_x(svptrue_b32(), v349, v749); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v188, v46); + svfloat32_t zero220 = svdup_n_f32(0); + svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v743, v190, 90); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v241, v47); + svfloat32_t zero273 = svdup_n_f32(0); + svfloat32_t v273 = svcmla_f32_x(pred_full, zero273, v743, v243, 90); + svfloat32_t v334 = svmla_f32_x(pred_full, v297, v294, v740); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v319, v326); + svfloat32_t v338 = svcmla_f32_x(pred_full, v326, v744, v293, 90); + svfloat32_t zero357 = svdup_n_f32(0); + svfloat32_t v357 = svcmla_f32_x(pred_full, zero357, v745, v350, 90); + svfloat32_t v390 = svnmls_f32_x(pred_full, v381, v344, v748); + svfloat32_t v391 = svmla_f32_x(pred_full, v386, v349, v749); + svfloat32_t v228 = svmla_f32_x(pred_full, v191, v188, v740); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v213, v220); + svfloat32_t v232 = svcmla_f32_x(pred_full, v220, v744, v187, 90); + svfloat32_t v281 = svmla_f32_x(pred_full, v244, v241, v740); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v266, v273); + svfloat32_t v285 = svcmla_f32_x(pred_full, v273, v744, v240, 90); + svfloat32_t v335 = svmla_f32_x(pred_full, v334, v295, v741); + svfloat32_t v336 = svmls_f32_x(pred_full, v334, v295, v741); + svfloat32_t v387 = svcmla_f32_x(pred_full, v357, v746, v347, 90); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v297, v357); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v297, v357); + svst1_f64(pred_full, (double *)(v758), svreinterpret_f64_f32(v191)); + svst1_f64(pred_full, (double *)(v776), svreinterpret_f64_f32(v244)); + svfloat32_t v229 = svmla_f32_x(pred_full, v228, v189, v741); + svfloat32_t v230 = svmls_f32_x(pred_full, v228, v189, v741); + svfloat32_t v282 = svmla_f32_x(pred_full, v281, v242, v741); + svfloat32_t v283 = svmls_f32_x(pred_full, v281, v242, v741); + svfloat32_t
v339 = svadd_f32_x(svptrue_b32(), v335, v337); + svfloat32_t v340 = svsub_f32_x(svptrue_b32(), v335, v337); + svfloat32_t v341 = svadd_f32_x(svptrue_b32(), v336, v338); + svfloat32_t v342 = svsub_f32_x(svptrue_b32(), v336, v338); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v387, v371); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v387, v371); + svst1_f64(pred_full, (double *)(v767), svreinterpret_f64_f32(v397)); + svst1_f64(pred_full, (double *)(v785), svreinterpret_f64_f32(v396)); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v388, v390); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v388, v390); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v389, v391); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v389, v391); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v340, v393); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v340, v393); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v342, v395); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v342, v395); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v341, v394); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v341, v394); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v339, v392); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v339, v392); + svst1_f64(pred_full, (double *)(v794), svreinterpret_f64_f32(v234)); + svst1_f64(pred_full, (double *)(v812), svreinterpret_f64_f32(v287)); + svst1_f64(pred_full, (double *)(v830), svreinterpret_f64_f32(v236)); + svst1_f64(pred_full, (double *)(v848), svreinterpret_f64_f32(v289)); +
svst1_f64(pred_full, (double *)(v866), svreinterpret_f64_f32(v235)); + svst1_f64(pred_full, (double *)(v884), svreinterpret_f64_f32(v288)); + svst1_f64(pred_full, (double *)(v902), svreinterpret_f64_f32(v233)); + svst1_f64(pred_full, (double *)(v920), svreinterpret_f64_f32(v286)); + svst1_f64(pred_full, (double *)(v803), svreinterpret_f64_f32(v427)); + svst1_f64(pred_full, (double *)(v821), svreinterpret_f64_f32(v426)); + svst1_f64(pred_full, (double *)(v839), svreinterpret_f64_f32(v457)); + svst1_f64(pred_full, (double *)(v857), svreinterpret_f64_f32(v456)); + svst1_f64(pred_full, (double *)(v875), svreinterpret_f64_f32(v487)); + svst1_f64(pred_full, (double *)(v893), svreinterpret_f64_f32(v486)); + svst1_f64(pred_full, (double *)(v911), svreinterpret_f64_f32(v517)); + svst1_f64(pred_full, (double *)(v929), svreinterpret_f64_f32(v516)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun21(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v157 = -1.1666666666666665e+00F; + float v161 = 7.9015646852540022e-01F; + float v165 = 5.5854267289647742e-02F; + float v169 = 7.3430220123575241e-01F; + float v172 = 4.4095855184409838e-01F; + float v173 = -4.4095855184409838e-01F; + float v179 = 3.4087293062393137e-01F; + float v180 = -3.4087293062393137e-01F; + float v186 = -5.3396936033772524e-01F; + float v187 = 5.3396936033772524e-01F; + float v193 = 8.7484229096165667e-01F; + float v194 = -8.7484229096165667e-01F; + float v237 = -1.4999999999999998e+00F; + float v241 = 1.7499999999999996e+00F; + float v245 = -1.1852347027881001e+00F; + float v249 = -8.3781400934471603e-02F; + float v253 = -1.1014533018536286e+00F; + float v256 = -6.6143782776614746e-01F; + float v257 = 6.6143782776614746e-01F; + float v263 = -5.1130939593589697e-01F; + float v264 =
5.1130939593589697e-01F; + float v270 = 8.0095404050658769e-01F; + float v271 = -8.0095404050658769e-01F; + float v277 = -1.3122634364424848e+00F; + float v278 = 1.3122634364424848e+00F; + float v320 = 8.6602540378443871e-01F; + float v321 = -8.6602540378443871e-01F; + float v327 = -1.0103629710818451e+00F; + float v328 = 1.0103629710818451e+00F; + float v334 = 6.8429557470759583e-01F; + float v335 = -6.8429557470759583e-01F; + float v341 = 4.8371214382601155e-02F; + float v342 = -4.8371214382601155e-02F; + float v348 = 6.3592436032499466e-01F; + float v349 = -6.3592436032499466e-01F; + float v356 = -3.8188130791298663e-01F; + float v360 = -2.9520461738277515e-01F; + float v364 = 4.6243103089499693e-01F; + float v368 = -7.5763564827777208e-01F; + float32x2_t v25 = v5[0]; + float32x2_t v103 = v5[istride]; + float32x2_t v158 = (float32x2_t){v157, v157}; + float32x2_t v162 = (float32x2_t){v161, v161}; + float32x2_t v166 = (float32x2_t){v165, v165}; + float32x2_t v170 = (float32x2_t){v169, v169}; + float32x2_t v174 = (float32x2_t){v172, v173}; + float32x2_t v181 = (float32x2_t){v179, v180}; + float32x2_t v188 = (float32x2_t){v186, v187}; + float32x2_t v195 = (float32x2_t){v193, v194}; + float32x2_t v238 = (float32x2_t){v237, v237}; + float32x2_t v242 = (float32x2_t){v241, v241}; + float32x2_t v246 = (float32x2_t){v245, v245}; + float32x2_t v250 = (float32x2_t){v249, v249}; + float32x2_t v254 = (float32x2_t){v253, v253}; + float32x2_t v258 = (float32x2_t){v256, v257}; + float32x2_t v265 = (float32x2_t){v263, v264}; + float32x2_t v272 = (float32x2_t){v270, v271}; + float32x2_t v279 = (float32x2_t){v277, v278}; + float32x2_t v322 = (float32x2_t){v320, v321}; + float32x2_t v329 = (float32x2_t){v327, v328}; + float32x2_t v336 = (float32x2_t){v334, v335}; + float32x2_t v343 = (float32x2_t){v341, v342}; + float32x2_t v350 = (float32x2_t){v348, v349}; + float32x2_t v351 = (float32x2_t){v4, v4}; + float32x2_t v357 = (float32x2_t){v356, v356}; + float32x2_t v361 = 
(float32x2_t){v360, v360}; + float32x2_t v365 = (float32x2_t){v364, v364}; + float32x2_t v369 = (float32x2_t){v368, v368}; + float32x2_t v13 = v5[istride * 7]; + float32x2_t v18 = v5[istride * 14]; + float32x2_t v31 = v5[istride * 10]; + float32x2_t v36 = v5[istride * 17]; + float32x2_t v43 = v5[istride * 3]; + float32x2_t v49 = v5[istride * 13]; + float32x2_t v54 = v5[istride * 20]; + float32x2_t v61 = v5[istride * 6]; + float32x2_t v67 = v5[istride * 16]; + float32x2_t v72 = v5[istride * 2]; + float32x2_t v79 = v5[istride * 9]; + float32x2_t v85 = v5[istride * 19]; + float32x2_t v90 = v5[istride * 5]; + float32x2_t v97 = v5[istride * 12]; + float32x2_t v108 = v5[istride * 8]; + float32x2_t v115 = v5[istride * 15]; + float32x2_t v121 = v5[istride * 4]; + float32x2_t v126 = v5[istride * 11]; + float32x2_t v133 = v5[istride * 18]; + float32x2_t v176 = vmul_f32(v351, v174); + float32x2_t v183 = vmul_f32(v351, v181); + float32x2_t v190 = vmul_f32(v351, v188); + float32x2_t v197 = vmul_f32(v351, v195); + float32x2_t v260 = vmul_f32(v351, v258); + float32x2_t v267 = vmul_f32(v351, v265); + float32x2_t v274 = vmul_f32(v351, v272); + float32x2_t v281 = vmul_f32(v351, v279); + float32x2_t v324 = vmul_f32(v351, v322); + float32x2_t v331 = vmul_f32(v351, v329); + float32x2_t v338 = vmul_f32(v351, v336); + float32x2_t v345 = vmul_f32(v351, v343); + float32x2_t v352 = vmul_f32(v351, v350); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v37 = vadd_f32(v31, v36); + float32x2_t v38 = vsub_f32(v31, v36); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v73 = vadd_f32(v67, v72); + float32x2_t v74 = vsub_f32(v67, v72); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v109 = vadd_f32(v103, v108); + float32x2_t v110 = vsub_f32(v103, v108); + float32x2_t v127 = vadd_f32(v121, v126); + float32x2_t v128 = vsub_f32(v121, v126); + float32x2_t v26 = 
vadd_f32(v19, v25); + float32x2_t v44 = vadd_f32(v37, v43); + float32x2_t v62 = vadd_f32(v55, v61); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v116 = vadd_f32(v109, v115); + float32x2_t v134 = vadd_f32(v127, v133); + float32x2_t v219 = vadd_f32(v37, v127); + float32x2_t v220 = vsub_f32(v37, v127); + float32x2_t v221 = vadd_f32(v91, v73); + float32x2_t v222 = vsub_f32(v91, v73); + float32x2_t v223 = vadd_f32(v55, v109); + float32x2_t v224 = vsub_f32(v55, v109); + float32x2_t v303 = vadd_f32(v38, v128); + float32x2_t v304 = vsub_f32(v38, v128); + float32x2_t v305 = vadd_f32(v92, v74); + float32x2_t v306 = vsub_f32(v92, v74); + float32x2_t v307 = vadd_f32(v56, v110); + float32x2_t v308 = vsub_f32(v56, v110); + float32x2_t v135 = vadd_f32(v44, v134); + float32x2_t v136 = vsub_f32(v44, v134); + float32x2_t v137 = vadd_f32(v98, v80); + float32x2_t v138 = vsub_f32(v98, v80); + float32x2_t v139 = vadd_f32(v62, v116); + float32x2_t v140 = vsub_f32(v62, v116); + float32x2_t v225 = vadd_f32(v219, v221); + float32x2_t v228 = vsub_f32(v219, v221); + float32x2_t v229 = vsub_f32(v221, v223); + float32x2_t v230 = vsub_f32(v223, v219); + float32x2_t v231 = vadd_f32(v220, v222); + float32x2_t v233 = vsub_f32(v220, v222); + float32x2_t v234 = vsub_f32(v222, v224); + float32x2_t v235 = vsub_f32(v224, v220); + float32x2_t v309 = vadd_f32(v303, v305); + float32x2_t v312 = vsub_f32(v303, v305); + float32x2_t v313 = vsub_f32(v305, v307); + float32x2_t v314 = vsub_f32(v307, v303); + float32x2_t v315 = vadd_f32(v304, v306); + float32x2_t v317 = vsub_f32(v304, v306); + float32x2_t v318 = vsub_f32(v306, v308); + float32x2_t v319 = vsub_f32(v308, v304); + float32x2_t v141 = vadd_f32(v135, v137); + float32x2_t v144 = vsub_f32(v135, v137); + float32x2_t v145 = vsub_f32(v137, v139); + float32x2_t v146 = vsub_f32(v139, v135); + float32x2_t v147 = vadd_f32(v136, v138); + float32x2_t v149 = vsub_f32(v136, v138); + float32x2_t v150 = vsub_f32(v138, 
v140); + float32x2_t v151 = vsub_f32(v140, v136); + float32x2_t v226 = vadd_f32(v225, v223); + float32x2_t v232 = vadd_f32(v231, v224); + float32x2_t v247 = vmul_f32(v228, v246); + float32x2_t v251 = vmul_f32(v229, v250); + float32x2_t v255 = vmul_f32(v230, v254); + float32x2_t v268 = vrev64_f32(v233); + float32x2_t v275 = vrev64_f32(v234); + float32x2_t v282 = vrev64_f32(v235); + float32x2_t v310 = vadd_f32(v309, v307); + float32x2_t v316 = vadd_f32(v315, v308); + float32x2_t v339 = vrev64_f32(v312); + float32x2_t v346 = vrev64_f32(v313); + float32x2_t v353 = vrev64_f32(v314); + float32x2_t v362 = vmul_f32(v317, v361); + float32x2_t v366 = vmul_f32(v318, v365); + float32x2_t v370 = vmul_f32(v319, v369); + float32x2_t v142 = vadd_f32(v141, v139); + float32x2_t v148 = vadd_f32(v147, v140); + float32x2_t v163 = vmul_f32(v144, v162); + float32x2_t v167 = vmul_f32(v145, v166); + float32x2_t v171 = vmul_f32(v146, v170); + float32x2_t v184 = vrev64_f32(v149); + float32x2_t v191 = vrev64_f32(v150); + float32x2_t v198 = vrev64_f32(v151); + float32x2_t v227 = vadd_f32(v226, v19); + float32x2_t v243 = vmul_f32(v226, v242); + float32x2_t v261 = vrev64_f32(v232); + float32x2_t v269 = vmul_f32(v268, v267); + float32x2_t v276 = vmul_f32(v275, v274); + float32x2_t v283 = vmul_f32(v282, v281); + float32x2_t v311 = vadd_f32(v310, v20); + float32x2_t v332 = vrev64_f32(v310); + float32x2_t v340 = vmul_f32(v339, v338); + float32x2_t v347 = vmul_f32(v346, v345); + float32x2_t v354 = vmul_f32(v353, v352); + float32x2_t v358 = vmul_f32(v316, v357); + float32x2_t v143 = vadd_f32(v142, v26); + float32x2_t v159 = vmul_f32(v142, v158); + float32x2_t v177 = vrev64_f32(v148); + float32x2_t v185 = vmul_f32(v184, v183); + float32x2_t v192 = vmul_f32(v191, v190); + float32x2_t v199 = vmul_f32(v198, v197); + float32x2_t v239 = vmul_f32(v227, v238); + float32x2_t v262 = vmul_f32(v261, v260); + float32x2_t v325 = vrev64_f32(v311); + float32x2_t v333 = vmul_f32(v332, v331); + float32x2_t v378 = 
vadd_f32(v358, v362); + float32x2_t v380 = vsub_f32(v358, v362); + float32x2_t v382 = vsub_f32(v358, v366); + float32x2_t v178 = vmul_f32(v177, v176); + float32x2_t v200 = vadd_f32(v143, v159); + float32x2_t v284 = vadd_f32(v239, v243); + float32x2_t v291 = vadd_f32(v262, v269); + float32x2_t v293 = vsub_f32(v262, v269); + float32x2_t v295 = vsub_f32(v262, v276); + float32x2_t v326 = vmul_f32(v325, v324); + float32x2_t v379 = vadd_f32(v378, v366); + float32x2_t v381 = vsub_f32(v380, v370); + float32x2_t v383 = vadd_f32(v382, v370); + float32x2_t v390 = vadd_f32(v143, v239); + v6[0] = v143; + float32x2_t v201 = vadd_f32(v200, v163); + float32x2_t v203 = vsub_f32(v200, v163); + float32x2_t v205 = vsub_f32(v200, v167); + float32x2_t v207 = vadd_f32(v178, v185); + float32x2_t v209 = vsub_f32(v178, v185); + float32x2_t v211 = vsub_f32(v178, v192); + float32x2_t v285 = vadd_f32(v284, v247); + float32x2_t v287 = vsub_f32(v284, v247); + float32x2_t v289 = vsub_f32(v284, v251); + float32x2_t v292 = vadd_f32(v291, v276); + float32x2_t v294 = vsub_f32(v293, v283); + float32x2_t v296 = vadd_f32(v295, v283); + float32x2_t v371 = vadd_f32(v326, v333); + float32x2_t v391 = vadd_f32(v390, v326); + float32x2_t v392 = vsub_f32(v390, v326); + float32x2_t v202 = vadd_f32(v201, v167); + float32x2_t v204 = vsub_f32(v203, v171); + float32x2_t v206 = vadd_f32(v205, v171); + float32x2_t v208 = vadd_f32(v207, v192); + float32x2_t v210 = vsub_f32(v209, v199); + float32x2_t v212 = vadd_f32(v211, v199); + float32x2_t v286 = vadd_f32(v285, v251); + float32x2_t v288 = vsub_f32(v287, v255); + float32x2_t v290 = vadd_f32(v289, v255); + float32x2_t v372 = vadd_f32(v371, v340); + float32x2_t v374 = vsub_f32(v371, v340); + float32x2_t v376 = vsub_f32(v371, v347); + v6[ostride * 7] = v392; + v6[ostride * 14] = v391; + float32x2_t v213 = vadd_f32(v202, v208); + float32x2_t v214 = vsub_f32(v202, v208); + float32x2_t v215 = vadd_f32(v204, v210); + float32x2_t v216 = vsub_f32(v204, v210); + float32x2_t 
v217 = vadd_f32(v206, v212); + float32x2_t v218 = vsub_f32(v206, v212); + float32x2_t v297 = vadd_f32(v286, v292); + float32x2_t v298 = vsub_f32(v286, v292); + float32x2_t v299 = vadd_f32(v288, v294); + float32x2_t v300 = vsub_f32(v288, v294); + float32x2_t v301 = vadd_f32(v290, v296); + float32x2_t v302 = vsub_f32(v290, v296); + float32x2_t v373 = vadd_f32(v372, v347); + float32x2_t v375 = vsub_f32(v374, v354); + float32x2_t v377 = vadd_f32(v376, v354); + float32x2_t v384 = vadd_f32(v373, v379); + float32x2_t v385 = vsub_f32(v373, v379); + float32x2_t v386 = vadd_f32(v375, v381); + float32x2_t v387 = vsub_f32(v375, v381); + float32x2_t v388 = vadd_f32(v377, v383); + float32x2_t v389 = vsub_f32(v377, v383); + float32x2_t v408 = vadd_f32(v214, v298); + v6[ostride * 15] = v214; + float32x2_t v426 = vadd_f32(v216, v300); + v6[ostride * 9] = v216; + float32x2_t v444 = vadd_f32(v217, v301); + v6[ostride * 3] = v217; + float32x2_t v462 = vadd_f32(v218, v302); + v6[ostride * 18] = v218; + float32x2_t v480 = vadd_f32(v215, v299); + v6[ostride * 12] = v215; + float32x2_t v498 = vadd_f32(v213, v297); + v6[ostride * 6] = v213; + float32x2_t v409 = vadd_f32(v408, v385); + float32x2_t v410 = vsub_f32(v408, v385); + float32x2_t v427 = vadd_f32(v426, v387); + float32x2_t v428 = vsub_f32(v426, v387); + float32x2_t v445 = vadd_f32(v444, v388); + float32x2_t v446 = vsub_f32(v444, v388); + float32x2_t v463 = vadd_f32(v462, v389); + float32x2_t v464 = vsub_f32(v462, v389); + float32x2_t v481 = vadd_f32(v480, v386); + float32x2_t v482 = vsub_f32(v480, v386); + float32x2_t v499 = vadd_f32(v498, v384); + float32x2_t v500 = vsub_f32(v498, v384); + v6[ostride] = v410; + v6[ostride * 8] = v409; + v6[ostride * 16] = v428; + v6[ostride * 2] = v427; + v6[ostride * 10] = v446; + v6[ostride * 17] = v445; + v6[ostride * 4] = v464; + v6[ostride * 11] = v463; + v6[ostride * 19] = v482; + v6[ostride * 5] = v481; + v6[ostride * 13] = v500; + v6[ostride * 20] = v499; +} +#endif + +#ifdef 
ARMRAL_ARCH_SVE +/* SVE build (compiled when ARMRAL_ARCH_SVE is defined) of the kernel named for a 21-point FFT: reads the 21 elements x[k * istride] and writes the 21 elements y[k * ostride], k = 0..20, once per call. dir is multiplied into a subset of the scalar constants before they are broadcast. howmany is not referenced in this body. */ void armral_fft_cf32_cf32_cf32_ac_n_uun21(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + /* Governing predicate for every load and store and for the svcmla/svmla/svmls/svnmls calls; plain add/sub/mul use svptrue_b32(). */ svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v205 = -1.1666666666666665e+00F; + float v210 = 7.9015646852540022e-01F; + float v215 = 5.5854267289647742e-02F; + float v220 = 7.3430220123575241e-01F; + float v225 = -4.4095855184409838e-01F; + float v232 = -3.4087293062393137e-01F; + float v239 = 5.3396936033772524e-01F; + float v246 = -8.7484229096165667e-01F; + float v289 = -1.4999999999999998e+00F; + float v294 = 1.7499999999999996e+00F; + float v299 = -1.1852347027881001e+00F; + float v304 = -8.3781400934471603e-02F; + float v309 = -1.1014533018536286e+00F; + float v314 = 6.6143782776614746e-01F; + float v321 = 5.1130939593589697e-01F; + float v328 = -8.0095404050658769e-01F; + float v335 = 1.3122634364424848e+00F; + float v378 = -8.6602540378443871e-01F; + float v385 = 1.0103629710818451e+00F; + float v392 = -6.8429557470759583e-01F; + float v399 = -4.8371214382601155e-02F; + float v406 = -6.3592436032499466e-01F; + float v413 = -3.8188130791298663e-01F; + float v418 = -2.9520461738277515e-01F; + float v423 = 4.6243103089499693e-01F; + float v428 = -7.5763564827777208e-01F; + const float32x2_t *v761 = &v5[v0]; + float32x2_t *v879 = &v6[v2]; + /* Input element offsets (multiples of istride). */ int64_t v15 = v0 * 7; + int64_t v22 = v0 * 14; + int64_t v39 = v0 * 10; + int64_t v46 = v0 * 17; + int64_t v55 = v0 * 3; + int64_t v63 = v0 * 13; + int64_t v70 = v0 * 20; + int64_t v79 = v0 * 6; + int64_t v87 = v0 * 16; + int64_t v94 = v0 * 2; + int64_t v103 = v0 * 9; + int64_t v111 = v0 * 19; + int64_t v118 = v0 * 5; + int64_t v127 = v0 * 12; + int64_t v142 = v0 * 8; + int64_t v151 = v0 * 15; + int64_t v159 = v0 * 4; + int64_t v166 = v0 * 11; + int64_t v175 = v0 * 18; + float 
v228 = v4 * v225; + float v235 = v4 * v232; + float v242 = v4 * v239; + float v249 = v4 * v246; + float v317 = v4 * v314; + float v324 = v4 * v321; + float v331 = v4 * v328; + float v338 = v4 * v335; + float v381 = v4 * v378; + float v388 = v4 * v385; + float v395 = v4 * v392; + float v402 = v4 * v399; + float v409 = v4 * v406; + /* Output element offsets (multiples of ostride). */ int64_t v462 = v2 * 7; + int64_t v469 = v2 * 14; + int64_t v479 = v2 * 15; + int64_t v493 = v2 * 8; + int64_t v503 = v2 * 9; + int64_t v510 = v2 * 16; + int64_t v517 = v2 * 2; + int64_t v527 = v2 * 3; + int64_t v534 = v2 * 10; + int64_t v541 = v2 * 17; + int64_t v551 = v2 * 18; + int64_t v558 = v2 * 4; + int64_t v565 = v2 * 11; + int64_t v575 = v2 * 12; + int64_t v582 = v2 * 19; + int64_t v589 = v2 * 5; + int64_t v599 = v2 * 6; + int64_t v606 = v2 * 13; + int64_t v613 = v2 * 20; + const float32x2_t *v644 = &v5[0]; + svfloat32_t v810 = svdup_n_f32(v205); + svfloat32_t v811 = svdup_n_f32(v210); + svfloat32_t v812 = svdup_n_f32(v215); + svfloat32_t v813 = svdup_n_f32(v220); + svfloat32_t v818 = svdup_n_f32(v289); + svfloat32_t v819 = svdup_n_f32(v294); + svfloat32_t v820 = svdup_n_f32(v299); + svfloat32_t v821 = svdup_n_f32(v304); + svfloat32_t v822 = svdup_n_f32(v309); + svfloat32_t v832 = svdup_n_f32(v413); + svfloat32_t v833 = svdup_n_f32(v418); + svfloat32_t v834 = svdup_n_f32(v423); + svfloat32_t v835 = svdup_n_f32(v428); + float32x2_t *v843 = &v6[0]; + svfloat32_t v1057 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v761)[0])); + const float32x2_t *v625 = &v5[v15]; + const float32x2_t *v634 = &v5[v22]; + const float32x2_t *v653 = &v5[v39]; + const float32x2_t *v662 = &v5[v46]; + const float32x2_t *v671 = &v5[v55]; + const float32x2_t *v680 = &v5[v63]; + const float32x2_t *v689 = &v5[v70]; + const float32x2_t *v698 = &v5[v79]; + const float32x2_t *v707 = &v5[v87]; + const float32x2_t *v716 = &v5[v94]; + const float32x2_t *v725 = &v5[v103]; + const float32x2_t *v734 = &v5[v111]; + const float32x2_t *v743 = &v5[v118]; + 
const float32x2_t *v752 = &v5[v127]; + const float32x2_t *v770 = &v5[v142]; + const float32x2_t *v779 = &v5[v151]; + const float32x2_t *v788 = &v5[v159]; + const float32x2_t *v797 = &v5[v166]; + const float32x2_t *v806 = &v5[v175]; + svfloat32_t v814 = svdup_n_f32(v228); + svfloat32_t v815 = svdup_n_f32(v235); + svfloat32_t v816 = svdup_n_f32(v242); + svfloat32_t v817 = svdup_n_f32(v249); + svfloat32_t v823 = svdup_n_f32(v317); + svfloat32_t v824 = svdup_n_f32(v324); + svfloat32_t v825 = svdup_n_f32(v331); + svfloat32_t v826 = svdup_n_f32(v338); + svfloat32_t v827 = svdup_n_f32(v381); + svfloat32_t v828 = svdup_n_f32(v388); + svfloat32_t v829 = svdup_n_f32(v395); + svfloat32_t v830 = svdup_n_f32(v402); + svfloat32_t v831 = svdup_n_f32(v409); + float32x2_t *v852 = &v6[v462]; + float32x2_t *v861 = &v6[v469]; + float32x2_t *v870 = &v6[v479]; + float32x2_t *v888 = &v6[v493]; + float32x2_t *v897 = &v6[v503]; + float32x2_t *v906 = &v6[v510]; + float32x2_t *v915 = &v6[v517]; + float32x2_t *v924 = &v6[v527]; + float32x2_t *v933 = &v6[v534]; + float32x2_t *v942 = &v6[v541]; + float32x2_t *v951 = &v6[v551]; + float32x2_t *v960 = &v6[v558]; + float32x2_t *v969 = &v6[v565]; + float32x2_t *v978 = &v6[v575]; + float32x2_t *v987 = &v6[v582]; + float32x2_t *v996 = &v6[v589]; + float32x2_t *v1005 = &v6[v599]; + float32x2_t *v1014 = &v6[v606]; + float32x2_t *v1023 = &v6[v613]; + svfloat32_t v1031 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v644)[0])); + svfloat32_t v1027 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v625)[0])); + svfloat32_t v1029 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v634)[0])); + svfloat32_t v1033 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v653)[0])); + svfloat32_t v1035 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v662)[0])); + svfloat32_t v1037 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v671)[0])); + svfloat32_t v1039 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v680)[0])); + svfloat32_t v1041 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v689)[0])); + svfloat32_t v1043 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v698)[0])); + svfloat32_t v1045 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v707)[0])); + svfloat32_t v1047 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v716)[0])); + svfloat32_t v1049 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v725)[0])); + svfloat32_t v1051 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v734)[0])); + svfloat32_t v1053 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v743)[0])); + svfloat32_t v1055 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v752)[0])); + svfloat32_t v1059 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v770)[0])); + svfloat32_t v1061 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v779)[0])); + svfloat32_t v1063 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v788)[0])); + svfloat32_t v1065 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v797)[0])); + svfloat32_t v1067 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v806)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v1027, v1029); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v1027, v1029); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v1033, v1035); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v1033, v1035); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v1039, v1041); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v1039, v1041); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v1045, v1047); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v1045, v1047); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v1051, v1053); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v1051, v1053); + svfloat32_t v148 = 
svadd_f32_x(svptrue_b32(), v1057, v1059); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v1057, v1059); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v1063, v1065); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v1063, v1065); + svfloat32_t v37 = svadd_f32_x(svptrue_b32(), v28, v1031); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v1037); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v1043); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v100, v1049); + svfloat32_t v133 = svadd_f32_x(svptrue_b32(), v124, v1055); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v148, v1061); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v1067); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v52, v172); + svfloat32_t v272 = svsub_f32_x(svptrue_b32(), v52, v172); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v124, v100); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v124, v100); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v76, v148); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v76, v148); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v53, v173); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v53, v173); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v125, v101); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v125, v101); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v77, v149); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v77, v149); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v61, v181); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v61, v181); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v133, v109); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v133, v109); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v85, v157); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v85, v157); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v275, v271); + 
svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v276, v272); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v362, v364); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v364, v360); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v363, v365); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v365, v361); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v182, v184); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v182, v184); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v186, v182); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v185, v187); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v187, v183); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v277, v275); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v276); + /* Every svcmla in this function uses rotation 90; most start from a freshly zeroed accumulator, as here. */ svfloat32_t zero326 = svdup_n_f32(0); + svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v824, v285, 90); + svfloat32_t zero333 = svdup_n_f32(0); + svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v825, v286, 90); + svfloat32_t zero340 = svdup_n_f32(0); + svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v826, v287, 90); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v366, v364); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v372, v365); + svfloat32_t zero397 = svdup_n_f32(0); + svfloat32_t v397 = svcmla_f32_x(pred_full, zero397, v829, v369, 90); + svfloat32_t zero404 = svdup_n_f32(0); + svfloat32_t v404 = svcmla_f32_x(pred_full, zero404, 
v830, v370, 90); + svfloat32_t zero411 = svdup_n_f32(0); + svfloat32_t v411 = svcmla_f32_x(pred_full, zero411, v831, v371, 90); + svfloat32_t v421 = svmul_f32_x(svptrue_b32(), v374, v833); + svfloat32_t v426 = svmul_f32_x(svptrue_b32(), v375, v834); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v186); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v187); + svfloat32_t zero237 = svdup_n_f32(0); + svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v815, v196, 90); + svfloat32_t zero244 = svdup_n_f32(0); + svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v816, v197, 90); + svfloat32_t zero251 = svdup_n_f32(0); + svfloat32_t v251 = svcmla_f32_x(pred_full, zero251, v817, v198, 90); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v278, v28); + svfloat32_t v297 = svmul_f32_x(svptrue_b32(), v278, v819); + svfloat32_t zero319 = svdup_n_f32(0); + svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v823, v284, 90); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v367, v29); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v189, v37); + svfloat32_t zero230 = svdup_n_f32(0); + svfloat32_t v230 = svcmla_f32_x(pred_full, zero230, v814, v195, 90); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v319, v326); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v319, v326); + svfloat32_t v352 = svsub_f32_x(svptrue_b32(), v319, v333); + svfloat32_t zero383 = svdup_n_f32(0); + svfloat32_t v383 = svcmla_f32_x(pred_full, zero383, v827, v368, 90); + svfloat32_t v439 = svmla_f32_x(pred_full, v421, v373, v832); + svfloat32_t v441 = svnmls_f32_x(pred_full, v421, v373, v832); + svfloat32_t v443 = svnmls_f32_x(pred_full, v426, v373, v832); + svfloat32_t v252 = svmla_f32_x(pred_full, v190, v189, v810); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v230, v237); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v230, v237); + svfloat32_t v263 = svsub_f32_x(svptrue_b32(), v230, v244); + svfloat32_t v341 = svmla_f32_x(pred_full, v297, v279, v818); + svfloat32_t v349 = 
svadd_f32_x(svptrue_b32(), v348, v333); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v350, v340); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v352, v340); + svfloat32_t v432 = svcmla_f32_x(pred_full, v383, v828, v367, 90); + svfloat32_t v440 = svmla_f32_x(pred_full, v439, v375, v834); + svfloat32_t v442 = svmls_f32_x(pred_full, v441, v376, v835); + svfloat32_t v444 = svmla_f32_x(pred_full, v443, v376, v835); + svfloat32_t v451 = svmla_f32_x(pred_full, v190, v279, v818); + svst1_f64(pred_full, (double *)(v843), svreinterpret_f64_f32(v190)); + svfloat32_t v253 = svmla_f32_x(pred_full, v252, v191, v811); + svfloat32_t v255 = svmls_f32_x(pred_full, v252, v191, v811); + svfloat32_t v257 = svmls_f32_x(pred_full, v252, v192, v812); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v259, v244); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v261, v251); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v263, v251); + svfloat32_t v342 = svmla_f32_x(pred_full, v341, v280, v820); + svfloat32_t v344 = svmls_f32_x(pred_full, v341, v280, v820); + svfloat32_t v346 = svmls_f32_x(pred_full, v341, v281, v821); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v432, v397); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v432, v397); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v432, v404); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v451, v383); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v451, v383); + svfloat32_t v254 = svmla_f32_x(pred_full, v253, v192, v812); + svfloat32_t v256 = svmls_f32_x(pred_full, v255, v193, v813); + svfloat32_t v258 = svmla_f32_x(pred_full, v257, v193, v813); + svfloat32_t v343 = svmla_f32_x(pred_full, v342, v281, v821); + svfloat32_t v345 = svmls_f32_x(pred_full, v344, v282, v822); + svfloat32_t v347 = svmla_f32_x(pred_full, v346, v282, v822); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v433, v404); + svfloat32_t v436 = svsub_f32_x(svptrue_b32(), v435, v411); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v437, v411); + 
svst1_f64(pred_full, (double *)(v852), svreinterpret_f64_f32(v453)); + svst1_f64(pred_full, (double *)(v861), svreinterpret_f64_f32(v452)); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v254, v260); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v254, v260); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v256, v262); + svfloat32_t v268 = svsub_f32_x(svptrue_b32(), v256, v262); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v343, v349); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v343, v349); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v345, v351); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v345, v351); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v434, v440); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v434, v440); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v436, v442); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v436, v442); + svfloat32_t v449 = svadd_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v266, v355); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v268, v357); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v269, v358); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v270, v359); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v267, v356); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v265, v354); + svst1_f64(pred_full, (double *)(v870), svreinterpret_f64_f32(v266)); + svst1_f64(pred_full, (double *)(v897), svreinterpret_f64_f32(v268)); + svst1_f64(pred_full, (double *)(v924), svreinterpret_f64_f32(v269)); + svst1_f64(pred_full, (double *)(v951), svreinterpret_f64_f32(v270)); + svst1_f64(pred_full, (double *)(v978), svreinterpret_f64_f32(v267)); + svst1_f64(pred_full, (double 
*)(v1005), svreinterpret_f64_f32(v265)); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v475, v446); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v475, v446); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v499, v448); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v499, v448); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v523, v449); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v523, v449); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v547, v450); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v547, v450); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v571, v447); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), v571, v447); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v595, v445); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v595, v445); + svst1_f64(pred_full, (double *)(v879), svreinterpret_f64_f32(v477)); + svst1_f64(pred_full, (double *)(v888), svreinterpret_f64_f32(v476)); + svst1_f64(pred_full, (double *)(v906), svreinterpret_f64_f32(v501)); + svst1_f64(pred_full, (double *)(v915), svreinterpret_f64_f32(v500)); + svst1_f64(pred_full, (double *)(v933), svreinterpret_f64_f32(v525)); + svst1_f64(pred_full, (double *)(v942), svreinterpret_f64_f32(v524)); + svst1_f64(pred_full, (double *)(v960), svreinterpret_f64_f32(v549)); + svst1_f64(pred_full, (double *)(v969), svreinterpret_f64_f32(v548)); + svst1_f64(pred_full, (double *)(v987), svreinterpret_f64_f32(v573)); + svst1_f64(pred_full, (double *)(v996), svreinterpret_f64_f32(v572)); + svst1_f64(pred_full, (double *)(v1014), svreinterpret_f64_f32(v597)); + svst1_f64(pred_full, (double *)(v1023), svreinterpret_f64_f32(v596)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun22(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v381 = 1.1000000000000001e+00F; + float v384 = 
3.3166247903554003e-01F; + float v385 = -3.3166247903554003e-01F; + float v392 = 5.1541501300188641e-01F; + float v396 = 9.4125353283118118e-01F; + float v400 = 1.4143537075597825e+00F; + float v404 = 8.5949297361449750e-01F; + float v408 = 4.2314838273285138e-02F; + float v412 = 3.8639279888589606e-01F; + float v416 = 5.1254589567200015e-01F; + float v420 = 1.0702757469471715e+00F; + float v424 = 5.5486073394528512e-01F; + float v427 = 1.2412944743900585e+00F; + float v428 = -1.2412944743900585e+00F; + float v434 = 2.0897833842005756e-01F; + float v435 = -2.0897833842005756e-01F; + float v441 = 3.7415717312460811e-01F; + float v442 = -3.7415717312460811e-01F; + float v448 = 4.9929922194110327e-02F; + float v449 = -4.9929922194110327e-02F; + float v455 = 6.5815896284539266e-01F; + float v456 = -6.5815896284539266e-01F; + float v462 = 6.3306543373877577e-01F; + float v463 = -6.3306543373877577e-01F; + float v469 = 1.0822460581641109e+00F; + float v470 = -1.0822460581641109e+00F; + float v476 = 8.1720737907134022e-01F; + float v477 = -8.1720737907134022e-01F; + float v483 = 4.2408709531871824e-01F; + float v484 = -4.2408709531871824e-01F; + float32x2_t v13 = v5[0]; + float32x2_t v90 = v5[istride]; + float32x2_t v382 = (float32x2_t){v381, v381}; + float32x2_t v386 = (float32x2_t){v384, v385}; + float32x2_t v393 = (float32x2_t){v392, v392}; + float32x2_t v397 = (float32x2_t){v396, v396}; + float32x2_t v401 = (float32x2_t){v400, v400}; + float32x2_t v405 = (float32x2_t){v404, v404}; + float32x2_t v409 = (float32x2_t){v408, v408}; + float32x2_t v413 = (float32x2_t){v412, v412}; + float32x2_t v417 = (float32x2_t){v416, v416}; + float32x2_t v421 = (float32x2_t){v420, v420}; + float32x2_t v425 = (float32x2_t){v424, v424}; + float32x2_t v429 = (float32x2_t){v427, v428}; + float32x2_t v436 = (float32x2_t){v434, v435}; + float32x2_t v443 = (float32x2_t){v441, v442}; + float32x2_t v450 = (float32x2_t){v448, v449}; + float32x2_t v457 = (float32x2_t){v455, v456}; + float32x2_t 
v464 = (float32x2_t){v462, v463}; + float32x2_t v471 = (float32x2_t){v469, v470}; + float32x2_t v478 = (float32x2_t){v476, v477}; + float32x2_t v485 = (float32x2_t){v483, v484}; + float32x2_t v486 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 11]; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v30 = v5[istride * 13]; + float32x2_t v37 = v5[istride * 4]; + float32x2_t v42 = v5[istride * 15]; + float32x2_t v49 = v5[istride * 6]; + float32x2_t v54 = v5[istride * 17]; + float32x2_t v61 = v5[istride * 8]; + float32x2_t v66 = v5[istride * 19]; + float32x2_t v73 = v5[istride * 10]; + float32x2_t v78 = v5[istride * 21]; + float32x2_t v85 = v5[istride * 12]; + float32x2_t v97 = v5[istride * 14]; + float32x2_t v102 = v5[istride * 3]; + float32x2_t v109 = v5[istride * 16]; + float32x2_t v114 = v5[istride * 5]; + float32x2_t v121 = v5[istride * 18]; + float32x2_t v126 = v5[istride * 7]; + float32x2_t v133 = v5[istride * 20]; + float32x2_t v138 = v5[istride * 9]; + float32x2_t v388 = vmul_f32(v486, v386); + float32x2_t v431 = vmul_f32(v486, v429); + float32x2_t v438 = vmul_f32(v486, v436); + float32x2_t v445 = vmul_f32(v486, v443); + float32x2_t v452 = vmul_f32(v486, v450); + float32x2_t v459 = vmul_f32(v486, v457); + float32x2_t v466 = vmul_f32(v486, v464); + float32x2_t v473 = vmul_f32(v486, v471); + float32x2_t v480 = vmul_f32(v486, v478); + float32x2_t v487 = vmul_f32(v486, v485); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v43 = vadd_f32(v37, v42); + float32x2_t v44 = vsub_f32(v37, v42); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v67 = vadd_f32(v61, v66); + float32x2_t v68 = vsub_f32(v61, v66); + float32x2_t v79 = vadd_f32(v73, v78); + float32x2_t v80 = vsub_f32(v73, v78); + float32x2_t v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v103 = 
vadd_f32(v97, v102); + float32x2_t v104 = vsub_f32(v97, v102); + float32x2_t v115 = vadd_f32(v109, v114); + float32x2_t v116 = vsub_f32(v109, v114); + float32x2_t v127 = vadd_f32(v121, v126); + float32x2_t v128 = vsub_f32(v121, v126); + float32x2_t v139 = vadd_f32(v133, v138); + float32x2_t v140 = vsub_f32(v133, v138); + float32x2_t v141 = vadd_f32(v31, v139); + float32x2_t v142 = vadd_f32(v43, v127); + float32x2_t v143 = vadd_f32(v55, v115); + float32x2_t v144 = vadd_f32(v67, v103); + float32x2_t v145 = vadd_f32(v79, v91); + float32x2_t v146 = vsub_f32(v31, v139); + float32x2_t v147 = vsub_f32(v43, v127); + float32x2_t v148 = vsub_f32(v55, v115); + float32x2_t v149 = vsub_f32(v67, v103); + float32x2_t v150 = vsub_f32(v79, v91); + float32x2_t v339 = vadd_f32(v32, v140); + float32x2_t v340 = vadd_f32(v44, v128); + float32x2_t v341 = vadd_f32(v56, v116); + float32x2_t v342 = vadd_f32(v68, v104); + float32x2_t v343 = vadd_f32(v80, v92); + float32x2_t v344 = vsub_f32(v32, v140); + float32x2_t v345 = vsub_f32(v44, v128); + float32x2_t v346 = vsub_f32(v56, v116); + float32x2_t v347 = vsub_f32(v68, v104); + float32x2_t v348 = vsub_f32(v80, v92); + float32x2_t v151 = vadd_f32(v141, v142); + float32x2_t v152 = vadd_f32(v143, v145); + float32x2_t v154 = vsub_f32(v147, v148); + float32x2_t v155 = vadd_f32(v146, v150); + float32x2_t v160 = vsub_f32(v142, v144); + float32x2_t v161 = vsub_f32(v141, v144); + float32x2_t v162 = vsub_f32(v142, v141); + float32x2_t v163 = vsub_f32(v145, v144); + float32x2_t v164 = vsub_f32(v143, v144); + float32x2_t v165 = vsub_f32(v145, v143); + float32x2_t v166 = vsub_f32(v142, v145); + float32x2_t v167 = vsub_f32(v141, v143); + float32x2_t v169 = vadd_f32(v147, v149); + float32x2_t v170 = vsub_f32(v146, v149); + float32x2_t v171 = vadd_f32(v146, v147); + float32x2_t v172 = vsub_f32(v149, v150); + float32x2_t v173 = vsub_f32(v148, v149); + float32x2_t v174 = vsub_f32(v148, v150); + float32x2_t v175 = vadd_f32(v147, v150); + float32x2_t v176 = 
vsub_f32(v146, v148); + float32x2_t v349 = vadd_f32(v339, v340); + float32x2_t v350 = vadd_f32(v341, v343); + float32x2_t v352 = vsub_f32(v345, v346); + float32x2_t v353 = vadd_f32(v344, v348); + float32x2_t v358 = vsub_f32(v340, v342); + float32x2_t v359 = vsub_f32(v339, v342); + float32x2_t v360 = vsub_f32(v340, v339); + float32x2_t v361 = vsub_f32(v343, v342); + float32x2_t v362 = vsub_f32(v341, v342); + float32x2_t v363 = vsub_f32(v343, v341); + float32x2_t v364 = vsub_f32(v340, v343); + float32x2_t v365 = vsub_f32(v339, v341); + float32x2_t v367 = vadd_f32(v345, v347); + float32x2_t v368 = vsub_f32(v344, v347); + float32x2_t v369 = vadd_f32(v344, v345); + float32x2_t v370 = vsub_f32(v347, v348); + float32x2_t v371 = vsub_f32(v346, v347); + float32x2_t v372 = vsub_f32(v346, v348); + float32x2_t v373 = vadd_f32(v345, v348); + float32x2_t v374 = vsub_f32(v344, v346); + float32x2_t v153 = vadd_f32(v144, v151); + float32x2_t v158 = vsub_f32(v154, v155); + float32x2_t v168 = vsub_f32(v152, v151); + float32x2_t v177 = vadd_f32(v154, v155); + float32x2_t v196 = vmul_f32(v160, v393); + float32x2_t v200 = vmul_f32(v161, v397); + float32x2_t v204 = vmul_f32(v162, v401); + float32x2_t v208 = vmul_f32(v163, v405); + float32x2_t v212 = vmul_f32(v164, v409); + float32x2_t v216 = vmul_f32(v165, v413); + float32x2_t v220 = vmul_f32(v166, v417); + float32x2_t v224 = vmul_f32(v167, v421); + float32x2_t v234 = vrev64_f32(v169); + float32x2_t v241 = vrev64_f32(v170); + float32x2_t v248 = vrev64_f32(v171); + float32x2_t v255 = vrev64_f32(v172); + float32x2_t v262 = vrev64_f32(v173); + float32x2_t v269 = vrev64_f32(v174); + float32x2_t v276 = vrev64_f32(v175); + float32x2_t v283 = vrev64_f32(v176); + float32x2_t v351 = vadd_f32(v342, v349); + float32x2_t v356 = vsub_f32(v352, v353); + float32x2_t v366 = vsub_f32(v350, v349); + float32x2_t v375 = vadd_f32(v352, v353); + float32x2_t v394 = vmul_f32(v358, v393); + float32x2_t v398 = vmul_f32(v359, v397); + float32x2_t v402 = 
vmul_f32(v360, v401); + float32x2_t v406 = vmul_f32(v361, v405); + float32x2_t v410 = vmul_f32(v362, v409); + float32x2_t v414 = vmul_f32(v363, v413); + float32x2_t v418 = vmul_f32(v364, v417); + float32x2_t v422 = vmul_f32(v365, v421); + float32x2_t v432 = vrev64_f32(v367); + float32x2_t v439 = vrev64_f32(v368); + float32x2_t v446 = vrev64_f32(v369); + float32x2_t v453 = vrev64_f32(v370); + float32x2_t v460 = vrev64_f32(v371); + float32x2_t v467 = vrev64_f32(v372); + float32x2_t v474 = vrev64_f32(v373); + float32x2_t v481 = vrev64_f32(v374); + float32x2_t v156 = vadd_f32(v153, v152); + float32x2_t v159 = vsub_f32(v158, v149); + float32x2_t v228 = vmul_f32(v168, v425); + float32x2_t v235 = vmul_f32(v234, v431); + float32x2_t v242 = vmul_f32(v241, v438); + float32x2_t v249 = vmul_f32(v248, v445); + float32x2_t v256 = vmul_f32(v255, v452); + float32x2_t v263 = vmul_f32(v262, v459); + float32x2_t v270 = vmul_f32(v269, v466); + float32x2_t v277 = vmul_f32(v276, v473); + float32x2_t v284 = vmul_f32(v283, v480); + float32x2_t v290 = vrev64_f32(v177); + float32x2_t v293 = vadd_f32(v196, v200); + float32x2_t v294 = vadd_f32(v200, v204); + float32x2_t v295 = vsub_f32(v196, v204); + float32x2_t v296 = vadd_f32(v208, v212); + float32x2_t v297 = vadd_f32(v212, v216); + float32x2_t v298 = vsub_f32(v208, v216); + float32x2_t v354 = vadd_f32(v351, v350); + float32x2_t v357 = vsub_f32(v356, v347); + float32x2_t v426 = vmul_f32(v366, v425); + float32x2_t v433 = vmul_f32(v432, v431); + float32x2_t v440 = vmul_f32(v439, v438); + float32x2_t v447 = vmul_f32(v446, v445); + float32x2_t v454 = vmul_f32(v453, v452); + float32x2_t v461 = vmul_f32(v460, v459); + float32x2_t v468 = vmul_f32(v467, v466); + float32x2_t v475 = vmul_f32(v474, v473); + float32x2_t v482 = vmul_f32(v481, v480); + float32x2_t v488 = vrev64_f32(v375); + float32x2_t v491 = vadd_f32(v394, v398); + float32x2_t v492 = vadd_f32(v398, v402); + float32x2_t v493 = vsub_f32(v394, v402); + float32x2_t v494 = vadd_f32(v406, 
v410); + float32x2_t v495 = vadd_f32(v410, v414); + float32x2_t v496 = vsub_f32(v406, v414); + float32x2_t v157 = vadd_f32(v19, v156); + float32x2_t v185 = vmul_f32(v156, v382); + float32x2_t v191 = vrev64_f32(v159); + float32x2_t v291 = vmul_f32(v290, v487); + float32x2_t v299 = vadd_f32(v224, v228); + float32x2_t v300 = vadd_f32(v220, v228); + float32x2_t v301 = vadd_f32(v242, v249); + float32x2_t v302 = vsub_f32(v235, v249); + float32x2_t v303 = vadd_f32(v263, v270); + float32x2_t v304 = vsub_f32(v256, v270); + float32x2_t v355 = vadd_f32(v20, v354); + float32x2_t v383 = vmul_f32(v354, v382); + float32x2_t v389 = vrev64_f32(v357); + float32x2_t v489 = vmul_f32(v488, v487); + float32x2_t v497 = vadd_f32(v422, v426); + float32x2_t v498 = vadd_f32(v418, v426); + float32x2_t v499 = vadd_f32(v440, v447); + float32x2_t v500 = vsub_f32(v433, v447); + float32x2_t v501 = vadd_f32(v461, v468); + float32x2_t v502 = vsub_f32(v454, v468); + float32x2_t v192 = vmul_f32(v191, v388); + float32x2_t v292 = vsub_f32(v157, v185); + float32x2_t v305 = vadd_f32(v284, v291); + float32x2_t v306 = vsub_f32(v277, v291); + float32x2_t v307 = vadd_f32(v297, v299); + float32x2_t v325 = vadd_f32(v301, v302); + float32x2_t v390 = vmul_f32(v389, v388); + float32x2_t v490 = vsub_f32(v355, v383); + float32x2_t v503 = vadd_f32(v482, v489); + float32x2_t v504 = vsub_f32(v475, v489); + float32x2_t v505 = vadd_f32(v495, v497); + float32x2_t v523 = vadd_f32(v499, v500); + v6[0] = v157; + v6[ostride * 11] = v355; + float32x2_t v308 = vadd_f32(v307, v292); + float32x2_t v309 = vsub_f32(v292, v294); + float32x2_t v311 = vadd_f32(v292, v298); + float32x2_t v313 = vsub_f32(v292, v295); + float32x2_t v315 = vadd_f32(v292, v293); + float32x2_t v317 = vadd_f32(v192, v303); + float32x2_t v319 = vsub_f32(v305, v301); + float32x2_t v321 = vadd_f32(v192, v306); + float32x2_t v323 = vsub_f32(v306, v302); + float32x2_t v326 = vadd_f32(v325, v303); + float32x2_t v506 = vadd_f32(v505, v490); + float32x2_t v507 = 
vsub_f32(v490, v492); + float32x2_t v509 = vadd_f32(v490, v496); + float32x2_t v511 = vsub_f32(v490, v493); + float32x2_t v513 = vadd_f32(v490, v491); + float32x2_t v515 = vadd_f32(v390, v501); + float32x2_t v517 = vsub_f32(v503, v499); + float32x2_t v519 = vadd_f32(v390, v504); + float32x2_t v521 = vsub_f32(v504, v500); + float32x2_t v524 = vadd_f32(v523, v501); + float32x2_t v310 = vsub_f32(v309, v299); + float32x2_t v312 = vadd_f32(v311, v300); + float32x2_t v314 = vsub_f32(v313, v300); + float32x2_t v316 = vsub_f32(v315, v296); + float32x2_t v318 = vadd_f32(v317, v305); + float32x2_t v320 = vsub_f32(v319, v192); + float32x2_t v322 = vadd_f32(v321, v304); + float32x2_t v324 = vsub_f32(v323, v192); + float32x2_t v327 = vadd_f32(v326, v304); + float32x2_t v508 = vsub_f32(v507, v497); + float32x2_t v510 = vadd_f32(v509, v498); + float32x2_t v512 = vsub_f32(v511, v498); + float32x2_t v514 = vsub_f32(v513, v494); + float32x2_t v516 = vadd_f32(v515, v503); + float32x2_t v518 = vsub_f32(v517, v390); + float32x2_t v520 = vadd_f32(v519, v502); + float32x2_t v522 = vsub_f32(v521, v390); + float32x2_t v525 = vadd_f32(v524, v502); + float32x2_t v328 = vsub_f32(v327, v192); + float32x2_t v330 = vadd_f32(v308, v318); + float32x2_t v331 = vadd_f32(v310, v320); + float32x2_t v332 = vsub_f32(v312, v322); + float32x2_t v333 = vadd_f32(v314, v324); + float32x2_t v334 = vsub_f32(v314, v324); + float32x2_t v335 = vadd_f32(v312, v322); + float32x2_t v336 = vsub_f32(v310, v320); + float32x2_t v337 = vsub_f32(v308, v318); + float32x2_t v526 = vsub_f32(v525, v390); + float32x2_t v528 = vadd_f32(v506, v516); + float32x2_t v529 = vadd_f32(v508, v518); + float32x2_t v530 = vsub_f32(v510, v520); + float32x2_t v531 = vadd_f32(v512, v522); + float32x2_t v532 = vsub_f32(v512, v522); + float32x2_t v533 = vadd_f32(v510, v520); + float32x2_t v534 = vsub_f32(v508, v518); + float32x2_t v535 = vsub_f32(v506, v516); + float32x2_t v329 = vadd_f32(v316, v328); + float32x2_t v338 = vsub_f32(v316, v328); 
+ float32x2_t v527 = vadd_f32(v514, v526); + float32x2_t v536 = vsub_f32(v514, v526); + v6[ostride * 2] = v337; + v6[ostride * 13] = v535; + v6[ostride * 14] = v336; + v6[ostride * 3] = v534; + v6[ostride * 4] = v335; + v6[ostride * 15] = v533; + v6[ostride * 16] = v334; + v6[ostride * 5] = v532; + v6[ostride * 6] = v333; + v6[ostride * 17] = v531; + v6[ostride * 18] = v332; + v6[ostride * 7] = v530; + v6[ostride * 8] = v331; + v6[ostride * 19] = v529; + v6[ostride * 20] = v330; + v6[ostride * 9] = v528; + v6[ostride * 12] = v338; + v6[ostride] = v536; + v6[ostride * 10] = v329; + v6[ostride * 21] = v527; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun22(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v442 = 1.1000000000000001e+00F; + float v447 = -3.3166247903554003e-01F; + float v454 = 5.1541501300188641e-01F; + float v459 = 9.4125353283118118e-01F; + float v464 = 1.4143537075597825e+00F; + float v469 = 8.5949297361449750e-01F; + float v474 = 4.2314838273285138e-02F; + float v479 = 3.8639279888589606e-01F; + float v484 = 5.1254589567200015e-01F; + float v489 = 1.0702757469471715e+00F; + float v494 = 5.5486073394528512e-01F; + float v499 = -1.2412944743900585e+00F; + float v506 = -2.0897833842005756e-01F; + float v513 = -3.7415717312460811e-01F; + float v520 = -4.9929922194110327e-02F; + float v527 = -6.5815896284539266e-01F; + float v534 = -6.3306543373877577e-01F; + float v541 = -1.0822460581641109e+00F; + float v548 = -8.1720737907134022e-01F; + float v555 = -4.2408709531871824e-01F; + const float32x2_t *v886 = &v5[v0]; + float32x2_t *v1037 = &v6[v2]; + int64_t v22 = v0 * 11; + int64_t v31 = v0 * 2; + int64_t v38 = v0 * 13; + int64_t v47 = v0 * 4; 
+ int64_t v54 = v0 * 15; + int64_t v63 = v0 * 6; + int64_t v70 = v0 * 17; + int64_t v79 = v0 * 8; + int64_t v86 = v0 * 19; + int64_t v95 = v0 * 10; + int64_t v102 = v0 * 21; + int64_t v111 = v0 * 12; + int64_t v127 = v0 * 14; + int64_t v134 = v0 * 3; + int64_t v143 = v0 * 16; + int64_t v150 = v0 * 5; + int64_t v159 = v0 * 18; + int64_t v166 = v0 * 7; + int64_t v175 = v0 * 20; + int64_t v182 = v0 * 9; + float v450 = v4 * v447; + float v502 = v4 * v499; + float v509 = v4 * v506; + float v516 = v4 * v513; + float v523 = v4 * v520; + float v530 = v4 * v527; + float v537 = v4 * v534; + float v544 = v4 * v541; + float v551 = v4 * v548; + float v558 = v4 * v555; + int64_t v616 = v2 * 11; + int64_t v623 = v2 * 12; + int64_t v637 = v2 * 2; + int64_t v644 = v2 * 13; + int64_t v651 = v2 * 14; + int64_t v658 = v2 * 3; + int64_t v665 = v2 * 4; + int64_t v672 = v2 * 15; + int64_t v679 = v2 * 16; + int64_t v686 = v2 * 5; + int64_t v693 = v2 * 6; + int64_t v700 = v2 * 17; + int64_t v707 = v2 * 18; + int64_t v714 = v2 * 7; + int64_t v721 = v2 * 8; + int64_t v728 = v2 * 19; + int64_t v735 = v2 * 20; + int64_t v742 = v2 * 9; + int64_t v749 = v2 * 10; + int64_t v756 = v2 * 21; + const float32x2_t *v769 = &v5[0]; + svfloat32_t v983 = svdup_n_f32(v442); + svfloat32_t v985 = svdup_n_f32(v454); + svfloat32_t v986 = svdup_n_f32(v459); + svfloat32_t v987 = svdup_n_f32(v464); + svfloat32_t v988 = svdup_n_f32(v469); + svfloat32_t v989 = svdup_n_f32(v474); + svfloat32_t v990 = svdup_n_f32(v479); + svfloat32_t v991 = svdup_n_f32(v484); + svfloat32_t v992 = svdup_n_f32(v489); + svfloat32_t v993 = svdup_n_f32(v494); + float32x2_t *v1010 = &v6[0]; + svfloat32_t v1229 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v886)[0])); + const float32x2_t *v778 = &v5[v22]; + const float32x2_t *v787 = &v5[v31]; + const float32x2_t *v796 = &v5[v38]; + const float32x2_t *v805 = &v5[v47]; + const float32x2_t *v814 = &v5[v54]; + const float32x2_t *v823 = &v5[v63]; + const float32x2_t *v832 = 
&v5[v70]; + const float32x2_t *v841 = &v5[v79]; + const float32x2_t *v850 = &v5[v86]; + const float32x2_t *v859 = &v5[v95]; + const float32x2_t *v868 = &v5[v102]; + const float32x2_t *v877 = &v5[v111]; + const float32x2_t *v895 = &v5[v127]; + const float32x2_t *v904 = &v5[v134]; + const float32x2_t *v913 = &v5[v143]; + const float32x2_t *v922 = &v5[v150]; + const float32x2_t *v931 = &v5[v159]; + const float32x2_t *v940 = &v5[v166]; + const float32x2_t *v949 = &v5[v175]; + const float32x2_t *v958 = &v5[v182]; + svfloat32_t v984 = svdup_n_f32(v450); + svfloat32_t v994 = svdup_n_f32(v502); + svfloat32_t v995 = svdup_n_f32(v509); + svfloat32_t v996 = svdup_n_f32(v516); + svfloat32_t v997 = svdup_n_f32(v523); + svfloat32_t v998 = svdup_n_f32(v530); + svfloat32_t v999 = svdup_n_f32(v537); + svfloat32_t v1000 = svdup_n_f32(v544); + svfloat32_t v1001 = svdup_n_f32(v551); + svfloat32_t v1002 = svdup_n_f32(v558); + float32x2_t *v1019 = &v6[v616]; + float32x2_t *v1028 = &v6[v623]; + float32x2_t *v1046 = &v6[v637]; + float32x2_t *v1055 = &v6[v644]; + float32x2_t *v1064 = &v6[v651]; + float32x2_t *v1073 = &v6[v658]; + float32x2_t *v1082 = &v6[v665]; + float32x2_t *v1091 = &v6[v672]; + float32x2_t *v1100 = &v6[v679]; + float32x2_t *v1109 = &v6[v686]; + float32x2_t *v1118 = &v6[v693]; + float32x2_t *v1127 = &v6[v700]; + float32x2_t *v1136 = &v6[v707]; + float32x2_t *v1145 = &v6[v714]; + float32x2_t *v1154 = &v6[v721]; + float32x2_t *v1163 = &v6[v728]; + float32x2_t *v1172 = &v6[v735]; + float32x2_t *v1181 = &v6[v742]; + float32x2_t *v1190 = &v6[v749]; + float32x2_t *v1199 = &v6[v756]; + svfloat32_t v1203 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v769)[0])); + svfloat32_t v1205 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v778)[0])); + svfloat32_t v1207 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v787)[0])); + svfloat32_t v1209 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v796)[0])); + svfloat32_t 
v1211 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v805)[0])); + svfloat32_t v1213 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v814)[0])); + svfloat32_t v1215 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v823)[0])); + svfloat32_t v1217 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v832)[0])); + svfloat32_t v1219 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v841)[0])); + svfloat32_t v1221 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v850)[0])); + svfloat32_t v1223 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v859)[0])); + svfloat32_t v1225 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v868)[0])); + svfloat32_t v1227 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v877)[0])); + svfloat32_t v1231 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v895)[0])); + svfloat32_t v1233 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v904)[0])); + svfloat32_t v1235 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v913)[0])); + svfloat32_t v1237 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v922)[0])); + svfloat32_t v1239 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v931)[0])); + svfloat32_t v1241 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v940)[0])); + svfloat32_t v1243 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v949)[0])); + svfloat32_t v1245 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v958)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v1203, v1205); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v1203, v1205); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v1207, v1209); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v1207, v1209); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v1211, v1213); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v1211, 
v1213); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v1215, v1217); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v1215, v1217); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v1219, v1221); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v1219, v1221); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v1223, v1225); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v1223, v1225); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v1227, v1229); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v1227, v1229); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v1231, v1233); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v1231, v1233); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v1235, v1237); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v1235, v1237); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v1239, v1241); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v1239, v1241); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v1243, v1245); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v1243, v1245); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v44, v188); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v60, v172); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v76, v156); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v92, v140); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v108, v124); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v44, v188); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v60, v172); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v76, v156); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v92, v140); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v108, v124); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v45, v189); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v61, v173); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v77, v157); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v93, v141); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v109, v125); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v45, v189); + svfloat32_t v405 = 
svsub_f32_x(svptrue_b32(), v61, v173); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v77, v157); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v93, v141); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v109, v125); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v190, v191); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v192, v194); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v196, v197); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v195, v199); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v190, v193); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v191, v190); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v194, v193); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v192, v193); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v194, v192); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v191, v194); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v190, v192); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v195, v198); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v195, v196); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v198, v199); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v197, v198); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v197, v199); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v196, v199); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v399, v400); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v406); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v404, v408); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v399, v402); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v400, v399); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v403, v402); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v401, v402); + 
svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v403, v401); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v400, v403); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v399, v401); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v404, v407); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v404, v405); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v407, v408); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v406, v407); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v406, v408); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v405, v408); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v193, v200); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v203, v204); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v201, v200); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v203, v204); + svfloat32_t v253 = svmul_f32_x(svptrue_b32(), v210, v986); + svfloat32_t v258 = svmul_f32_x(svptrue_b32(), v211, v987); + svfloat32_t v268 = svmul_f32_x(svptrue_b32(), v213, v989); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v214, v990); + svfloat32_t zero295 = svdup_n_f32(0); + svfloat32_t v295 = svcmla_f32_x(pred_full, zero295, v994, v218, 90); + svfloat32_t zero309 = svdup_n_f32(0); + svfloat32_t v309 = svcmla_f32_x(pred_full, zero309, v996, v220, 90); + svfloat32_t zero316 = svdup_n_f32(0); + svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v997, v221, 90); + svfloat32_t zero330 = svdup_n_f32(0); + svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v999, v223, 90); + svfloat32_t zero337 = svdup_n_f32(0); + svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v1000, v224, 90); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v402, v409); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v412, v413); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v410, v409); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v412, v413); + svfloat32_t v462 = 
svmul_f32_x(svptrue_b32(), v419, v986); + svfloat32_t v467 = svmul_f32_x(svptrue_b32(), v420, v987); + svfloat32_t v477 = svmul_f32_x(svptrue_b32(), v422, v989); + svfloat32_t v482 = svmul_f32_x(svptrue_b32(), v423, v990); + svfloat32_t zero504 = svdup_n_f32(0); + svfloat32_t v504 = svcmla_f32_x(pred_full, zero504, v994, v427, 90); + svfloat32_t zero518 = svdup_n_f32(0); + svfloat32_t v518 = svcmla_f32_x(pred_full, zero518, v996, v429, 90); + svfloat32_t zero525 = svdup_n_f32(0); + svfloat32_t v525 = svcmla_f32_x(pred_full, zero525, v997, v430, 90); + svfloat32_t zero539 = svdup_n_f32(0); + svfloat32_t v539 = svcmla_f32_x(pred_full, zero539, v999, v432, 90); + svfloat32_t zero546 = svdup_n_f32(0); + svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v1000, v433, 90); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v202, v201); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v207, v198); + svfloat32_t v288 = svmul_f32_x(svptrue_b32(), v217, v993); + svfloat32_t zero351 = svdup_n_f32(0); + svfloat32_t v351 = svcmla_f32_x(pred_full, zero351, v1002, v226, 90); + svfloat32_t v353 = svmla_f32_x(pred_full, v253, v209, v985); + svfloat32_t v354 = svmla_f32_x(pred_full, v258, v210, v986); + svfloat32_t v355 = svnmls_f32_x(pred_full, v258, v209, v985); + svfloat32_t v356 = svmla_f32_x(pred_full, v268, v212, v988); + svfloat32_t v357 = svmla_f32_x(pred_full, v273, v213, v989); + svfloat32_t v358 = svnmls_f32_x(pred_full, v273, v212, v988); + svfloat32_t v361 = svcmla_f32_x(pred_full, v309, v995, v219, 90); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), v295, v309); + svfloat32_t v363 = svcmla_f32_x(pred_full, v330, v998, v222, 90); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v316, v330); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v411, v410); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v416, v407); + svfloat32_t v497 = svmul_f32_x(svptrue_b32(), v426, v993); + svfloat32_t zero560 = svdup_n_f32(0); + svfloat32_t v560 = svcmla_f32_x(pred_full, zero560, v1002, 
v435, 90); + svfloat32_t v562 = svmla_f32_x(pred_full, v462, v418, v985); + svfloat32_t v563 = svmla_f32_x(pred_full, v467, v419, v986); + svfloat32_t v564 = svnmls_f32_x(pred_full, v467, v418, v985); + svfloat32_t v565 = svmla_f32_x(pred_full, v477, v421, v988); + svfloat32_t v566 = svmla_f32_x(pred_full, v482, v422, v989); + svfloat32_t v567 = svnmls_f32_x(pred_full, v482, v421, v988); + svfloat32_t v570 = svcmla_f32_x(pred_full, v518, v995, v428, 90); + svfloat32_t v571 = svsub_f32_x(svptrue_b32(), v504, v518); + svfloat32_t v572 = svcmla_f32_x(pred_full, v539, v998, v431, 90); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), v525, v539); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v28, v205); + svfloat32_t zero243 = svdup_n_f32(0); + svfloat32_t v243 = svcmla_f32_x(pred_full, zero243, v984, v208, 90); + svfloat32_t v359 = svmla_f32_x(pred_full, v288, v216, v992); + svfloat32_t v360 = svmla_f32_x(pred_full, v288, v215, v991); + svfloat32_t v365 = svcmla_f32_x(pred_full, v351, v1001, v225, 90); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v337, v351); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v361, v362); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v29, v414); + svfloat32_t zero452 = svdup_n_f32(0); + svfloat32_t v452 = svcmla_f32_x(pred_full, zero452, v984, v417, 90); + svfloat32_t v568 = svmla_f32_x(pred_full, v497, v425, v992); + svfloat32_t v569 = svmla_f32_x(pred_full, v497, v424, v991); + svfloat32_t v574 = svcmla_f32_x(pred_full, v560, v1001, v434, 90); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v546, v560); + svfloat32_t v594 = svadd_f32_x(svptrue_b32(), v570, v571); + svfloat32_t v352 = svmls_f32_x(pred_full, v206, v205, v983); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v357, v359); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v243, v363); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v365, v361); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v243, v366); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v366, v362); + 
svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v385, v363); + svfloat32_t v561 = svmls_f32_x(pred_full, v415, v414, v983); + svfloat32_t v576 = svadd_f32_x(svptrue_b32(), v566, v568); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v452, v572); + svfloat32_t v588 = svsub_f32_x(svptrue_b32(), v574, v570); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v452, v575); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v575, v571); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v572); + svst1_f64(pred_full, (double *)(v1010), svreinterpret_f64_f32(v206)); + svst1_f64(pred_full, (double *)(v1019), svreinterpret_f64_f32(v415)); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v367, v352); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v352, v358); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v352, v355); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v352, v353); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v377, v365); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v379, v243); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v381, v364); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v383, v243); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v386, v364); + svfloat32_t v577 = svadd_f32_x(svptrue_b32(), v576, v561); + svfloat32_t v578 = svsub_f32_x(svptrue_b32(), v561, v563); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v561, v567); + svfloat32_t v582 = svsub_f32_x(svptrue_b32(), v561, v564); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v561, v562); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v586, v574); + svfloat32_t v589 = svsub_f32_x(svptrue_b32(), v588, v452); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v573); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v452); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v595, v573); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v369, v359); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v371, v360); + svfloat32_t v374 = 
svsub_f32_x(svptrue_b32(), v373, v360); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v375, v356); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v387, v243); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v368, v378); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v368, v378); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v578, v568); + svfloat32_t v581 = svadd_f32_x(svptrue_b32(), v580, v569); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v582, v569); + svfloat32_t v585 = svsub_f32_x(svptrue_b32(), v584, v565); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v596, v452); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v577, v587); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v577, v587); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v376, v388); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v370, v380); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v372, v382); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v372, v382); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v370, v380); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v376, v388); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v585, v597); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v579, v589); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v579, v589); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v585, v597); + svst1_f64(pred_full, (double *)(v1046), svreinterpret_f64_f32(v397)); + svst1_f64(pred_full, (double *)(v1055), svreinterpret_f64_f32(v606)); + svst1_f64(pred_full, (double *)(v1172), svreinterpret_f64_f32(v390)); + svst1_f64(pred_full, (double *)(v1181), svreinterpret_f64_f32(v599)); + svst1_f64(pred_full, 
(double *)(v1028), svreinterpret_f64_f32(v398)); + svst1_f64(pred_full, (double *)(v1037), svreinterpret_f64_f32(v607)); + svst1_f64(pred_full, (double *)(v1064), svreinterpret_f64_f32(v396)); + svst1_f64(pred_full, (double *)(v1073), svreinterpret_f64_f32(v605)); + svst1_f64(pred_full, (double *)(v1082), svreinterpret_f64_f32(v395)); + svst1_f64(pred_full, (double *)(v1091), svreinterpret_f64_f32(v604)); + svst1_f64(pred_full, (double *)(v1100), svreinterpret_f64_f32(v394)); + svst1_f64(pred_full, (double *)(v1109), svreinterpret_f64_f32(v603)); + svst1_f64(pred_full, (double *)(v1118), svreinterpret_f64_f32(v393)); + svst1_f64(pred_full, (double *)(v1127), svreinterpret_f64_f32(v602)); + svst1_f64(pred_full, (double *)(v1136), svreinterpret_f64_f32(v392)); + svst1_f64(pred_full, (double *)(v1145), svreinterpret_f64_f32(v601)); + svst1_f64(pred_full, (double *)(v1154), svreinterpret_f64_f32(v391)); + svst1_f64(pred_full, (double *)(v1163), svreinterpret_f64_f32(v600)); + svst1_f64(pred_full, (double *)(v1190), svreinterpret_f64_f32(v389)); + svst1_f64(pred_full, (double *)(v1199), svreinterpret_f64_f32(v598)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* NEON build (ARMRAL_ARCH_SVE not defined) of the uun24 kernel. x and y are reinterpreted as float32x2_t arrays. The body reads 24 elements from x at offsets 0, istride, ..., 23 * istride, combines them in straight-line add / subtract / multiply stages, and writes 24 elements to y at offsets 0, ostride, ..., 23 * ostride. dir is broadcast and multiplied into the opposite-sign coefficient pairs that are applied to lane-swapped (vrev64_f32) operands. howmany is not referenced in this body. NOTE(review): presumably one length-24 DFT with dir selecting the transform direction - confirm against the kernel generator. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun24(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; /* Scalar coefficients: approximately +/-1, +/-sqrt(1/2), +/-3/2, +/-sqrt(3)/2, plus 1.0606... (about 1.5 * sqrt(1/2)) and 0.6123... (about sqrt(3)/2 * sqrt(1/2)). */ + float v192 = 1.0000000000000000e+00F; + float v193 = -1.0000000000000000e+00F; + float v200 = -7.0710678118654746e-01F; + float v207 = 7.0710678118654757e-01F; + float v259 = -1.4999999999999998e+00F; + float v260 = 1.4999999999999998e+00F; + float v267 = 1.0606601717798210e+00F; + float v274 = -1.0606601717798212e+00F; + float v328 = 8.6602540378443871e-01F; + float v336 = -8.6602540378443871e-01F; + float v343 = 6.1237243569579458e-01F; + float v344 = -6.1237243569579458e-01F; + float32x2_t v25 = v5[0]; + float32x2_t v72 = v5[istride]; +
float32x2_t v194 = (float32x2_t){v192, v193}; + float32x2_t v201 = (float32x2_t){v207, v200}; + float32x2_t v208 = (float32x2_t){v207, v207}; + float32x2_t v257 = (float32x2_t){v259, v259}; + float32x2_t v261 = (float32x2_t){v259, v260}; + float32x2_t v268 = (float32x2_t){v274, v267}; + float32x2_t v275 = (float32x2_t){v274, v274}; + float32x2_t v330 = (float32x2_t){v328, v336}; + float32x2_t v337 = (float32x2_t){v336, v336}; + float32x2_t v341 = (float32x2_t){v344, v344}; + float32x2_t v345 = (float32x2_t){v343, v344}; + float32x2_t v346 = (float32x2_t){v4, v4}; /* Remaining 22 inputs; together with v5[0] and v5[istride] above, every index 0..23 (times istride) is read exactly once. */ + float32x2_t v13 = v5[istride * 8]; + float32x2_t v18 = v5[istride * 16]; + float32x2_t v31 = v5[istride * 11]; + float32x2_t v36 = v5[istride * 19]; + float32x2_t v43 = v5[istride * 3]; + float32x2_t v49 = v5[istride * 14]; + float32x2_t v54 = v5[istride * 22]; + float32x2_t v61 = v5[istride * 6]; + float32x2_t v67 = v5[istride * 17]; + float32x2_t v79 = v5[istride * 9]; + float32x2_t v85 = v5[istride * 20]; + float32x2_t v90 = v5[istride * 4]; + float32x2_t v97 = v5[istride * 12]; + float32x2_t v103 = v5[istride * 23]; + float32x2_t v108 = v5[istride * 7]; + float32x2_t v115 = v5[istride * 15]; + float32x2_t v121 = v5[istride * 2]; + float32x2_t v126 = v5[istride * 10]; + float32x2_t v133 = v5[istride * 18]; + float32x2_t v139 = v5[istride * 5]; + float32x2_t v144 = v5[istride * 13]; + float32x2_t v151 = v5[istride * 21]; /* Scale each opposite-sign coefficient pair by dir (v346 = {dir, dir}). The scaled pairs are only ever multiplied with vrev64_f32 results below; assuming lane 0 holds the real part, that is a multiplication by an imaginary constant whose sign follows dir. */ + float32x2_t v196 = vmul_f32(v346, v194); + float32x2_t v203 = vmul_f32(v346, v201); + float32x2_t v263 = vmul_f32(v346, v261); + float32x2_t v270 = vmul_f32(v346, v268); + float32x2_t v332 = vmul_f32(v346, v330); + float32x2_t v347 = vmul_f32(v346, v345); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v37 = vadd_f32(v31, v36); + float32x2_t v38 = vsub_f32(v31, v36); + float32x2_t v55 = vadd_f32(v49, v54); + float32x2_t v56 = vsub_f32(v49, v54); + float32x2_t v73 = vadd_f32(v67, v72); + float32x2_t v74 = vsub_f32(v67, v72); + float32x2_t
v91 = vadd_f32(v85, v90); + float32x2_t v92 = vsub_f32(v85, v90); + float32x2_t v109 = vadd_f32(v103, v108); + float32x2_t v110 = vsub_f32(v103, v108); + float32x2_t v127 = vadd_f32(v121, v126); + float32x2_t v128 = vsub_f32(v121, v126); + float32x2_t v145 = vadd_f32(v139, v144); + float32x2_t v146 = vsub_f32(v139, v144); + float32x2_t v26 = vadd_f32(v19, v25); + float32x2_t v44 = vadd_f32(v37, v43); + float32x2_t v62 = vadd_f32(v55, v61); + float32x2_t v80 = vadd_f32(v73, v79); + float32x2_t v98 = vadd_f32(v91, v97); + float32x2_t v116 = vadd_f32(v109, v115); + float32x2_t v134 = vadd_f32(v127, v133); + float32x2_t v152 = vadd_f32(v145, v151); + float32x2_t v220 = vadd_f32(v19, v91); + float32x2_t v221 = vsub_f32(v19, v91); + float32x2_t v222 = vadd_f32(v55, v127); + float32x2_t v223 = vsub_f32(v55, v127); + float32x2_t v224 = vadd_f32(v37, v109); + float32x2_t v225 = vsub_f32(v37, v109); + float32x2_t v226 = vadd_f32(v73, v145); + float32x2_t v227 = vsub_f32(v73, v145); + float32x2_t v287 = vadd_f32(v20, v92); + float32x2_t v288 = vsub_f32(v20, v92); + float32x2_t v289 = vadd_f32(v56, v128); + float32x2_t v290 = vsub_f32(v56, v128); + float32x2_t v291 = vadd_f32(v38, v110); + float32x2_t v292 = vsub_f32(v38, v110); + float32x2_t v293 = vadd_f32(v74, v146); + float32x2_t v294 = vsub_f32(v74, v146); + float32x2_t v153 = vadd_f32(v26, v98); + float32x2_t v154 = vsub_f32(v26, v98); + float32x2_t v155 = vadd_f32(v62, v134); + float32x2_t v156 = vsub_f32(v62, v134); + float32x2_t v157 = vadd_f32(v44, v116); + float32x2_t v158 = vsub_f32(v44, v116); + float32x2_t v159 = vadd_f32(v80, v152); + float32x2_t v160 = vsub_f32(v80, v152); + float32x2_t v228 = vadd_f32(v220, v222); + float32x2_t v229 = vsub_f32(v220, v222); + float32x2_t v230 = vadd_f32(v224, v226); + float32x2_t v231 = vsub_f32(v224, v226); + float32x2_t v234 = vadd_f32(v225, v227); + float32x2_t v235 = vsub_f32(v225, v227); + float32x2_t v258 = vmul_f32(v221, v257); + float32x2_t v264 = vrev64_f32(v223); +
float32x2_t v295 = vadd_f32(v287, v289); + float32x2_t v296 = vsub_f32(v287, v289); + float32x2_t v297 = vadd_f32(v291, v293); + float32x2_t v298 = vsub_f32(v291, v293); + float32x2_t v301 = vadd_f32(v292, v294); + float32x2_t v302 = vsub_f32(v292, v294); + float32x2_t v333 = vrev64_f32(v288); + float32x2_t v338 = vmul_f32(v290, v337); + float32x2_t v161 = vadd_f32(v153, v155); + float32x2_t v162 = vsub_f32(v153, v155); + float32x2_t v163 = vadd_f32(v157, v159); + float32x2_t v164 = vsub_f32(v157, v159); + float32x2_t v167 = vadd_f32(v158, v160); + float32x2_t v168 = vsub_f32(v158, v160); + float32x2_t v197 = vrev64_f32(v156); + float32x2_t v232 = vadd_f32(v228, v230); + float32x2_t v233 = vsub_f32(v228, v230); + float32x2_t v247 = vmul_f32(v229, v257); + float32x2_t v253 = vrev64_f32(v231); + float32x2_t v265 = vmul_f32(v264, v263); + float32x2_t v271 = vrev64_f32(v234); + float32x2_t v276 = vmul_f32(v235, v275); + float32x2_t v299 = vadd_f32(v295, v297); + float32x2_t v300 = vsub_f32(v295, v297); + float32x2_t v322 = vrev64_f32(v296); + float32x2_t v327 = vmul_f32(v298, v337); + float32x2_t v334 = vmul_f32(v333, v332); + float32x2_t v342 = vmul_f32(v301, v341); + float32x2_t v348 = vrev64_f32(v302); + float32x2_t v165 = vadd_f32(v161, v163); + float32x2_t v166 = vsub_f32(v161, v163); + float32x2_t v186 = vrev64_f32(v164); + float32x2_t v198 = vmul_f32(v197, v196); + float32x2_t v204 = vrev64_f32(v167); + float32x2_t v209 = vmul_f32(v168, v208); + float32x2_t v239 = vmul_f32(v232, v257); + float32x2_t v243 = vmul_f32(v233, v257); + float32x2_t v254 = vmul_f32(v253, v263); + float32x2_t v272 = vmul_f32(v271, v270); + float32x2_t v279 = vadd_f32(v258, v276); + float32x2_t v280 = vsub_f32(v258, v276); + float32x2_t v308 = vrev64_f32(v299); + float32x2_t v315 = vrev64_f32(v300); + float32x2_t v323 = vmul_f32(v322, v332); + float32x2_t v349 = vmul_f32(v348, v347); + float32x2_t v354 = vadd_f32(v338, v342); + float32x2_t v355 = vsub_f32(v338, v342); + float32x2_t v187 =
vmul_f32(v186, v196); + float32x2_t v205 = vmul_f32(v204, v203); + float32x2_t v212 = vadd_f32(v154, v209); + float32x2_t v213 = vsub_f32(v154, v209); + float32x2_t v277 = vadd_f32(v247, v254); + float32x2_t v278 = vsub_f32(v247, v254); + float32x2_t v281 = vadd_f32(v265, v272); + float32x2_t v282 = vsub_f32(v265, v272); + float32x2_t v309 = vmul_f32(v308, v332); + float32x2_t v316 = vmul_f32(v315, v332); + float32x2_t v350 = vadd_f32(v323, v327); + float32x2_t v351 = vsub_f32(v323, v327); + float32x2_t v352 = vadd_f32(v334, v349); + float32x2_t v353 = vsub_f32(v334, v349); + float32x2_t v360 = vadd_f32(v165, v239); /* From here on, results are written to y as they become final, interleaved with the remaining arithmetic; every index 0..23 (times ostride) is written exactly once. */ + v6[0] = v165; + float32x2_t v432 = vadd_f32(v166, v243); + v6[ostride * 12] = v166; + float32x2_t v210 = vadd_f32(v162, v187); + float32x2_t v211 = vsub_f32(v162, v187); + float32x2_t v214 = vadd_f32(v198, v205); + float32x2_t v215 = vsub_f32(v198, v205); + float32x2_t v283 = vadd_f32(v279, v281); + float32x2_t v284 = vsub_f32(v279, v281); + float32x2_t v285 = vadd_f32(v280, v282); + float32x2_t v286 = vsub_f32(v280, v282); + float32x2_t v356 = vadd_f32(v352, v354); + float32x2_t v357 = vsub_f32(v352, v354); + float32x2_t v358 = vadd_f32(v353, v355); + float32x2_t v359 = vsub_f32(v353, v355); + float32x2_t v361 = vadd_f32(v360, v309); + float32x2_t v362 = vsub_f32(v360, v309); + float32x2_t v433 = vadd_f32(v432, v316); + float32x2_t v434 = vsub_f32(v432, v316); + float32x2_t v216 = vadd_f32(v212, v214); + float32x2_t v217 = vsub_f32(v212, v214); + float32x2_t v218 = vadd_f32(v213, v215); + float32x2_t v219 = vsub_f32(v213, v215); + v6[ostride * 16] = v362; + v6[ostride * 8] = v361; + float32x2_t v396 = vadd_f32(v211, v278); + v6[ostride * 18] = v211; + v6[ostride * 4] = v434; + v6[ostride * 20] = v433; + float32x2_t v468 = vadd_f32(v210, v277); + v6[ostride * 6] = v210; + float32x2_t v378 = vadd_f32(v217, v284); + v6[ostride * 9] = v217; + float32x2_t v397 = vadd_f32(v396, v351); + float32x2_t v398 = vsub_f32(v396, v351); + float32x2_t v414 =
vadd_f32(v218, v285); + v6[ostride * 3] = v218; + float32x2_t v450 = vadd_f32(v219, v286); + v6[ostride * 21] = v219; + float32x2_t v469 = vadd_f32(v468, v350); + float32x2_t v470 = vsub_f32(v468, v350); + float32x2_t v486 = vadd_f32(v216, v283); + v6[ostride * 15] = v216; + float32x2_t v379 = vadd_f32(v378, v357); + float32x2_t v380 = vsub_f32(v378, v357); + v6[ostride * 10] = v398; + v6[ostride * 2] = v397; + float32x2_t v415 = vadd_f32(v414, v358); + float32x2_t v416 = vsub_f32(v414, v358); + float32x2_t v451 = vadd_f32(v450, v359); + float32x2_t v452 = vsub_f32(v450, v359); + v6[ostride * 22] = v470; + v6[ostride * 14] = v469; + float32x2_t v487 = vadd_f32(v486, v356); + float32x2_t v488 = vsub_f32(v486, v356); + v6[ostride] = v380; + v6[ostride * 17] = v379; + v6[ostride * 19] = v416; + v6[ostride * 11] = v415; + v6[ostride * 13] = v452; + v6[ostride * 5] = v451; + v6[ostride * 7] = v488; + v6[ostride * 23] = v487; +} +#endif + +#ifdef ARMRAL_ARCH_SVE /* SVE build (ARMRAL_ARCH_SVE defined) of armral_fft_cf32_cf32_cf32_ac_n_uun24, with the same signature as the NEON build. Element offsets are precomputed as int64_t multiples of istride / ostride. Each of the 24 inputs is loaded with svld1_f64 through a (const double *) cast and reinterpreted as svfloat32_t; each of the 24 outputs is stored with svst1_f64 the same way. howmany is not referenced in this body. NOTE(review): expected to produce the same results as the NEON build - not verified here. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun24(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); /* pred_full (pattern SV_VL2 on 32-bit lanes) governs the loads, the stores and the svcmla / svmla / svmls / svnmls calls; plain svadd / svsub / svmul use svptrue_b32(). */ + float v250 = -1.0000000000000000e+00F; + float v257 = -7.0710678118654746e-01F; + float v264 = 7.0710678118654757e-01F; + float v317 = -1.4999999999999998e+00F; + float v322 = 1.4999999999999998e+00F; + float v329 = 1.0606601717798210e+00F; + float v336 = -1.0606601717798212e+00F; + float v400 = -8.6602540378443871e-01F; + float v410 = -6.1237243569579458e-01F; + const float32x2_t *v715 = &v5[v0]; + float32x2_t *v902 = &v6[v2]; + int64_t v15 = v0 * 8; + int64_t v22 = v0 * 16; + int64_t v39 = v0 * 11; + int64_t v46 = v0 * 19; + int64_t v55 = v0 * 3; + int64_t v63 = v0 * 14; + int64_t v70 = v0 * 22; + int64_t v79 = v0 * 6; + int64_t v87 = v0 * 17;
+ int64_t v103 = v0 * 9; + int64_t v111 = v0 * 20; + int64_t v118 = v0 * 4; + int64_t v127 = v0 * 12; + int64_t v135 = v0 * 23; + int64_t v142 = v0 * 7; + int64_t v151 = v0 * 15; + int64_t v159 = v0 * 2; + int64_t v166 = v0 * 10; + int64_t v175 = v0 * 18; + int64_t v183 = v0 * 5; + int64_t v190 = v0 * 13; + int64_t v199 = v0 * 21; /* dir (v4) is folded into the scalar coefficients that are later broadcast and passed to the svcmla_f32_x(..., 90) calls; the coefficients used by svmla / svmls / svnmls / svmul are broadcast unscaled. */ + float v253 = v4 * v250; + float v260 = v4 * v257; + float v325 = v4 * v322; + float v332 = v4 * v329; + float v396 = v4 * v400; + float v413 = v4 * v410; + int64_t v437 = v2 * 16; + int64_t v444 = v2 * 8; + int64_t v454 = v2 * 9; + int64_t v468 = v2 * 17; + int64_t v478 = v2 * 18; + int64_t v485 = v2 * 10; + int64_t v492 = v2 * 2; + int64_t v502 = v2 * 3; + int64_t v509 = v2 * 19; + int64_t v516 = v2 * 11; + int64_t v526 = v2 * 12; + int64_t v533 = v2 * 4; + int64_t v540 = v2 * 20; + int64_t v550 = v2 * 21; + int64_t v557 = v2 * 13; + int64_t v564 = v2 * 5; + int64_t v574 = v2 * 6; + int64_t v581 = v2 * 22; + int64_t v588 = v2 * 14; + int64_t v598 = v2 * 15; + int64_t v605 = v2 * 7; + int64_t v612 = v2 * 23; + const float32x2_t *v643 = &v5[0]; + svfloat32_t v842 = svdup_n_f32(v264); + svfloat32_t v847 = svdup_n_f32(v317); + svfloat32_t v850 = svdup_n_f32(v336); + svfloat32_t v856 = svdup_n_f32(v400); + svfloat32_t v857 = svdup_n_f32(v410); + float32x2_t *v866 = &v6[0]; + svfloat32_t v1097 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v715)[0])); + const float32x2_t *v624 = &v5[v15]; + const float32x2_t *v633 = &v5[v22]; + const float32x2_t *v652 = &v5[v39]; + const float32x2_t *v661 = &v5[v46]; + const float32x2_t *v670 = &v5[v55]; + const float32x2_t *v679 = &v5[v63]; + const float32x2_t *v688 = &v5[v70]; + const float32x2_t *v697 = &v5[v79]; + const float32x2_t *v706 = &v5[v87]; + const float32x2_t *v724 = &v5[v103]; + const float32x2_t *v733 = &v5[v111]; + const float32x2_t *v742 = &v5[v118]; + const float32x2_t *v751 = &v5[v127]; + const float32x2_t *v760 = &v5[v135]; + const float32x2_t *v769 = &v5[v142]; + const
float32x2_t *v778 = &v5[v151]; + const float32x2_t *v787 = &v5[v159]; + const float32x2_t *v796 = &v5[v166]; + const float32x2_t *v805 = &v5[v175]; + const float32x2_t *v814 = &v5[v183]; + const float32x2_t *v823 = &v5[v190]; + const float32x2_t *v832 = &v5[v199]; + svfloat32_t v840 = svdup_n_f32(v253); + svfloat32_t v841 = svdup_n_f32(v260); + svfloat32_t v848 = svdup_n_f32(v325); + svfloat32_t v849 = svdup_n_f32(v332); + svfloat32_t v855 = svdup_n_f32(v396); + svfloat32_t v858 = svdup_n_f32(v413); + float32x2_t *v875 = &v6[v437]; + float32x2_t *v884 = &v6[v444]; + float32x2_t *v893 = &v6[v454]; + float32x2_t *v911 = &v6[v468]; + float32x2_t *v920 = &v6[v478]; + float32x2_t *v929 = &v6[v485]; + float32x2_t *v938 = &v6[v492]; + float32x2_t *v947 = &v6[v502]; + float32x2_t *v956 = &v6[v509]; + float32x2_t *v965 = &v6[v516]; + float32x2_t *v974 = &v6[v526]; + float32x2_t *v983 = &v6[v533]; + float32x2_t *v992 = &v6[v540]; + float32x2_t *v1001 = &v6[v550]; + float32x2_t *v1010 = &v6[v557]; + float32x2_t *v1019 = &v6[v564]; + float32x2_t *v1028 = &v6[v574]; + float32x2_t *v1037 = &v6[v581]; + float32x2_t *v1046 = &v6[v588]; + float32x2_t *v1055 = &v6[v598]; + float32x2_t *v1064 = &v6[v605]; + float32x2_t *v1073 = &v6[v612]; + svfloat32_t v1081 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v643)[0])); + svfloat32_t v1077 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v624)[0])); + svfloat32_t v1079 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v633)[0])); + svfloat32_t v1083 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v652)[0])); + svfloat32_t v1085 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v661)[0])); + svfloat32_t v1087 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v670)[0])); + svfloat32_t v1089 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v679)[0])); + svfloat32_t v1091 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const
double *)v688)[0])); + svfloat32_t v1093 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v697)[0])); + svfloat32_t v1095 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v706)[0])); + svfloat32_t v1099 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v724)[0])); + svfloat32_t v1101 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v733)[0])); + svfloat32_t v1103 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v742)[0])); + svfloat32_t v1105 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v751)[0])); + svfloat32_t v1107 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v760)[0])); + svfloat32_t v1109 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v769)[0])); + svfloat32_t v1111 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v778)[0])); + svfloat32_t v1113 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v787)[0])); + svfloat32_t v1115 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v796)[0])); + svfloat32_t v1117 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v805)[0])); + svfloat32_t v1119 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v814)[0])); + svfloat32_t v1121 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v823)[0])); + svfloat32_t v1123 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v832)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v1077, v1079); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v1077, v1079); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v1083, v1085); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v1083, v1085); + svfloat32_t v76 = svadd_f32_x(svptrue_b32(), v1089, v1091); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v1089, v1091); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v1095, v1097); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v1095, v1097); + svfloat32_t v124 =
svadd_f32_x(svptrue_b32(), v1101, v1103); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v1101, v1103); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v1107, v1109); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v1107, v1109); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v1113, v1115); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v1113, v1115); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v1119, v1121); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v1119, v1121); + svfloat32_t v37 = svadd_f32_x(svptrue_b32(), v28, v1081); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v1087); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v1093); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v100, v1099); + svfloat32_t v133 = svadd_f32_x(svptrue_b32(), v124, v1105); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v148, v1111); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v1117); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v196, v1123); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v28, v124); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v28, v124); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v76, v172); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v76, v172); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v52, v148); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v52, v148); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v100, v196); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v100, v196); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v29, v125); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v29, v125); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v77, v173); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v77, v173); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v53, v149); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v53, v149); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v101, v197); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v101, v197); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v37, v133); +
svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v37, v133); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v85, v181); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v85, v181); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v61, v157); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v61, v157); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v109, v205); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v109, v205); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v278, v280); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v278, v280); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v283, v285); /* Pattern repeated below: svcmla_f32_x with a freshly zeroed accumulator and rotation 90, applied to a dir-scaled broadcast coefficient and a data vector. */ + svfloat32_t zero327 = svdup_n_f32(0); + svfloat32_t v327 = svcmla_f32_x(pred_full, zero327, v848, v281, 90); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v350, v352); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v350, v352); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v355, v357); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v355, v357); + svfloat32_t zero398 = svdup_n_f32(0); + svfloat32_t v398 = svcmla_f32_x(pred_full, zero398, v855, v351, 90); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v211, v213); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v211, v213); + svfloat32_t zero255 = svdup_n_f32(0); + svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v840, v209, 90); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t zero315 =
svdup_n_f32(0); + svfloat32_t v315 = svcmla_f32_x(pred_full, zero315, v848, v289, 90); + svfloat32_t zero334 = svdup_n_f32(0); + svfloat32_t v334 = svcmla_f32_x(pred_full, zero334, v849, v292, 90); + svfloat32_t v339 = svmul_f32_x(svptrue_b32(), v293, v850); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v358, v360); + svfloat32_t zero386 = svdup_n_f32(0); + svfloat32_t v386 = svcmla_f32_x(pred_full, zero386, v855, v359, 90); + svfloat32_t v408 = svmul_f32_x(svptrue_b32(), v364, v857); + svfloat32_t zero415 = svdup_n_f32(0); + svfloat32_t v415 = svcmla_f32_x(pred_full, zero415, v858, v365, 90); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v214, v216); + svfloat32_t zero243 = svdup_n_f32(0); + svfloat32_t v243 = svcmla_f32_x(pred_full, zero243, v840, v217, 90); + svfloat32_t zero262 = svdup_n_f32(0); + svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v841, v220, 90); + svfloat32_t v340 = svmla_f32_x(pred_full, v315, v287, v847); + svfloat32_t v341 = svnmls_f32_x(pred_full, v315, v287, v847); + svfloat32_t v342 = svmla_f32_x(pred_full, v339, v279, v847); + svfloat32_t v343 = svnmls_f32_x(pred_full, v339, v279, v847); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v327, v334); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v327, v334); + svfloat32_t zero372 = svdup_n_f32(0); + svfloat32_t v372 = svcmla_f32_x(pred_full, zero372, v855, v362, 90); + svfloat32_t zero379 = svdup_n_f32(0); + svfloat32_t v379 = svcmla_f32_x(pred_full, zero379, v855, v363, 90); + svfloat32_t v416 = svmla_f32_x(pred_full, v386, v361, v856); + svfloat32_t v417 = svmls_f32_x(pred_full, v386, v361, v856); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v398, v415); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v398, v415); + svfloat32_t v420 = svmla_f32_x(pred_full, v408, v353, v856); + svfloat32_t v421 = svnmls_f32_x(pred_full, v408, v353, v856); + svfloat32_t v268
= svadd_f32_x(svptrue_b32(), v215, v243); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v215, v243); + svfloat32_t v270 = svmla_f32_x(pred_full, v207, v221, v842); + svfloat32_t v271 = svmls_f32_x(pred_full, v207, v221, v842); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v342, v344); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v342, v344); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v343, v345); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v418, v420); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v418, v420); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v419, v421); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v419, v421); + svfloat32_t v426 = svmla_f32_x(pred_full, v218, v290, v847); + svfloat32_t v522 = svmla_f32_x(pred_full, v219, v291, v847); /* From here on, each of the 24 output pointers (v866 ... v1073) is stored exactly once, interleaved with the remaining arithmetic. */ + svst1_f64(pred_full, (double *)(v866), svreinterpret_f64_f32(v218)); + svst1_f64(pred_full, (double *)(v974), svreinterpret_f64_f32(v219)); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v426, v372); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v426, v372); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v269, v341); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v522, v379); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v522, v379); + svfloat32_t v570 = svadd_f32_x(svptrue_b32(), v268, v340); + svst1_f64(pred_full, (double *)(v920), svreinterpret_f64_f32(v269)); + svst1_f64(pred_full, (double *)(v1028), svreinterpret_f64_f32(v268)); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v275, v347); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v474, v417); + svfloat32_t
v476 = svsub_f32_x(svptrue_b32(), v474, v417); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v276, v348); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v277, v349); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v570, v416); + svfloat32_t v572 = svsub_f32_x(svptrue_b32(), v570, v416); + svfloat32_t v594 = svadd_f32_x(svptrue_b32(), v274, v346); + svst1_f64(pred_full, (double *)(v875), svreinterpret_f64_f32(v428)); + svst1_f64(pred_full, (double *)(v884), svreinterpret_f64_f32(v427)); + svst1_f64(pred_full, (double *)(v893), svreinterpret_f64_f32(v275)); + svst1_f64(pred_full, (double *)(v947), svreinterpret_f64_f32(v276)); + svst1_f64(pred_full, (double *)(v983), svreinterpret_f64_f32(v524)); + svst1_f64(pred_full, (double *)(v992), svreinterpret_f64_f32(v523)); + svst1_f64(pred_full, (double *)(v1001), svreinterpret_f64_f32(v277)); + svst1_f64(pred_full, (double *)(v1055), svreinterpret_f64_f32(v274)); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v450, v423); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v450, v423); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v424); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v498, v424); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v546, v425); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v546, v425); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v422); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v594, v422); + svst1_f64(pred_full, (double *)(v929), svreinterpret_f64_f32(v476)); + svst1_f64(pred_full, (double *)(v938), svreinterpret_f64_f32(v475)); + svst1_f64(pred_full, (double *)(v1037), svreinterpret_f64_f32(v572)); + svst1_f64(pred_full, (double *)(v1046), svreinterpret_f64_f32(v571)); + svst1_f64(pred_full, (double *)(v902), svreinterpret_f64_f32(v452)); + svst1_f64(pred_full, (double *)(v911), svreinterpret_f64_f32(v451)); + svst1_f64(pred_full, (double *)(v956), svreinterpret_f64_f32(v500)); + svst1_f64(pred_full, (double *)(v965), svreinterpret_f64_f32(v499)); +
svst1_f64(pred_full, (double *)(v1010), svreinterpret_f64_f32(v548)); + svst1_f64(pred_full, (double *)(v1019), svreinterpret_f64_f32(v547)); + svst1_f64(pred_full, (double *)(v1064), svreinterpret_f64_f32(v596)); + svst1_f64(pred_full, (double *)(v1073), svreinterpret_f64_f32(v595)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun25(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v844 = 9.6858316112863108e-01F; + float v847 = -2.4868988716485479e-01F; + float v848 = 2.4868988716485479e-01F; + float v983 = 8.7630668004386358e-01F; + float v986 = -4.8175367410171532e-01F; + float v987 = 4.8175367410171532e-01F; + float v1122 = 7.2896862742141155e-01F; + float v1125 = -6.8454710592868862e-01F; + float v1126 = 6.8454710592868862e-01F; + float v1134 = 6.2790519529313527e-02F; + float v1137 = -9.9802672842827156e-01F; + float v1138 = 9.9802672842827156e-01F; + float v1261 = 5.3582679497899655e-01F; + float v1264 = -8.4432792550201508e-01F; + float v1265 = 8.4432792550201508e-01F; + float v1273 = -4.2577929156507272e-01F; + float v1276 = -9.0482705246601947e-01F; + float v1277 = 9.0482705246601947e-01F; + float v1285 = -6.3742398974868952e-01F; + float v1288 = 7.7051324277578936e-01F; + float v1289 = -7.7051324277578936e-01F; + float v1303 = -9.9211470131447776e-01F; + float v1306 = -1.2533323356430454e-01F; + float v1307 = 1.2533323356430454e-01F; + float v1323 = 2.5000000000000000e-01F; + float v1333 = 5.5901699437494745e-01F; + float v1343 = 6.1803398874989490e-01F; + float v1366 = 9.5105651629515353e-01F; + float v1367 = -9.5105651629515353e-01F; + float v1390 = 2.0000000000000000e+00F; + float32x2_t v13 = v5[0]; + float32x2_t v152 = v5[istride]; + float32x2_t v845 = (float32x2_t){v844, v844}; + float32x2_t v849 = (float32x2_t){v847, v848}; +
float32x2_t v984 = (float32x2_t){v983, v983}; + float32x2_t v988 = (float32x2_t){v986, v987}; + float32x2_t v1123 = (float32x2_t){v1122, v1122}; + float32x2_t v1127 = (float32x2_t){v1125, v1126}; + float32x2_t v1135 = (float32x2_t){v1134, v1134}; + float32x2_t v1139 = (float32x2_t){v1137, v1138}; + float32x2_t v1169 = (float32x2_t){v1289, v1288}; + float32x2_t v1262 = (float32x2_t){v1261, v1261}; + float32x2_t v1266 = (float32x2_t){v1264, v1265}; + float32x2_t v1274 = (float32x2_t){v1273, v1273}; + float32x2_t v1278 = (float32x2_t){v1276, v1277}; + float32x2_t v1286 = (float32x2_t){v1285, v1285}; + float32x2_t v1290 = (float32x2_t){v1288, v1289}; + float32x2_t v1304 = (float32x2_t){v1303, v1303}; + float32x2_t v1308 = (float32x2_t){v1306, v1307}; + float32x2_t v1324 = (float32x2_t){v1323, v1323}; + float32x2_t v1334 = (float32x2_t){v1333, v1333}; + float32x2_t v1344 = (float32x2_t){v1343, v1343}; + float32x2_t v1368 = (float32x2_t){v1366, v1367}; + float32x2_t v1369 = (float32x2_t){v4, v4}; + float32x2_t v1391 = (float32x2_t){v1390, v1390}; + float32x2_t v18 = v5[istride * 5]; + float32x2_t v23 = v5[istride * 10]; + float32x2_t v28 = v5[istride * 15]; + float32x2_t v33 = v5[istride * 20]; + float32x2_t v157 = v5[istride * 6]; + float32x2_t v162 = v5[istride * 11]; + float32x2_t v167 = v5[istride * 16]; + float32x2_t v172 = v5[istride * 21]; + float32x2_t v291 = v5[istride * 2]; + float32x2_t v296 = v5[istride * 7]; + float32x2_t v301 = v5[istride * 12]; + float32x2_t v306 = v5[istride * 17]; + float32x2_t v311 = v5[istride * 22]; + float32x2_t v430 = v5[istride * 3]; + float32x2_t v435 = v5[istride * 8]; + float32x2_t v440 = v5[istride * 13]; + float32x2_t v445 = v5[istride * 18]; + float32x2_t v450 = v5[istride * 23]; + float32x2_t v569 = v5[istride * 4]; + float32x2_t v574 = v5[istride * 9]; + float32x2_t v579 = v5[istride * 14]; + float32x2_t v584 = v5[istride * 19]; + float32x2_t v589 = v5[istride * 24]; + float32x2_t v851 = vmul_f32(v1369, v849); + float32x2_t 
v990 = vmul_f32(v1369, v988); + float32x2_t v1129 = vmul_f32(v1369, v1127); + float32x2_t v1141 = vmul_f32(v1369, v1139); + float32x2_t v1171 = vmul_f32(v1369, v1169); + float32x2_t v1268 = vmul_f32(v1369, v1266); + float32x2_t v1280 = vmul_f32(v1369, v1278); + float32x2_t v1292 = vmul_f32(v1369, v1290); + float32x2_t v1310 = vmul_f32(v1369, v1308); + float32x2_t v1370 = vmul_f32(v1369, v1368); + float32x2_t v70 = vsub_f32(v18, v33); + float32x2_t v74 = vmul_f32(v18, v1391); + float32x2_t v88 = vsub_f32(v23, v28); + float32x2_t v92 = vmul_f32(v23, v1391); + float32x2_t v209 = vsub_f32(v157, v172); + float32x2_t v213 = vmul_f32(v157, v1391); + float32x2_t v227 = vsub_f32(v162, v167); + float32x2_t v231 = vmul_f32(v162, v1391); + float32x2_t v348 = vsub_f32(v296, v311); + float32x2_t v352 = vmul_f32(v296, v1391); + float32x2_t v366 = vsub_f32(v301, v306); + float32x2_t v370 = vmul_f32(v301, v1391); + float32x2_t v487 = vsub_f32(v435, v450); + float32x2_t v491 = vmul_f32(v435, v1391); + float32x2_t v505 = vsub_f32(v440, v445); + float32x2_t v509 = vmul_f32(v440, v1391); + float32x2_t v626 = vsub_f32(v574, v589); + float32x2_t v630 = vmul_f32(v574, v1391); + float32x2_t v644 = vsub_f32(v579, v584); + float32x2_t v648 = vmul_f32(v579, v1391); + float32x2_t v75 = vsub_f32(v74, v70); + float32x2_t v93 = vsub_f32(v92, v88); + float32x2_t v104 = vmul_f32(v88, v1344); + float32x2_t v119 = vmul_f32(v70, v1344); + float32x2_t v214 = vsub_f32(v213, v209); + float32x2_t v232 = vsub_f32(v231, v227); + float32x2_t v243 = vmul_f32(v227, v1344); + float32x2_t v258 = vmul_f32(v209, v1344); + float32x2_t v353 = vsub_f32(v352, v348); + float32x2_t v371 = vsub_f32(v370, v366); + float32x2_t v382 = vmul_f32(v366, v1344); + float32x2_t v397 = vmul_f32(v348, v1344); + float32x2_t v492 = vsub_f32(v491, v487); + float32x2_t v510 = vsub_f32(v509, v505); + float32x2_t v521 = vmul_f32(v505, v1344); + float32x2_t v536 = vmul_f32(v487, v1344); + float32x2_t v631 = vsub_f32(v630, v626); + 
float32x2_t v649 = vsub_f32(v648, v644); + float32x2_t v660 = vmul_f32(v644, v1344); + float32x2_t v675 = vmul_f32(v626, v1344); + float32x2_t v94 = vadd_f32(v75, v93); + float32x2_t v95 = vsub_f32(v75, v93); + float32x2_t v105 = vadd_f32(v70, v104); + float32x2_t v120 = vsub_f32(v119, v88); + float32x2_t v233 = vadd_f32(v214, v232); + float32x2_t v234 = vsub_f32(v214, v232); + float32x2_t v244 = vadd_f32(v209, v243); + float32x2_t v259 = vsub_f32(v258, v227); + float32x2_t v372 = vadd_f32(v353, v371); + float32x2_t v373 = vsub_f32(v353, v371); + float32x2_t v383 = vadd_f32(v348, v382); + float32x2_t v398 = vsub_f32(v397, v366); + float32x2_t v511 = vadd_f32(v492, v510); + float32x2_t v512 = vsub_f32(v492, v510); + float32x2_t v522 = vadd_f32(v487, v521); + float32x2_t v537 = vsub_f32(v536, v505); + float32x2_t v650 = vadd_f32(v631, v649); + float32x2_t v651 = vsub_f32(v631, v649); + float32x2_t v661 = vadd_f32(v626, v660); + float32x2_t v676 = vsub_f32(v675, v644); + float32x2_t v99 = vmul_f32(v94, v1324); + float32x2_t v109 = vmul_f32(v95, v1334); + float32x2_t v121 = vadd_f32(v13, v94); + float32x2_t v127 = vrev64_f32(v105); + float32x2_t v135 = vrev64_f32(v120); + float32x2_t v238 = vmul_f32(v233, v1324); + float32x2_t v248 = vmul_f32(v234, v1334); + float32x2_t v260 = vadd_f32(v152, v233); + float32x2_t v266 = vrev64_f32(v244); + float32x2_t v274 = vrev64_f32(v259); + float32x2_t v377 = vmul_f32(v372, v1324); + float32x2_t v387 = vmul_f32(v373, v1334); + float32x2_t v399 = vadd_f32(v291, v372); + float32x2_t v405 = vrev64_f32(v383); + float32x2_t v413 = vrev64_f32(v398); + float32x2_t v516 = vmul_f32(v511, v1324); + float32x2_t v526 = vmul_f32(v512, v1334); + float32x2_t v538 = vadd_f32(v430, v511); + float32x2_t v544 = vrev64_f32(v522); + float32x2_t v552 = vrev64_f32(v537); + float32x2_t v655 = vmul_f32(v650, v1324); + float32x2_t v665 = vmul_f32(v651, v1334); + float32x2_t v677 = vadd_f32(v569, v650); + float32x2_t v683 = vrev64_f32(v661); + float32x2_t 
v691 = vrev64_f32(v676); + float32x2_t v100 = vsub_f32(v13, v99); + float32x2_t v128 = vmul_f32(v127, v1370); + float32x2_t v136 = vmul_f32(v135, v1370); + float32x2_t v239 = vsub_f32(v152, v238); + float32x2_t v267 = vmul_f32(v266, v1370); + float32x2_t v275 = vmul_f32(v274, v1370); + float32x2_t v378 = vsub_f32(v291, v377); + float32x2_t v406 = vmul_f32(v405, v1370); + float32x2_t v414 = vmul_f32(v413, v1370); + float32x2_t v517 = vsub_f32(v430, v516); + float32x2_t v545 = vmul_f32(v544, v1370); + float32x2_t v553 = vmul_f32(v552, v1370); + float32x2_t v656 = vsub_f32(v569, v655); + float32x2_t v684 = vmul_f32(v683, v1370); + float32x2_t v692 = vmul_f32(v691, v1370); + float32x2_t v740 = vsub_f32(v260, v677); + float32x2_t v744 = vmul_f32(v260, v1391); + float32x2_t v758 = vsub_f32(v399, v538); + float32x2_t v762 = vmul_f32(v399, v1391); + float32x2_t v110 = vsub_f32(v100, v109); + float32x2_t v114 = vmul_f32(v100, v1391); + float32x2_t v249 = vsub_f32(v239, v248); + float32x2_t v253 = vmul_f32(v239, v1391); + float32x2_t v388 = vsub_f32(v378, v387); + float32x2_t v392 = vmul_f32(v378, v1391); + float32x2_t v527 = vsub_f32(v517, v526); + float32x2_t v531 = vmul_f32(v517, v1391); + float32x2_t v666 = vsub_f32(v656, v665); + float32x2_t v670 = vmul_f32(v656, v1391); + float32x2_t v745 = vsub_f32(v744, v740); + float32x2_t v763 = vsub_f32(v762, v758); + float32x2_t v774 = vmul_f32(v758, v1344); + float32x2_t v789 = vmul_f32(v740, v1344); + float32x2_t v115 = vsub_f32(v114, v110); + float32x2_t v137 = vsub_f32(v110, v136); + float32x2_t v141 = vmul_f32(v110, v1391); + float32x2_t v254 = vsub_f32(v253, v249); + float32x2_t v276 = vsub_f32(v249, v275); + float32x2_t v280 = vmul_f32(v249, v1391); + float32x2_t v393 = vsub_f32(v392, v388); + float32x2_t v415 = vsub_f32(v388, v414); + float32x2_t v419 = vmul_f32(v388, v1391); + float32x2_t v532 = vsub_f32(v531, v527); + float32x2_t v554 = vsub_f32(v527, v553); + float32x2_t v558 = vmul_f32(v527, v1391); + float32x2_t v671 
= vsub_f32(v670, v666); + float32x2_t v693 = vsub_f32(v666, v692); + float32x2_t v697 = vmul_f32(v666, v1391); + float32x2_t v764 = vadd_f32(v745, v763); + float32x2_t v765 = vsub_f32(v745, v763); + float32x2_t v775 = vadd_f32(v740, v774); + float32x2_t v790 = vsub_f32(v789, v758); + float32x2_t v129 = vsub_f32(v115, v128); + float32x2_t v142 = vsub_f32(v141, v137); + float32x2_t v146 = vmul_f32(v115, v1391); + float32x2_t v268 = vsub_f32(v254, v267); + float32x2_t v281 = vsub_f32(v280, v276); + float32x2_t v285 = vmul_f32(v254, v1391); + float32x2_t v407 = vsub_f32(v393, v406); + float32x2_t v420 = vsub_f32(v419, v415); + float32x2_t v424 = vmul_f32(v393, v1391); + float32x2_t v546 = vsub_f32(v532, v545); + float32x2_t v559 = vsub_f32(v558, v554); + float32x2_t v563 = vmul_f32(v532, v1391); + float32x2_t v685 = vsub_f32(v671, v684); + float32x2_t v698 = vsub_f32(v697, v693); + float32x2_t v702 = vmul_f32(v671, v1391); + float32x2_t v769 = vmul_f32(v764, v1324); + float32x2_t v779 = vmul_f32(v765, v1334); + float32x2_t v791 = vadd_f32(v121, v764); + float32x2_t v802 = vrev64_f32(v775); + float32x2_t v815 = vrev64_f32(v790); + float32x2_t v991 = vrev64_f32(v276); + float32x2_t v1003 = vrev64_f32(v415); + float32x2_t v1015 = vrev64_f32(v693); + float32x2_t v1033 = vrev64_f32(v554); + float32x2_t v147 = vsub_f32(v146, v129); + float32x2_t v286 = vsub_f32(v285, v268); + float32x2_t v425 = vsub_f32(v424, v407); + float32x2_t v564 = vsub_f32(v563, v546); + float32x2_t v703 = vsub_f32(v702, v685); + float32x2_t v770 = vsub_f32(v121, v769); + v6[0] = v791; + float32x2_t v803 = vmul_f32(v802, v1370); + float32x2_t v816 = vmul_f32(v815, v1370); + float32x2_t v852 = vrev64_f32(v268); + float32x2_t v864 = vrev64_f32(v407); + float32x2_t v876 = vrev64_f32(v685); + float32x2_t v894 = vrev64_f32(v546); + float32x2_t v992 = vmul_f32(v991, v990); + float32x2_t v1004 = vmul_f32(v1003, v1268); + float32x2_t v1016 = vmul_f32(v1015, v1280); + float32x2_t v1034 = vmul_f32(v1033, v1141); 
+ float32x2_t v1130 = vrev64_f32(v281); + float32x2_t v1142 = vrev64_f32(v420); + float32x2_t v1154 = vrev64_f32(v698); + float32x2_t v1172 = vrev64_f32(v559); + float32x2_t v780 = vsub_f32(v770, v779); + float32x2_t v784 = vmul_f32(v770, v1391); + float32x2_t v853 = vmul_f32(v852, v851); + float32x2_t v865 = vmul_f32(v864, v990); + float32x2_t v877 = vmul_f32(v876, v1268); + float32x2_t v895 = vmul_f32(v894, v1129); + float32x2_t v993 = vfma_f32(v992, v276, v984); + float32x2_t v1005 = vfma_f32(v1004, v415, v1262); + float32x2_t v1017 = vfma_f32(v1016, v693, v1274); + float32x2_t v1035 = vfma_f32(v1034, v554, v1135); + float32x2_t v1131 = vmul_f32(v1130, v1129); + float32x2_t v1143 = vmul_f32(v1142, v1141); + float32x2_t v1155 = vmul_f32(v1154, v1310); + float32x2_t v1173 = vmul_f32(v1172, v1171); + float32x2_t v1269 = vrev64_f32(v286); + float32x2_t v1281 = vrev64_f32(v425); + float32x2_t v1293 = vrev64_f32(v703); + float32x2_t v1311 = vrev64_f32(v564); + float32x2_t v785 = vsub_f32(v784, v780); + float32x2_t v817 = vsub_f32(v780, v816); + float32x2_t v826 = vmul_f32(v780, v1391); + float32x2_t v854 = vfma_f32(v853, v268, v845); + float32x2_t v866 = vfma_f32(v865, v407, v984); + float32x2_t v878 = vfma_f32(v877, v685, v1262); + float32x2_t v896 = vfma_f32(v895, v546, v1123); + float32x2_t v1018 = vsub_f32(v993, v1017); + float32x2_t v1022 = vmul_f32(v993, v1391); + float32x2_t v1036 = vsub_f32(v1005, v1035); + float32x2_t v1040 = vmul_f32(v1005, v1391); + float32x2_t v1132 = vfma_f32(v1131, v281, v1123); + float32x2_t v1144 = vfma_f32(v1143, v420, v1135); + float32x2_t v1156 = vfma_f32(v1155, v698, v1304); + float32x2_t v1174 = vfma_f32(v1173, v559, v1286); + float32x2_t v1270 = vmul_f32(v1269, v1268); + float32x2_t v1282 = vmul_f32(v1281, v1280); + float32x2_t v1294 = vmul_f32(v1293, v1292); + float32x2_t v1312 = vmul_f32(v1311, v1310); + float32x2_t v804 = vsub_f32(v785, v803); + v6[ostride * 10] = v817; + float32x2_t v827 = vsub_f32(v826, v817); + float32x2_t 
v836 = vmul_f32(v785, v1391); + float32x2_t v879 = vsub_f32(v854, v878); + float32x2_t v883 = vmul_f32(v854, v1391); + float32x2_t v897 = vsub_f32(v866, v896); + float32x2_t v901 = vmul_f32(v866, v1391); + float32x2_t v1023 = vsub_f32(v1022, v1018); + float32x2_t v1041 = vsub_f32(v1040, v1036); + float32x2_t v1052 = vmul_f32(v1036, v1344); + float32x2_t v1067 = vmul_f32(v1018, v1344); + float32x2_t v1157 = vsub_f32(v1132, v1156); + float32x2_t v1161 = vmul_f32(v1132, v1391); + float32x2_t v1175 = vsub_f32(v1144, v1174); + float32x2_t v1179 = vmul_f32(v1144, v1391); + float32x2_t v1271 = vfma_f32(v1270, v286, v1262); + float32x2_t v1283 = vfma_f32(v1282, v425, v1274); + float32x2_t v1295 = vfma_f32(v1294, v703, v1286); + float32x2_t v1313 = vfma_f32(v1312, v564, v1304); + v6[ostride * 5] = v804; + v6[ostride * 15] = v827; + float32x2_t v837 = vsub_f32(v836, v804); + float32x2_t v884 = vsub_f32(v883, v879); + float32x2_t v902 = vsub_f32(v901, v897); + float32x2_t v913 = vmul_f32(v897, v1344); + float32x2_t v928 = vmul_f32(v879, v1344); + float32x2_t v1042 = vadd_f32(v1023, v1041); + float32x2_t v1043 = vsub_f32(v1023, v1041); + float32x2_t v1053 = vadd_f32(v1018, v1052); + float32x2_t v1068 = vsub_f32(v1067, v1036); + float32x2_t v1162 = vsub_f32(v1161, v1157); + float32x2_t v1180 = vsub_f32(v1179, v1175); + float32x2_t v1191 = vmul_f32(v1175, v1344); + float32x2_t v1206 = vmul_f32(v1157, v1344); + float32x2_t v1296 = vsub_f32(v1271, v1295); + float32x2_t v1300 = vmul_f32(v1271, v1391); + float32x2_t v1314 = vsub_f32(v1283, v1313); + float32x2_t v1318 = vmul_f32(v1283, v1391); + v6[ostride * 20] = v837; + float32x2_t v903 = vadd_f32(v884, v902); + float32x2_t v904 = vsub_f32(v884, v902); + float32x2_t v914 = vadd_f32(v879, v913); + float32x2_t v929 = vsub_f32(v928, v897); + float32x2_t v1047 = vmul_f32(v1042, v1324); + float32x2_t v1057 = vmul_f32(v1043, v1334); + float32x2_t v1069 = vadd_f32(v137, v1042); + float32x2_t v1080 = vrev64_f32(v1053); + float32x2_t v1093 
= vrev64_f32(v1068); + float32x2_t v1181 = vadd_f32(v1162, v1180); + float32x2_t v1182 = vsub_f32(v1162, v1180); + float32x2_t v1192 = vadd_f32(v1157, v1191); + float32x2_t v1207 = vsub_f32(v1206, v1175); + float32x2_t v1301 = vsub_f32(v1300, v1296); + float32x2_t v1319 = vsub_f32(v1318, v1314); + float32x2_t v1330 = vmul_f32(v1314, v1344); + float32x2_t v1345 = vmul_f32(v1296, v1344); + float32x2_t v908 = vmul_f32(v903, v1324); + float32x2_t v918 = vmul_f32(v904, v1334); + float32x2_t v930 = vadd_f32(v129, v903); + float32x2_t v941 = vrev64_f32(v914); + float32x2_t v954 = vrev64_f32(v929); + float32x2_t v1048 = vsub_f32(v137, v1047); + v6[ostride * 2] = v1069; + float32x2_t v1081 = vmul_f32(v1080, v1370); + float32x2_t v1094 = vmul_f32(v1093, v1370); + float32x2_t v1186 = vmul_f32(v1181, v1324); + float32x2_t v1196 = vmul_f32(v1182, v1334); + float32x2_t v1208 = vadd_f32(v142, v1181); + float32x2_t v1219 = vrev64_f32(v1192); + float32x2_t v1232 = vrev64_f32(v1207); + float32x2_t v1320 = vadd_f32(v1301, v1319); + float32x2_t v1321 = vsub_f32(v1301, v1319); + float32x2_t v1331 = vadd_f32(v1296, v1330); + float32x2_t v1346 = vsub_f32(v1345, v1314); + float32x2_t v909 = vsub_f32(v129, v908); + v6[ostride] = v930; + float32x2_t v942 = vmul_f32(v941, v1370); + float32x2_t v955 = vmul_f32(v954, v1370); + float32x2_t v1058 = vsub_f32(v1048, v1057); + float32x2_t v1062 = vmul_f32(v1048, v1391); + float32x2_t v1187 = vsub_f32(v142, v1186); + v6[ostride * 3] = v1208; + float32x2_t v1220 = vmul_f32(v1219, v1370); + float32x2_t v1233 = vmul_f32(v1232, v1370); + float32x2_t v1325 = vmul_f32(v1320, v1324); + float32x2_t v1335 = vmul_f32(v1321, v1334); + float32x2_t v1347 = vadd_f32(v147, v1320); + float32x2_t v1358 = vrev64_f32(v1331); + float32x2_t v1371 = vrev64_f32(v1346); + float32x2_t v919 = vsub_f32(v909, v918); + float32x2_t v923 = vmul_f32(v909, v1391); + float32x2_t v1063 = vsub_f32(v1062, v1058); + float32x2_t v1095 = vsub_f32(v1058, v1094); + float32x2_t v1104 = 
vmul_f32(v1058, v1391); + float32x2_t v1197 = vsub_f32(v1187, v1196); + float32x2_t v1201 = vmul_f32(v1187, v1391); + float32x2_t v1326 = vsub_f32(v147, v1325); + v6[ostride * 4] = v1347; + float32x2_t v1359 = vmul_f32(v1358, v1370); + float32x2_t v1372 = vmul_f32(v1371, v1370); + float32x2_t v924 = vsub_f32(v923, v919); + float32x2_t v956 = vsub_f32(v919, v955); + float32x2_t v965 = vmul_f32(v919, v1391); + float32x2_t v1082 = vsub_f32(v1063, v1081); + v6[ostride * 12] = v1095; + float32x2_t v1105 = vsub_f32(v1104, v1095); + float32x2_t v1114 = vmul_f32(v1063, v1391); + float32x2_t v1202 = vsub_f32(v1201, v1197); + float32x2_t v1234 = vsub_f32(v1197, v1233); + float32x2_t v1243 = vmul_f32(v1197, v1391); + float32x2_t v1336 = vsub_f32(v1326, v1335); + float32x2_t v1340 = vmul_f32(v1326, v1391); + float32x2_t v943 = vsub_f32(v924, v942); + v6[ostride * 11] = v956; + float32x2_t v966 = vsub_f32(v965, v956); + float32x2_t v975 = vmul_f32(v924, v1391); + v6[ostride * 7] = v1082; + v6[ostride * 17] = v1105; + float32x2_t v1115 = vsub_f32(v1114, v1082); + float32x2_t v1221 = vsub_f32(v1202, v1220); + v6[ostride * 13] = v1234; + float32x2_t v1244 = vsub_f32(v1243, v1234); + float32x2_t v1253 = vmul_f32(v1202, v1391); + float32x2_t v1341 = vsub_f32(v1340, v1336); + float32x2_t v1373 = vsub_f32(v1336, v1372); + float32x2_t v1382 = vmul_f32(v1336, v1391); + v6[ostride * 6] = v943; + v6[ostride * 16] = v966; + float32x2_t v976 = vsub_f32(v975, v943); + v6[ostride * 22] = v1115; + v6[ostride * 8] = v1221; + v6[ostride * 18] = v1244; + float32x2_t v1254 = vsub_f32(v1253, v1221); + float32x2_t v1360 = vsub_f32(v1341, v1359); + v6[ostride * 14] = v1373; + float32x2_t v1383 = vsub_f32(v1382, v1373); + float32x2_t v1392 = vmul_f32(v1341, v1391); + v6[ostride * 21] = v976; + v6[ostride * 23] = v1254; + v6[ostride * 9] = v1360; + v6[ostride * 19] = v1383; + float32x2_t v1393 = vsub_f32(v1392, v1360); + v6[ostride * 24] = v1393; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void 
armral_fft_cf32_cf32_cf32_ac_n_uun25(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v987 = 9.6858316112863108e-01F; + float v992 = 2.4868988716485479e-01F; + float v1149 = 8.7630668004386358e-01F; + float v1154 = 4.8175367410171532e-01F; + float v1311 = 7.2896862742141155e-01F; + float v1316 = 6.8454710592868862e-01F; + float v1324 = 6.2790519529313527e-02F; + float v1329 = 9.9802672842827156e-01F; + float v1362 = 7.7051324277578925e-01F; + float v1473 = 5.3582679497899655e-01F; + float v1478 = 8.4432792550201508e-01F; + float v1486 = -4.2577929156507272e-01F; + float v1491 = 9.0482705246601947e-01F; + float v1499 = -6.3742398974868952e-01F; + float v1504 = -7.7051324277578936e-01F; + float v1519 = -9.9211470131447776e-01F; + float v1524 = 1.2533323356430454e-01F; + float v1541 = 2.5000000000000000e-01F; + float v1553 = 5.5901699437494745e-01F; + float v1565 = 6.1803398874989490e-01F; + float v1594 = -9.5105651629515353e-01F; + float v1622 = 2.0000000000000000e+00F; + const float32x2_t *v1705 = &v5[v0]; + float32x2_t *v2041 = &v6[v2]; + int64_t v22 = v0 * 5; + int64_t v29 = v0 * 10; + int64_t v36 = v0 * 15; + int64_t v43 = v0 * 20; + int64_t v184 = v0 * 6; + int64_t v191 = v0 * 11; + int64_t v198 = v0 * 16; + int64_t v205 = v0 * 21; + int64_t v339 = v0 * 2; + int64_t v346 = v0 * 7; + int64_t v353 = v0 * 12; + int64_t v360 = v0 * 17; + int64_t v367 = v0 * 22; + int64_t v501 = v0 * 3; + int64_t v508 = v0 * 8; + int64_t v515 = v0 * 13; + int64_t v522 = v0 * 18; + int64_t v529 = v0 * 23; + int64_t v663 = v0 * 4; + int64_t v670 = v0 * 9; + int64_t v677 = v0 * 14; + int64_t v684 = v0 * 19; + int64_t v691 = v0 * 24; + int64_t v939 = v2 * 5; + int64_t v954 = v2 * 10; + int64_t v967 = v2 
* 15; + int64_t v980 = v2 * 20; + float v995 = v4 * v992; + int64_t v1101 = v2 * 6; + int64_t v1116 = v2 * 11; + int64_t v1129 = v2 * 16; + int64_t v1142 = v2 * 21; + float v1157 = v4 * v1154; + int64_t v1248 = v2 * 2; + int64_t v1263 = v2 * 7; + int64_t v1278 = v2 * 12; + int64_t v1291 = v2 * 17; + int64_t v1304 = v2 * 22; + float v1319 = v4 * v1316; + float v1332 = v4 * v1329; + float v1365 = v4 * v1362; + int64_t v1410 = v2 * 3; + int64_t v1425 = v2 * 8; + int64_t v1440 = v2 * 13; + int64_t v1453 = v2 * 18; + int64_t v1466 = v2 * 23; + float v1481 = v4 * v1478; + float v1494 = v4 * v1491; + float v1507 = v4 * v1504; + float v1527 = v4 * v1524; + int64_t v1572 = v2 * 4; + int64_t v1587 = v2 * 9; + float v1597 = v4 * v1594; + int64_t v1602 = v2 * 14; + int64_t v1615 = v2 * 19; + int64_t v1628 = v2 * 24; + const float32x2_t *v1641 = &v5[0]; + svfloat32_t v1963 = svdup_n_f32(0); + float32x2_t *v1977 = &v6[0]; + svfloat32_t v2020 = svdup_n_f32(v987); + svfloat32_t v2084 = svdup_n_f32(v1149); + svfloat32_t v2148 = svdup_n_f32(v1311); + svfloat32_t v2150 = svdup_n_f32(v1324); + svfloat32_t v2212 = svdup_n_f32(v1473); + svfloat32_t v2214 = svdup_n_f32(v1486); + svfloat32_t v2216 = svdup_n_f32(v1499); + svfloat32_t v2219 = svdup_n_f32(v1519); + svfloat32_t v2222 = svdup_n_f32(v1541); + svfloat32_t v2224 = svdup_n_f32(v1553); + svfloat32_t v2226 = svdup_n_f32(v1565); + svfloat32_t v2266 = svdup_n_f32(v1622); + svfloat32_t v2311 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1705)[0])); + const float32x2_t *v1650 = &v5[v22]; + const float32x2_t *v1659 = &v5[v29]; + const float32x2_t *v1668 = &v5[v36]; + const float32x2_t *v1677 = &v5[v43]; + const float32x2_t *v1714 = &v5[v184]; + const float32x2_t *v1723 = &v5[v191]; + const float32x2_t *v1732 = &v5[v198]; + const float32x2_t *v1741 = &v5[v205]; + const float32x2_t *v1769 = &v5[v339]; + const float32x2_t *v1778 = &v5[v346]; + const float32x2_t *v1787 = &v5[v353]; + const float32x2_t *v1796 = &v5[v360]; 
+ const float32x2_t *v1805 = &v5[v367]; + const float32x2_t *v1833 = &v5[v501]; + const float32x2_t *v1842 = &v5[v508]; + const float32x2_t *v1851 = &v5[v515]; + const float32x2_t *v1860 = &v5[v522]; + const float32x2_t *v1869 = &v5[v529]; + const float32x2_t *v1897 = &v5[v663]; + const float32x2_t *v1906 = &v5[v670]; + const float32x2_t *v1915 = &v5[v677]; + const float32x2_t *v1924 = &v5[v684]; + const float32x2_t *v1933 = &v5[v691]; + float32x2_t *v1987 = &v6[v939]; + float32x2_t *v1997 = &v6[v954]; + float32x2_t *v2007 = &v6[v967]; + float32x2_t *v2017 = &v6[v980]; + svfloat32_t v2021 = svdup_n_f32(v995); + float32x2_t *v2051 = &v6[v1101]; + float32x2_t *v2061 = &v6[v1116]; + float32x2_t *v2071 = &v6[v1129]; + float32x2_t *v2081 = &v6[v1142]; + svfloat32_t v2085 = svdup_n_f32(v1157); + float32x2_t *v2105 = &v6[v1248]; + float32x2_t *v2115 = &v6[v1263]; + float32x2_t *v2125 = &v6[v1278]; + float32x2_t *v2135 = &v6[v1291]; + float32x2_t *v2145 = &v6[v1304]; + svfloat32_t v2149 = svdup_n_f32(v1319); + svfloat32_t v2151 = svdup_n_f32(v1332); + svfloat32_t v2156 = svdup_n_f32(v1365); + float32x2_t *v2169 = &v6[v1410]; + float32x2_t *v2179 = &v6[v1425]; + float32x2_t *v2189 = &v6[v1440]; + float32x2_t *v2199 = &v6[v1453]; + float32x2_t *v2209 = &v6[v1466]; + svfloat32_t v2213 = svdup_n_f32(v1481); + svfloat32_t v2215 = svdup_n_f32(v1494); + svfloat32_t v2217 = svdup_n_f32(v1507); + svfloat32_t v2220 = svdup_n_f32(v1527); + float32x2_t *v2233 = &v6[v1572]; + float32x2_t *v2243 = &v6[v1587]; + svfloat32_t v2246 = svdup_n_f32(v1597); + float32x2_t *v2253 = &v6[v1602]; + float32x2_t *v2263 = &v6[v1615]; + float32x2_t *v2273 = &v6[v1628]; + svfloat32_t v2301 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1641)[0])); + svfloat32_t v2303 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1650)[0])); + svfloat32_t v2305 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1659)[0])); + svfloat32_t v2307 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1668)[0])); + svfloat32_t v2309 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1677)[0])); + svfloat32_t v2313 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1714)[0])); + svfloat32_t v2315 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1723)[0])); + svfloat32_t v2317 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1732)[0])); + svfloat32_t v2319 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1741)[0])); + svfloat32_t v2321 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1769)[0])); + svfloat32_t v2323 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1778)[0])); + svfloat32_t v2325 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1787)[0])); + svfloat32_t v2327 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1796)[0])); + svfloat32_t v2329 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1805)[0])); + svfloat32_t v2331 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1833)[0])); + svfloat32_t v2333 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1842)[0])); + svfloat32_t v2335 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1851)[0])); + svfloat32_t v2337 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1860)[0])); + svfloat32_t v2339 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1869)[0])); + svfloat32_t v2341 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1897)[0])); + svfloat32_t v2343 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1906)[0])); + svfloat32_t v2345 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1915)[0])); + svfloat32_t v2347 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1924)[0])); + svfloat32_t v2349 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1933)[0])); + svfloat32_t v61 = svcmla_f32_x(pred_full, v2303, v1963, v2303, 90); + svfloat32_t v74 = svcmla_f32_x(pred_full, v2305, v1963, v2305, 90); + svfloat32_t v87 = svcmla_f32_x(pred_full, v2309, v1963, v2309, 90); + svfloat32_t v107 = svcmla_f32_x(pred_full, v2307, v1963, v2307, 90); + svfloat32_t v223 = svcmla_f32_x(pred_full, v2313, v1963, v2313, 90); + svfloat32_t v236 = svcmla_f32_x(pred_full, v2315, v1963, v2315, 90); + svfloat32_t v249 = svcmla_f32_x(pred_full, v2319, v1963, v2319, 90); + svfloat32_t v269 = svcmla_f32_x(pred_full, v2317, v1963, v2317, 90); + svfloat32_t v385 = svcmla_f32_x(pred_full, v2323, v1963, v2323, 90); + svfloat32_t v398 = svcmla_f32_x(pred_full, v2325, v1963, v2325, 90); + svfloat32_t v411 = svcmla_f32_x(pred_full, v2329, v1963, v2329, 90); + svfloat32_t v431 = svcmla_f32_x(pred_full, v2327, v1963, v2327, 90); + svfloat32_t v547 = svcmla_f32_x(pred_full, v2333, v1963, v2333, 90); + svfloat32_t v560 = svcmla_f32_x(pred_full, v2335, v1963, v2335, 90); + svfloat32_t v573 = svcmla_f32_x(pred_full, v2339, v1963, v2339, 90); + svfloat32_t v593 = svcmla_f32_x(pred_full, v2337, v1963, v2337, 90); + svfloat32_t v709 = svcmla_f32_x(pred_full, v2343, v1963, v2343, 90); + svfloat32_t v722 = svcmla_f32_x(pred_full, v2345, v1963, v2345, 90); + svfloat32_t v735 = svcmla_f32_x(pred_full, v2349, v1963, v2349, 90); + svfloat32_t v755 = svcmla_f32_x(pred_full, v2347, v1963, v2347, 90); + svfloat32_t v88 = svsub_f32_x(svptrue_b32(), v61, v87); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v74, v107); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v223, v249); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v236, v269); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v385, v411); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v398, v431); + svfloat32_t v574 = svsub_f32_x(svptrue_b32(), v547, v573); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v560, v593); + svfloat32_t v736 = 
svsub_f32_x(svptrue_b32(), v709, v735); + svfloat32_t v756 = svsub_f32_x(svptrue_b32(), v722, v755); + svfloat32_t v94 = svnmls_f32_x(pred_full, v88, v61, v2266); + svfloat32_t v114 = svnmls_f32_x(pred_full, v108, v74, v2266); + svfloat32_t v256 = svnmls_f32_x(pred_full, v250, v223, v2266); + svfloat32_t v276 = svnmls_f32_x(pred_full, v270, v236, v2266); + svfloat32_t v418 = svnmls_f32_x(pred_full, v412, v385, v2266); + svfloat32_t v438 = svnmls_f32_x(pred_full, v432, v398, v2266); + svfloat32_t v580 = svnmls_f32_x(pred_full, v574, v547, v2266); + svfloat32_t v600 = svnmls_f32_x(pred_full, v594, v560, v2266); + svfloat32_t v742 = svnmls_f32_x(pred_full, v736, v709, v2266); + svfloat32_t v762 = svnmls_f32_x(pred_full, v756, v722, v2266); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v94, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v94, v114); + svfloat32_t v128 = svmla_f32_x(pred_full, v88, v108, v2226); + svfloat32_t v146 = svnmls_f32_x(pred_full, v108, v88, v2226); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v256, v276); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v256, v276); + svfloat32_t v290 = svmla_f32_x(pred_full, v250, v270, v2226); + svfloat32_t v308 = svnmls_f32_x(pred_full, v270, v250, v2226); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v418, v438); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v418, v438); + svfloat32_t v452 = svmla_f32_x(pred_full, v412, v432, v2226); + svfloat32_t v470 = svnmls_f32_x(pred_full, v432, v412, v2226); + svfloat32_t v601 = svadd_f32_x(svptrue_b32(), v580, v600); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v580, v600); + svfloat32_t v614 = svmla_f32_x(pred_full, v574, v594, v2226); + svfloat32_t v632 = svnmls_f32_x(pred_full, v594, v574, v2226); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v742, v762); + svfloat32_t v764 = svsub_f32_x(svptrue_b32(), v742, v762); + svfloat32_t v776 = svmla_f32_x(pred_full, v736, v756, v2226); + svfloat32_t v794 = svnmls_f32_x(pred_full, v756, v736, v2226); + 
svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v2301, v115); + svfloat32_t zero154 = svdup_n_f32(0); + svfloat32_t v154 = svcmla_f32_x(pred_full, zero154, v2246, v128, 90); + svfloat32_t zero162 = svdup_n_f32(0); + svfloat32_t v162 = svcmla_f32_x(pred_full, zero162, v2246, v146, 90); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v2311, v277); + svfloat32_t zero316 = svdup_n_f32(0); + svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v2246, v290, 90); + svfloat32_t zero324 = svdup_n_f32(0); + svfloat32_t v324 = svcmla_f32_x(pred_full, zero324, v2246, v308, 90); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v2321, v439); + svfloat32_t zero478 = svdup_n_f32(0); + svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v2246, v452, 90); + svfloat32_t zero486 = svdup_n_f32(0); + svfloat32_t v486 = svcmla_f32_x(pred_full, zero486, v2246, v470, 90); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v2331, v601); + svfloat32_t zero640 = svdup_n_f32(0); + svfloat32_t v640 = svcmla_f32_x(pred_full, zero640, v2246, v614, 90); + svfloat32_t zero648 = svdup_n_f32(0); + svfloat32_t v648 = svcmla_f32_x(pred_full, zero648, v2246, v632, 90); + svfloat32_t v795 = svadd_f32_x(svptrue_b32(), v2341, v763); + svfloat32_t zero802 = svdup_n_f32(0); + svfloat32_t v802 = svcmla_f32_x(pred_full, zero802, v2246, v776, 90); + svfloat32_t zero810 = svdup_n_f32(0); + svfloat32_t v810 = svcmla_f32_x(pred_full, zero810, v2246, v794, 90); + svfloat32_t v122 = svmls_f32_x(pred_full, v2301, v115, v2222); + svfloat32_t v284 = svmls_f32_x(pred_full, v2311, v277, v2222); + svfloat32_t v446 = svmls_f32_x(pred_full, v2321, v439, v2222); + svfloat32_t v608 = svmls_f32_x(pred_full, v2331, v601, v2222); + svfloat32_t v770 = svmls_f32_x(pred_full, v2341, v763, v2222); + svfloat32_t v134 = svmls_f32_x(pred_full, v122, v116, v2224); + svfloat32_t v296 = svmls_f32_x(pred_full, v284, v278, v2224); + svfloat32_t v458 = svmls_f32_x(pred_full, v446, v440, v2224); + svfloat32_t v620 = svmls_f32_x(pred_full, v608, 
v602, v2224); + svfloat32_t v782 = svmls_f32_x(pred_full, v770, v764, v2224); + svfloat32_t v836 = svcmla_f32_x(pred_full, v309, v1963, v309, 90); + svfloat32_t v849 = svcmla_f32_x(pred_full, v471, v1963, v471, 90); + svfloat32_t v862 = svcmla_f32_x(pred_full, v795, v1963, v795, 90); + svfloat32_t v882 = svcmla_f32_x(pred_full, v633, v1963, v633, 90); + svfloat32_t v140 = svnmls_f32_x(pred_full, v134, v122, v2266); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v134, v162); + svfloat32_t v302 = svnmls_f32_x(pred_full, v296, v284, v2266); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v296, v324); + svfloat32_t v464 = svnmls_f32_x(pred_full, v458, v446, v2266); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v458, v486); + svfloat32_t v626 = svnmls_f32_x(pred_full, v620, v608, v2266); + svfloat32_t v649 = svsub_f32_x(svptrue_b32(), v620, v648); + svfloat32_t v788 = svnmls_f32_x(pred_full, v782, v770, v2266); + svfloat32_t v811 = svsub_f32_x(svptrue_b32(), v782, v810); + svfloat32_t v863 = svsub_f32_x(svptrue_b32(), v836, v862); + svfloat32_t v883 = svsub_f32_x(svptrue_b32(), v849, v882); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v140, v154); + svfloat32_t v169 = svnmls_f32_x(pred_full, v163, v134, v2266); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v302, v316); + svfloat32_t v331 = svnmls_f32_x(pred_full, v325, v296, v2266); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v464, v478); + svfloat32_t v493 = svnmls_f32_x(pred_full, v487, v458, v2266); + svfloat32_t v641 = svsub_f32_x(svptrue_b32(), v626, v640); + svfloat32_t v655 = svnmls_f32_x(pred_full, v649, v620, v2266); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v788, v802); + svfloat32_t v817 = svnmls_f32_x(pred_full, v811, v782, v2266); + svfloat32_t v869 = svnmls_f32_x(pred_full, v863, v836, v2266); + svfloat32_t v889 = svnmls_f32_x(pred_full, v883, v849, v2266); + svfloat32_t v1152 = svmul_f32_x(svptrue_b32(), v325, v2084); + svfloat32_t v1165 = svmul_f32_x(svptrue_b32(), v487, v2212); + 
svfloat32_t v1178 = svmul_f32_x(svptrue_b32(), v811, v2214); + svfloat32_t v1198 = svmul_f32_x(svptrue_b32(), v649, v2150); + svfloat32_t v175 = svnmls_f32_x(pred_full, v155, v140, v2266); + svfloat32_t v337 = svnmls_f32_x(pred_full, v317, v302, v2266); + svfloat32_t v499 = svnmls_f32_x(pred_full, v479, v464, v2266); + svfloat32_t v661 = svnmls_f32_x(pred_full, v641, v626, v2266); + svfloat32_t v823 = svnmls_f32_x(pred_full, v803, v788, v2266); + svfloat32_t v890 = svadd_f32_x(svptrue_b32(), v869, v889); + svfloat32_t v891 = svsub_f32_x(svptrue_b32(), v869, v889); + svfloat32_t v903 = svmla_f32_x(pred_full, v863, v883, v2226); + svfloat32_t v921 = svnmls_f32_x(pred_full, v883, v863, v2226); + svfloat32_t v990 = svmul_f32_x(svptrue_b32(), v317, v2020); + svfloat32_t v1003 = svmul_f32_x(svptrue_b32(), v479, v2084); + svfloat32_t v1016 = svmul_f32_x(svptrue_b32(), v803, v2212); + svfloat32_t v1036 = svmul_f32_x(svptrue_b32(), v641, v2148); + svfloat32_t v1160 = svcmla_f32_x(pred_full, v1152, v2085, v325, 90); + svfloat32_t v1173 = svcmla_f32_x(pred_full, v1165, v2213, v487, 90); + svfloat32_t v1186 = svcmla_f32_x(pred_full, v1178, v2215, v811, 90); + svfloat32_t v1206 = svcmla_f32_x(pred_full, v1198, v2151, v649, 90); + svfloat32_t v1314 = svmul_f32_x(svptrue_b32(), v331, v2148); + svfloat32_t v1327 = svmul_f32_x(svptrue_b32(), v493, v2150); + svfloat32_t v1340 = svmul_f32_x(svptrue_b32(), v817, v2219); + svfloat32_t v1360 = svmul_f32_x(svptrue_b32(), v655, v2216); + svfloat32_t v922 = svadd_f32_x(svptrue_b32(), v147, v890); + svfloat32_t zero936 = svdup_n_f32(0); + svfloat32_t v936 = svcmla_f32_x(pred_full, zero936, v2246, v903, 90); + svfloat32_t zero951 = svdup_n_f32(0); + svfloat32_t v951 = svcmla_f32_x(pred_full, zero951, v2246, v921, 90); + svfloat32_t v998 = svcmla_f32_x(pred_full, v990, v2021, v317, 90); + svfloat32_t v1011 = svcmla_f32_x(pred_full, v1003, v2085, v479, 90); + svfloat32_t v1024 = svcmla_f32_x(pred_full, v1016, v2213, v803, 90); + svfloat32_t 
v1044 = svcmla_f32_x(pred_full, v1036, v2149, v641, 90); + svfloat32_t v1187 = svsub_f32_x(svptrue_b32(), v1160, v1186); + svfloat32_t v1207 = svsub_f32_x(svptrue_b32(), v1173, v1206); + svfloat32_t v1322 = svcmla_f32_x(pred_full, v1314, v2149, v331, 90); + svfloat32_t v1335 = svcmla_f32_x(pred_full, v1327, v2151, v493, 90); + svfloat32_t v1348 = svcmla_f32_x(pred_full, v1340, v2220, v817, 90); + svfloat32_t v1368 = svcmla_f32_x(pred_full, v1360, v2156, v655, 90); + svfloat32_t v1476 = svmul_f32_x(svptrue_b32(), v337, v2212); + svfloat32_t v1489 = svmul_f32_x(svptrue_b32(), v499, v2214); + svfloat32_t v1502 = svmul_f32_x(svptrue_b32(), v823, v2216); + svfloat32_t v1522 = svmul_f32_x(svptrue_b32(), v661, v2219); + svfloat32_t v897 = svmls_f32_x(pred_full, v147, v890, v2222); + svfloat32_t v1025 = svsub_f32_x(svptrue_b32(), v998, v1024); + svfloat32_t v1045 = svsub_f32_x(svptrue_b32(), v1011, v1044); + svfloat32_t v1193 = svnmls_f32_x(pred_full, v1187, v1160, v2266); + svfloat32_t v1213 = svnmls_f32_x(pred_full, v1207, v1173, v2266); + svfloat32_t v1349 = svsub_f32_x(svptrue_b32(), v1322, v1348); + svfloat32_t v1369 = svsub_f32_x(svptrue_b32(), v1335, v1368); + svfloat32_t v1484 = svcmla_f32_x(pred_full, v1476, v2213, v337, 90); + svfloat32_t v1497 = svcmla_f32_x(pred_full, v1489, v2215, v499, 90); + svfloat32_t v1510 = svcmla_f32_x(pred_full, v1502, v2217, v823, 90); + svfloat32_t v1530 = svcmla_f32_x(pred_full, v1522, v2220, v661, 90); + svst1_f64(pred_full, (double *)(v1977), svreinterpret_f64_f32(v922)); + svfloat32_t v909 = svmls_f32_x(pred_full, v897, v891, v2224); + svfloat32_t v1031 = svnmls_f32_x(pred_full, v1025, v998, v2266); + svfloat32_t v1051 = svnmls_f32_x(pred_full, v1045, v1011, v2266); + svfloat32_t v1214 = svadd_f32_x(svptrue_b32(), v1193, v1213); + svfloat32_t v1215 = svsub_f32_x(svptrue_b32(), v1193, v1213); + svfloat32_t v1227 = svmla_f32_x(pred_full, v1187, v1207, v2226); + svfloat32_t v1245 = svnmls_f32_x(pred_full, v1207, v1187, v2226); + 
svfloat32_t v1355 = svnmls_f32_x(pred_full, v1349, v1322, v2266); + svfloat32_t v1375 = svnmls_f32_x(pred_full, v1369, v1335, v2266); + svfloat32_t v1511 = svsub_f32_x(svptrue_b32(), v1484, v1510); + svfloat32_t v1531 = svsub_f32_x(svptrue_b32(), v1497, v1530); + svfloat32_t v915 = svnmls_f32_x(pred_full, v909, v897, v2266); + svfloat32_t v952 = svsub_f32_x(svptrue_b32(), v909, v951); + svfloat32_t v1052 = svadd_f32_x(svptrue_b32(), v1031, v1051); + svfloat32_t v1053 = svsub_f32_x(svptrue_b32(), v1031, v1051); + svfloat32_t v1065 = svmla_f32_x(pred_full, v1025, v1045, v2226); + svfloat32_t v1083 = svnmls_f32_x(pred_full, v1045, v1025, v2226); + svfloat32_t v1246 = svadd_f32_x(svptrue_b32(), v163, v1214); + svfloat32_t zero1260 = svdup_n_f32(0); + svfloat32_t v1260 = svcmla_f32_x(pred_full, zero1260, v2246, v1227, 90); + svfloat32_t zero1275 = svdup_n_f32(0); + svfloat32_t v1275 = svcmla_f32_x(pred_full, zero1275, v2246, v1245, 90); + svfloat32_t v1376 = svadd_f32_x(svptrue_b32(), v1355, v1375); + svfloat32_t v1377 = svsub_f32_x(svptrue_b32(), v1355, v1375); + svfloat32_t v1389 = svmla_f32_x(pred_full, v1349, v1369, v2226); + svfloat32_t v1407 = svnmls_f32_x(pred_full, v1369, v1349, v2226); + svfloat32_t v1517 = svnmls_f32_x(pred_full, v1511, v1484, v2266); + svfloat32_t v1537 = svnmls_f32_x(pred_full, v1531, v1497, v2266); + svfloat32_t v937 = svsub_f32_x(svptrue_b32(), v915, v936); + svfloat32_t v965 = svnmls_f32_x(pred_full, v952, v909, v2266); + svfloat32_t v1084 = svadd_f32_x(svptrue_b32(), v155, v1052); + svfloat32_t zero1098 = svdup_n_f32(0); + svfloat32_t v1098 = svcmla_f32_x(pred_full, zero1098, v2246, v1065, 90); + svfloat32_t zero1113 = svdup_n_f32(0); + svfloat32_t v1113 = svcmla_f32_x(pred_full, zero1113, v2246, v1083, 90); + svfloat32_t v1221 = svmls_f32_x(pred_full, v163, v1214, v2222); + svfloat32_t v1408 = svadd_f32_x(svptrue_b32(), v169, v1376); + svfloat32_t zero1422 = svdup_n_f32(0); + svfloat32_t v1422 = svcmla_f32_x(pred_full, zero1422, v2246, 
v1389, 90); + svfloat32_t zero1437 = svdup_n_f32(0); + svfloat32_t v1437 = svcmla_f32_x(pred_full, zero1437, v2246, v1407, 90); + svfloat32_t v1538 = svadd_f32_x(svptrue_b32(), v1517, v1537); + svfloat32_t v1539 = svsub_f32_x(svptrue_b32(), v1517, v1537); + svfloat32_t v1551 = svmla_f32_x(pred_full, v1511, v1531, v2226); + svfloat32_t v1569 = svnmls_f32_x(pred_full, v1531, v1511, v2226); + svst1_f64(pred_full, (double *)(v1997), svreinterpret_f64_f32(v952)); + svst1_f64(pred_full, (double *)(v2105), svreinterpret_f64_f32(v1246)); + svfloat32_t v978 = svnmls_f32_x(pred_full, v937, v915, v2266); + svfloat32_t v1059 = svmls_f32_x(pred_full, v155, v1052, v2222); + svfloat32_t v1233 = svmls_f32_x(pred_full, v1221, v1215, v2224); + svfloat32_t v1383 = svmls_f32_x(pred_full, v169, v1376, v2222); + svfloat32_t v1570 = svadd_f32_x(svptrue_b32(), v175, v1538); + svfloat32_t zero1584 = svdup_n_f32(0); + svfloat32_t v1584 = svcmla_f32_x(pred_full, zero1584, v2246, v1551, 90); + svfloat32_t zero1599 = svdup_n_f32(0); + svfloat32_t v1599 = svcmla_f32_x(pred_full, zero1599, v2246, v1569, 90); + svst1_f64(pred_full, (double *)(v1987), svreinterpret_f64_f32(v937)); + svst1_f64(pred_full, (double *)(v2007), svreinterpret_f64_f32(v965)); + svst1_f64(pred_full, (double *)(v2041), svreinterpret_f64_f32(v1084)); + svst1_f64(pred_full, (double *)(v2169), svreinterpret_f64_f32(v1408)); + svfloat32_t v1071 = svmls_f32_x(pred_full, v1059, v1053, v2224); + svfloat32_t v1239 = svnmls_f32_x(pred_full, v1233, v1221, v2266); + svfloat32_t v1276 = svsub_f32_x(svptrue_b32(), v1233, v1275); + svfloat32_t v1395 = svmls_f32_x(pred_full, v1383, v1377, v2224); + svfloat32_t v1545 = svmls_f32_x(pred_full, v175, v1538, v2222); + svst1_f64(pred_full, (double *)(v2017), svreinterpret_f64_f32(v978)); + svst1_f64(pred_full, (double *)(v2233), svreinterpret_f64_f32(v1570)); + svfloat32_t v1077 = svnmls_f32_x(pred_full, v1071, v1059, v2266); + svfloat32_t v1114 = svsub_f32_x(svptrue_b32(), v1071, v1113); + 
svfloat32_t v1261 = svsub_f32_x(svptrue_b32(), v1239, v1260); + svfloat32_t v1289 = svnmls_f32_x(pred_full, v1276, v1233, v2266); + svfloat32_t v1401 = svnmls_f32_x(pred_full, v1395, v1383, v2266); + svfloat32_t v1438 = svsub_f32_x(svptrue_b32(), v1395, v1437); + svfloat32_t v1557 = svmls_f32_x(pred_full, v1545, v1539, v2224); + svst1_f64(pred_full, (double *)(v2125), svreinterpret_f64_f32(v1276)); + svfloat32_t v1099 = svsub_f32_x(svptrue_b32(), v1077, v1098); + svfloat32_t v1127 = svnmls_f32_x(pred_full, v1114, v1071, v2266); + svfloat32_t v1302 = svnmls_f32_x(pred_full, v1261, v1239, v2266); + svfloat32_t v1423 = svsub_f32_x(svptrue_b32(), v1401, v1422); + svfloat32_t v1451 = svnmls_f32_x(pred_full, v1438, v1395, v2266); + svfloat32_t v1563 = svnmls_f32_x(pred_full, v1557, v1545, v2266); + svfloat32_t v1600 = svsub_f32_x(svptrue_b32(), v1557, v1599); + svst1_f64(pred_full, (double *)(v2061), svreinterpret_f64_f32(v1114)); + svst1_f64(pred_full, (double *)(v2115), svreinterpret_f64_f32(v1261)); + svst1_f64(pred_full, (double *)(v2135), svreinterpret_f64_f32(v1289)); + svst1_f64(pred_full, (double *)(v2189), svreinterpret_f64_f32(v1438)); + svfloat32_t v1140 = svnmls_f32_x(pred_full, v1099, v1077, v2266); + svfloat32_t v1464 = svnmls_f32_x(pred_full, v1423, v1401, v2266); + svfloat32_t v1585 = svsub_f32_x(svptrue_b32(), v1563, v1584); + svfloat32_t v1613 = svnmls_f32_x(pred_full, v1600, v1557, v2266); + svst1_f64(pred_full, (double *)(v2051), svreinterpret_f64_f32(v1099)); + svst1_f64(pred_full, (double *)(v2071), svreinterpret_f64_f32(v1127)); + svst1_f64(pred_full, (double *)(v2145), svreinterpret_f64_f32(v1302)); + svst1_f64(pred_full, (double *)(v2179), svreinterpret_f64_f32(v1423)); + svst1_f64(pred_full, (double *)(v2199), svreinterpret_f64_f32(v1451)); + svst1_f64(pred_full, (double *)(v2253), svreinterpret_f64_f32(v1600)); + svfloat32_t v1626 = svnmls_f32_x(pred_full, v1585, v1563, v2266); + svst1_f64(pred_full, (double *)(v2081), 
svreinterpret_f64_f32(v1140)); + svst1_f64(pred_full, (double *)(v2209), svreinterpret_f64_f32(v1464)); + svst1_f64(pred_full, (double *)(v2243), svreinterpret_f64_f32(v1585)); + svst1_f64(pred_full, (double *)(v2263), svreinterpret_f64_f32(v1613)); + svst1_f64(pred_full, (double *)(v2273), svreinterpret_f64_f32(v1626)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* armral_fft_cf32_cf32_cf32_ac_n_uun32, variant built when ARMRAL_ARCH_SVE is not defined; written with 64-bit Neon intrinsics on float32x2_t (vadd_f32, vsub_f32, vmul_f32, vfma_f32, vrev64_f32). Straight-line kernel: x and y are reinterpreted as arrays of float32x2_t (one float pair per element; presumably {re, im} of armral_cmplx_f32_t - TODO confirm layout). It loads the 32 elements x[istride * k], k = 0..31, combines them with add/sub butterflies and constant multiplies, and stores 32 results to y[ostride * k], k = 0..31; by its name and the constant values this looks like a single 32-point complex FFT - confirm against the generator. dir is broadcast to both lanes (v934) and pre-multiplied into the constant pairs that are later applied to vrev64_f32 lane-swapped operands, while the equal-lane constant pairs are used unscaled; presumably dir is +1.0f or -1.0f to select the transform direction - verify against callers. howmany is not referenced in this variant. NOTE(review): the body appears to be machine generated and statement order matters; regenerate rather than hand-edit. */ +void armral_fft_cf32_cf32_cf32_ac_n_uun32(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + float v735 = 7.0710678118654757e-01F; + float v746 = -7.0710678118654746e-01F; + float v792 = 5.5557023301960229e-01F; + float v806 = -1.9509032201612861e-01F; + float v853 = 9.2387953251128674e-01F; + float v860 = -9.2387953251128685e-01F; + float v863 = 3.8268343236508967e-01F; + float v864 = -3.8268343236508967e-01F; + float v906 = 1.9509032201612833e-01F; + float v909 = -9.8078528040323043e-01F; + float v910 = 9.8078528040323043e-01F; + float v917 = -5.5557023301960218e-01F; + float v920 = 8.3146961230254524e-01F; + float v921 = -8.3146961230254524e-01F; + float v931 = -1.0000000000000000e+00F; + float v932 = 1.0000000000000000e+00F; + float32x2_t v13 = v5[0]; + float32x2_t v316 = v5[istride]; + float32x2_t v565 = (float32x2_t){v910, v910}; + float32x2_t v622 = (float32x2_t){v853, v853}; + float32x2_t v626 = (float32x2_t){v864, v863}; + float32x2_t v679 = (float32x2_t){v920, v920}; + float32x2_t v683 = (float32x2_t){v917, v792}; + float32x2_t v690 = (float32x2_t){v806, v806}; + float32x2_t v736 = (float32x2_t){v735, v735}; + float32x2_t v747 = (float32x2_t){v746, v746}; + float32x2_t v751 = (float32x2_t){v932, v931}; + float32x2_t v793 = (float32x2_t){v792, v792}; + float32x2_t v797 = (float32x2_t){v921, v920}; + float32x2_t v804 = (float32x2_t){v909, v909}; + float32x2_t v808 = (float32x2_t){v806, v906}; + float32x2_t v850 =
(float32x2_t){v863, v863}; + float32x2_t v854 = (float32x2_t){v860, v853}; + float32x2_t v861 = (float32x2_t){v860, v860}; + float32x2_t v865 = (float32x2_t){v863, v864}; + float32x2_t v907 = (float32x2_t){v906, v906}; + float32x2_t v911 = (float32x2_t){v909, v910}; + float32x2_t v918 = (float32x2_t){v917, v917}; + float32x2_t v922 = (float32x2_t){v920, v921}; + float32x2_t v933 = (float32x2_t){v931, v932}; + float32x2_t v934 = (float32x2_t){v4, v4}; + float32x2_t v18 = v5[istride * 16]; + float32x2_t v25 = v5[istride * 8]; + float32x2_t v30 = v5[istride * 24]; + float32x2_t v48 = v5[istride * 4]; + float32x2_t v53 = v5[istride * 20]; + float32x2_t v60 = v5[istride * 12]; + float32x2_t v65 = v5[istride * 28]; + float32x2_t v122 = v5[istride * 2]; + float32x2_t v127 = v5[istride * 18]; + float32x2_t v134 = v5[istride * 10]; + float32x2_t v139 = v5[istride * 26]; + float32x2_t v157 = v5[istride * 6]; + float32x2_t v162 = v5[istride * 22]; + float32x2_t v169 = v5[istride * 14]; + float32x2_t v174 = v5[istride * 30]; + float32x2_t v321 = v5[istride * 17]; + float32x2_t v328 = v5[istride * 9]; + float32x2_t v333 = v5[istride * 25]; + float32x2_t v351 = v5[istride * 5]; + float32x2_t v356 = v5[istride * 21]; + float32x2_t v363 = v5[istride * 13]; + float32x2_t v368 = v5[istride * 29]; + float32x2_t v425 = v5[istride * 3]; + float32x2_t v430 = v5[istride * 19]; + float32x2_t v437 = v5[istride * 11]; + float32x2_t v442 = v5[istride * 27]; + float32x2_t v460 = v5[istride * 7]; + float32x2_t v465 = v5[istride * 23]; + float32x2_t v472 = v5[istride * 15]; + float32x2_t v477 = v5[istride * 31]; + float32x2_t v628 = vmul_f32(v934, v626); + float32x2_t v685 = vmul_f32(v934, v683); + float32x2_t v753 = vmul_f32(v934, v751); + float32x2_t v799 = vmul_f32(v934, v797); + float32x2_t v810 = vmul_f32(v934, v808); + float32x2_t v856 = vmul_f32(v934, v854); + float32x2_t v867 = vmul_f32(v934, v865); + float32x2_t v913 = vmul_f32(v934, v911); + float32x2_t v924 = vmul_f32(v934, v922); +
float32x2_t v935 = vmul_f32(v934, v933); + float32x2_t v19 = vadd_f32(v13, v18); + float32x2_t v20 = vsub_f32(v13, v18); + float32x2_t v31 = vadd_f32(v25, v30); + float32x2_t v32 = vsub_f32(v25, v30); + float32x2_t v54 = vadd_f32(v48, v53); + float32x2_t v55 = vsub_f32(v48, v53); + float32x2_t v66 = vadd_f32(v60, v65); + float32x2_t v67 = vsub_f32(v60, v65); + float32x2_t v128 = vadd_f32(v122, v127); + float32x2_t v129 = vsub_f32(v122, v127); + float32x2_t v140 = vadd_f32(v134, v139); + float32x2_t v141 = vsub_f32(v134, v139); + float32x2_t v163 = vadd_f32(v157, v162); + float32x2_t v164 = vsub_f32(v157, v162); + float32x2_t v175 = vadd_f32(v169, v174); + float32x2_t v176 = vsub_f32(v169, v174); + float32x2_t v322 = vadd_f32(v316, v321); + float32x2_t v323 = vsub_f32(v316, v321); + float32x2_t v334 = vadd_f32(v328, v333); + float32x2_t v335 = vsub_f32(v328, v333); + float32x2_t v357 = vadd_f32(v351, v356); + float32x2_t v358 = vsub_f32(v351, v356); + float32x2_t v369 = vadd_f32(v363, v368); + float32x2_t v370 = vsub_f32(v363, v368); + float32x2_t v431 = vadd_f32(v425, v430); + float32x2_t v432 = vsub_f32(v425, v430); + float32x2_t v443 = vadd_f32(v437, v442); + float32x2_t v444 = vsub_f32(v437, v442); + float32x2_t v466 = vadd_f32(v460, v465); + float32x2_t v467 = vsub_f32(v460, v465); + float32x2_t v478 = vadd_f32(v472, v477); + float32x2_t v479 = vsub_f32(v472, v477); + float32x2_t v38 = vrev64_f32(v32); + float32x2_t v40 = vadd_f32(v19, v31); + float32x2_t v41 = vsub_f32(v19, v31); + float32x2_t v68 = vadd_f32(v54, v66); + float32x2_t v69 = vsub_f32(v54, v66); + float32x2_t v84 = vmul_f32(v55, v736); + float32x2_t v95 = vmul_f32(v67, v747); + float32x2_t v147 = vrev64_f32(v141); + float32x2_t v149 = vadd_f32(v128, v140); + float32x2_t v150 = vsub_f32(v128, v140); + float32x2_t v182 = vrev64_f32(v176); + float32x2_t v184 = vadd_f32(v163, v175); + float32x2_t v185 = vsub_f32(v163, v175); + float32x2_t v341 = vrev64_f32(v335); + float32x2_t v343 = vadd_f32(v322,
v334); + float32x2_t v344 = vsub_f32(v322, v334); + float32x2_t v371 = vadd_f32(v357, v369); + float32x2_t v372 = vsub_f32(v357, v369); + float32x2_t v387 = vmul_f32(v358, v736); + float32x2_t v398 = vmul_f32(v370, v747); + float32x2_t v450 = vrev64_f32(v444); + float32x2_t v452 = vadd_f32(v431, v443); + float32x2_t v453 = vsub_f32(v431, v443); + float32x2_t v480 = vadd_f32(v466, v478); + float32x2_t v481 = vsub_f32(v466, v478); + float32x2_t v496 = vmul_f32(v467, v736); + float32x2_t v507 = vmul_f32(v479, v747); + float32x2_t v39 = vmul_f32(v38, v753); + float32x2_t v75 = vrev64_f32(v69); + float32x2_t v77 = vadd_f32(v40, v68); + float32x2_t v78 = vsub_f32(v40, v68); + float32x2_t v90 = vrev64_f32(v84); + float32x2_t v101 = vrev64_f32(v95); + float32x2_t v148 = vmul_f32(v147, v753); + float32x2_t v183 = vmul_f32(v182, v753); + float32x2_t v188 = vadd_f32(v149, v184); + float32x2_t v189 = vsub_f32(v149, v184); + float32x2_t v241 = vmul_f32(v150, v736); + float32x2_t v252 = vmul_f32(v185, v747); + float32x2_t v342 = vmul_f32(v341, v753); + float32x2_t v378 = vrev64_f32(v372); + float32x2_t v380 = vadd_f32(v343, v371); + float32x2_t v381 = vsub_f32(v343, v371); + float32x2_t v393 = vrev64_f32(v387); + float32x2_t v404 = vrev64_f32(v398); + float32x2_t v451 = vmul_f32(v450, v753); + float32x2_t v487 = vrev64_f32(v481); + float32x2_t v489 = vadd_f32(v452, v480); + float32x2_t v490 = vsub_f32(v452, v480); + float32x2_t v502 = vrev64_f32(v496); + float32x2_t v513 = vrev64_f32(v507); + float32x2_t v42 = vsub_f32(v20, v39); + float32x2_t v43 = vadd_f32(v20, v39); + float32x2_t v76 = vmul_f32(v75, v753); + float32x2_t v91 = vmul_f32(v90, v935); + float32x2_t v102 = vmul_f32(v101, v753); + float32x2_t v151 = vsub_f32(v129, v148); + float32x2_t v152 = vadd_f32(v129, v148); + float32x2_t v186 = vsub_f32(v164, v183); + float32x2_t v187 = vadd_f32(v164, v183); + float32x2_t v195 = vrev64_f32(v189); + float32x2_t v197 = vadd_f32(v77, v188); + float32x2_t v198 = vsub_f32(v77,
v188); + float32x2_t v247 = vrev64_f32(v241); + float32x2_t v258 = vrev64_f32(v252); + float32x2_t v345 = vsub_f32(v323, v342); + float32x2_t v346 = vadd_f32(v323, v342); + float32x2_t v379 = vmul_f32(v378, v753); + float32x2_t v394 = vmul_f32(v393, v935); + float32x2_t v405 = vmul_f32(v404, v753); + float32x2_t v454 = vsub_f32(v432, v451); + float32x2_t v455 = vadd_f32(v432, v451); + float32x2_t v488 = vmul_f32(v487, v753); + float32x2_t v503 = vmul_f32(v502, v935); + float32x2_t v514 = vmul_f32(v513, v753); + float32x2_t v530 = vadd_f32(v380, v489); + float32x2_t v531 = vsub_f32(v380, v489); + float32x2_t v737 = vmul_f32(v381, v736); + float32x2_t v748 = vmul_f32(v490, v747); + float32x2_t v79 = vsub_f32(v41, v76); + float32x2_t v80 = vadd_f32(v41, v76); + float32x2_t v103 = vadd_f32(v84, v91); + float32x2_t v104 = vadd_f32(v95, v102); + float32x2_t v196 = vmul_f32(v195, v753); + float32x2_t v204 = vmul_f32(v151, v622); + float32x2_t v210 = vrev64_f32(v151); + float32x2_t v215 = vmul_f32(v186, v850); + float32x2_t v221 = vrev64_f32(v186); + float32x2_t v248 = vmul_f32(v247, v935); + float32x2_t v259 = vmul_f32(v258, v753); + float32x2_t v278 = vmul_f32(v152, v850); + float32x2_t v284 = vrev64_f32(v152); + float32x2_t v289 = vmul_f32(v187, v861); + float32x2_t v295 = vrev64_f32(v187); + float32x2_t v382 = vsub_f32(v344, v379); + float32x2_t v383 = vadd_f32(v344, v379); + float32x2_t v406 = vadd_f32(v387, v394); + float32x2_t v407 = vadd_f32(v398, v405); + float32x2_t v491 = vsub_f32(v453, v488); + float32x2_t v492 = vadd_f32(v453, v488); + float32x2_t v515 = vadd_f32(v496, v503); + float32x2_t v516 = vadd_f32(v507, v514); + float32x2_t v537 = vrev64_f32(v531); + float32x2_t v539 = vadd_f32(v197, v530); + float32x2_t v540 = vsub_f32(v197, v530); + float32x2_t v743 = vrev64_f32(v737); + float32x2_t v754 = vrev64_f32(v748); + float32x2_t v105 = vadd_f32(v103, v104); + float32x2_t v106 = vsub_f32(v104, v103); + float32x2_t v199 = vsub_f32(v78, v196); + float32x2_t
v200 = vadd_f32(v78, v196); + float32x2_t v260 = vadd_f32(v241, v248); + float32x2_t v261 = vadd_f32(v252, v259); + float32x2_t v408 = vadd_f32(v406, v407); + float32x2_t v409 = vsub_f32(v407, v406); + float32x2_t v517 = vadd_f32(v515, v516); + float32x2_t v518 = vsub_f32(v516, v515); + float32x2_t v538 = vmul_f32(v537, v753); + v6[0] = v539; + v6[ostride * 16] = v540; + float32x2_t v623 = vmul_f32(v382, v622); + float32x2_t v629 = vrev64_f32(v382); + float32x2_t v634 = vmul_f32(v491, v850); + float32x2_t v640 = vrev64_f32(v491); + float32x2_t v744 = vmul_f32(v743, v935); + float32x2_t v755 = vmul_f32(v754, v753); + float32x2_t v851 = vmul_f32(v383, v850); + float32x2_t v857 = vrev64_f32(v383); + float32x2_t v862 = vmul_f32(v492, v861); + float32x2_t v868 = vrev64_f32(v492); + float32x2_t v112 = vrev64_f32(v106); + float32x2_t v114 = vadd_f32(v42, v105); + float32x2_t v115 = vsub_f32(v42, v105); + float32x2_t v223 = vfma_f32(v204, v210, v628); + float32x2_t v224 = vfma_f32(v215, v221, v856); + float32x2_t v262 = vadd_f32(v260, v261); + float32x2_t v263 = vsub_f32(v261, v260); + float32x2_t v297 = vfma_f32(v278, v284, v856); + float32x2_t v298 = vfma_f32(v289, v295, v867); + float32x2_t v415 = vrev64_f32(v409); + float32x2_t v417 = vadd_f32(v345, v408); + float32x2_t v418 = vsub_f32(v345, v408); + float32x2_t v524 = vrev64_f32(v518); + float32x2_t v526 = vadd_f32(v454, v517); + float32x2_t v527 = vsub_f32(v454, v517); + float32x2_t v541 = vsub_f32(v198, v538); + float32x2_t v542 = vadd_f32(v198, v538); + float32x2_t v756 = vadd_f32(v737, v744); + float32x2_t v757 = vadd_f32(v748, v755); + float32x2_t v113 = vmul_f32(v112, v935); + float32x2_t v225 = vadd_f32(v223, v224); + float32x2_t v226 = vsub_f32(v224, v223); + float32x2_t v269 = vrev64_f32(v263); + float32x2_t v271 = vadd_f32(v79, v262); + float32x2_t v272 = vsub_f32(v79, v262); + float32x2_t v299 = vadd_f32(v297, v298); + float32x2_t v300 = vsub_f32(v298, v297); + float32x2_t v416 = vmul_f32(v415, v935); +
float32x2_t v525 = vmul_f32(v524, v935); + v6[ostride * 8] = v541; + v6[ostride * 24] = v542; + float32x2_t v566 = vmul_f32(v417, v565); + float32x2_t v572 = vrev64_f32(v417); + float32x2_t v577 = vmul_f32(v526, v679); + float32x2_t v583 = vrev64_f32(v526); + float32x2_t v642 = vfma_f32(v623, v629, v628); + float32x2_t v643 = vfma_f32(v634, v640, v856); + float32x2_t v758 = vadd_f32(v756, v757); + float32x2_t v759 = vsub_f32(v757, v756); + float32x2_t v794 = vmul_f32(v418, v793); + float32x2_t v800 = vrev64_f32(v418); + float32x2_t v805 = vmul_f32(v527, v804); + float32x2_t v811 = vrev64_f32(v527); + float32x2_t v870 = vfma_f32(v851, v857, v856); + float32x2_t v871 = vfma_f32(v862, v868, v867); + float32x2_t v116 = vsub_f32(v43, v113); + float32x2_t v117 = vadd_f32(v43, v113); + float32x2_t v232 = vrev64_f32(v226); + float32x2_t v234 = vadd_f32(v114, v225); + float32x2_t v235 = vsub_f32(v114, v225); + float32x2_t v270 = vmul_f32(v269, v935); + float32x2_t v306 = vrev64_f32(v300); + float32x2_t v419 = vsub_f32(v346, v416); + float32x2_t v420 = vadd_f32(v346, v416); + float32x2_t v528 = vsub_f32(v455, v525); + float32x2_t v529 = vadd_f32(v455, v525); + float32x2_t v644 = vadd_f32(v642, v643); + float32x2_t v645 = vsub_f32(v643, v642); + float32x2_t v765 = vrev64_f32(v759); + float32x2_t v767 = vadd_f32(v199, v758); + float32x2_t v768 = vsub_f32(v199, v758); + float32x2_t v872 = vadd_f32(v870, v871); + float32x2_t v873 = vsub_f32(v871, v870); + float32x2_t v233 = vmul_f32(v232, v935); + float32x2_t v273 = vsub_f32(v80, v270); + float32x2_t v274 = vadd_f32(v80, v270); + float32x2_t v307 = vmul_f32(v306, v935); + float32x2_t v308 = vadd_f32(v116, v299); + float32x2_t v309 = vsub_f32(v116, v299); + float32x2_t v585 = vfma_f32(v566, v572, v810); + float32x2_t v586 = vfma_f32(v577, v583, v685); + float32x2_t v651 = vrev64_f32(v645); + float32x2_t v653 = vadd_f32(v271, v644); + float32x2_t v654 = vsub_f32(v271, v644); + float32x2_t v680 = vmul_f32(v419, v679); + float32x2_t
v686 = vrev64_f32(v419); + float32x2_t v691 = vmul_f32(v528, v690); + float32x2_t v697 = vrev64_f32(v528); + float32x2_t v766 = vmul_f32(v765, v935); + v6[ostride * 4] = v767; + v6[ostride * 20] = v768; + float32x2_t v813 = vfma_f32(v794, v800, v799); + float32x2_t v814 = vfma_f32(v805, v811, v810); + float32x2_t v879 = vrev64_f32(v873); + float32x2_t v908 = vmul_f32(v420, v907); + float32x2_t v914 = vrev64_f32(v420); + float32x2_t v919 = vmul_f32(v529, v918); + float32x2_t v925 = vrev64_f32(v529); + float32x2_t v236 = vsub_f32(v115, v233); + float32x2_t v237 = vadd_f32(v115, v233); + float32x2_t v310 = vsub_f32(v117, v307); + float32x2_t v311 = vadd_f32(v117, v307); + float32x2_t v587 = vadd_f32(v585, v586); + float32x2_t v588 = vsub_f32(v586, v585); + float32x2_t v652 = vmul_f32(v651, v935); + v6[ostride * 2] = v653; + v6[ostride * 18] = v654; + float32x2_t v769 = vsub_f32(v200, v766); + float32x2_t v770 = vadd_f32(v200, v766); + float32x2_t v815 = vadd_f32(v813, v814); + float32x2_t v816 = vsub_f32(v814, v813); + float32x2_t v880 = vmul_f32(v879, v935); + float32x2_t v881 = vadd_f32(v273, v872); + float32x2_t v882 = vsub_f32(v273, v872); + float32x2_t v594 = vrev64_f32(v588); + float32x2_t v596 = vadd_f32(v234, v587); + float32x2_t v597 = vsub_f32(v234, v587); + float32x2_t v655 = vsub_f32(v272, v652); + float32x2_t v656 = vadd_f32(v272, v652); + float32x2_t v699 = vfma_f32(v680, v686, v685); + float32x2_t v700 = vfma_f32(v691, v697, v913); + v6[ostride * 12] = v769; + v6[ostride * 28] = v770; + float32x2_t v822 = vrev64_f32(v816); + float32x2_t v824 = vadd_f32(v236, v815); + float32x2_t v825 = vsub_f32(v236, v815); + float32x2_t v883 = vsub_f32(v274, v880); + float32x2_t v884 = vadd_f32(v274, v880); + v6[ostride * 6] = v881; + v6[ostride * 22] = v882; + float32x2_t v927 = vfma_f32(v908, v914, v913); + float32x2_t v928 = vfma_f32(v919, v925, v924); + float32x2_t v595 = vmul_f32(v594, v935); + v6[ostride] = v596; + v6[ostride * 17] = v597; + v6[ostride * 10] =
v655; + v6[ostride * 26] = v656; + float32x2_t v701 = vadd_f32(v699, v700); + float32x2_t v702 = vsub_f32(v700, v699); + float32x2_t v823 = vmul_f32(v822, v935); + v6[ostride * 5] = v824; + v6[ostride * 21] = v825; + v6[ostride * 14] = v883; + v6[ostride * 30] = v884; + float32x2_t v929 = vadd_f32(v927, v928); + float32x2_t v930 = vsub_f32(v928, v927); + float32x2_t v598 = vsub_f32(v235, v595); + float32x2_t v599 = vadd_f32(v235, v595); + float32x2_t v708 = vrev64_f32(v702); + float32x2_t v710 = vadd_f32(v308, v701); + float32x2_t v711 = vsub_f32(v308, v701); + float32x2_t v826 = vsub_f32(v237, v823); + float32x2_t v827 = vadd_f32(v237, v823); + float32x2_t v936 = vrev64_f32(v930); + float32x2_t v938 = vadd_f32(v310, v929); + float32x2_t v939 = vsub_f32(v310, v929); + v6[ostride * 9] = v598; + v6[ostride * 25] = v599; + float32x2_t v709 = vmul_f32(v708, v935); + v6[ostride * 3] = v710; + v6[ostride * 19] = v711; + v6[ostride * 13] = v826; + v6[ostride * 29] = v827; + float32x2_t v937 = vmul_f32(v936, v935); + v6[ostride * 7] = v938; + v6[ostride * 23] = v939; + float32x2_t v712 = vsub_f32(v309, v709); + float32x2_t v713 = vadd_f32(v309, v709); + float32x2_t v940 = vsub_f32(v311, v937); + float32x2_t v941 = vadd_f32(v311, v937); + v6[ostride * 11] = v712; + v6[ostride * 27] = v713; + v6[ostride * 15] = v940; + v6[ostride * 31] = v941; +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cf32_cf32_cf32_ac_n_uun32(const armral_cmplx_f32_t *restrict x, + armral_cmplx_f32_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const float32x2_t *v5 = (const float32x2_t *)x; + float32x2_t *v6 = (float32x2_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v799 = -1.9509032201612819e-01F; + float v854 = 7.0710678118654757e-01F; + float v866 = -7.0710678118654746e-01F; + float v871 = -1.0000000000000000e+00F; + float v921 = 5.5557023301960229e-01F; + float v926 =
8.3146961230254524e-01F; + float v933 = -9.8078528040323043e-01F; + float v988 = 3.8268343236508984e-01F; + float v993 = 9.2387953251128674e-01F; + float v1000 = -9.2387953251128685e-01F; + float v1005 = -3.8268343236508967e-01F; + float v1055 = 1.9509032201612833e-01F; + float v1060 = 9.8078528040323043e-01F; + float v1067 = -5.5557023301960218e-01F; + float v1072 = -8.3146961230254524e-01F; + const float32x2_t *v1297 = &v5[v0]; + float32x2_t *v1498 = &v6[v2]; + int64_t v22 = v0 * 16; + int64_t v31 = v0 * 8; + int64_t v38 = v0 * 24; + int64_t v58 = v0 * 4; + int64_t v65 = v0 * 20; + int64_t v74 = v0 * 12; + int64_t v81 = v0 * 28; + int64_t v142 = v0 * 2; + int64_t v149 = v0 * 18; + int64_t v158 = v0 * 10; + int64_t v165 = v0 * 26; + int64_t v185 = v0 * 6; + int64_t v192 = v0 * 22; + int64_t v201 = v0 * 14; + int64_t v208 = v0 * 30; + int64_t v365 = v0 * 17; + int64_t v374 = v0 * 9; + int64_t v381 = v0 * 25; + int64_t v401 = v0 * 5; + int64_t v408 = v0 * 21; + int64_t v417 = v0 * 13; + int64_t v424 = v0 * 29; + int64_t v485 = v0 * 3; + int64_t v492 = v0 * 19; + int64_t v501 = v0 * 11; + int64_t v508 = v0 * 27; + int64_t v528 = v0 * 7; + int64_t v535 = v0 * 23; + int64_t v544 = v0 * 15; + int64_t v551 = v0 * 31; + int64_t v632 = v2 * 8; + int64_t v639 = v2 * 16; + int64_t v646 = v2 * 24; + int64_t v699 = v2 * 9; + int64_t v706 = v2 * 17; + int64_t v713 = v2 * 25; + float v728 = v4 * v988; + int64_t v759 = v2 * 2; + int64_t v766 = v2 * 10; + int64_t v773 = v2 * 18; + int64_t v780 = v2 * 26; + float v795 = v4 * v921; + int64_t v826 = v2 * 3; + int64_t v833 = v2 * 11; + int64_t v840 = v2 * 19; + int64_t v847 = v2 * 27; + float v874 = v4 * v871; + int64_t v893 = v2 * 4; + int64_t v900 = v2 * 12; + int64_t v907 = v2 * 20; + int64_t v914 = v2 * 28; + float v929 = v4 * v926; + float v941 = v4 * v1055; + int64_t v960 = v2 * 5; + int64_t v967 = v2 * 13; + int64_t v974 = v2 * 21; + int64_t v981 = v2 * 29; + float v996 = v4 * v993; + float v1008 = v4 * v1005; + int64_t v1027 = 
v2 * 6; + int64_t v1034 = v2 * 14; + int64_t v1041 = v2 * 22; + int64_t v1048 = v2 * 30; + float v1063 = v4 * v1060; + float v1075 = v4 * v1072; + int64_t v1094 = v2 * 7; + int64_t v1101 = v2 * 15; + int64_t v1108 = v2 * 23; + int64_t v1115 = v2 * 31; + const float32x2_t *v1128 = &v5[0]; + float32x2_t *v1457 = &v6[0]; + svfloat32_t v1487 = svdup_n_f32(v1060); + svfloat32_t v1528 = svdup_n_f32(v993); + svfloat32_t v1569 = svdup_n_f32(v926); + svfloat32_t v1571 = svdup_n_f32(v799); + svfloat32_t v1610 = svdup_n_f32(v854); + svfloat32_t v1612 = svdup_n_f32(v866); + svfloat32_t v1651 = svdup_n_f32(v921); + svfloat32_t v1653 = svdup_n_f32(v933); + svfloat32_t v1692 = svdup_n_f32(v988); + svfloat32_t v1694 = svdup_n_f32(v1000); + svfloat32_t v1733 = svdup_n_f32(v1055); + svfloat32_t v1735 = svdup_n_f32(v1067); + svfloat32_t v1737 = svdup_n_f32(v4); + svfloat32_t v1807 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1297)[0])); + const float32x2_t *v1137 = &v5[v22]; + const float32x2_t *v1146 = &v5[v31]; + const float32x2_t *v1155 = &v5[v38]; + const float32x2_t *v1165 = &v5[v58]; + const float32x2_t *v1174 = &v5[v65]; + const float32x2_t *v1183 = &v5[v74]; + const float32x2_t *v1192 = &v5[v81]; + const float32x2_t *v1207 = &v5[v142]; + const float32x2_t *v1216 = &v5[v149]; + const float32x2_t *v1225 = &v5[v158]; + const float32x2_t *v1234 = &v5[v165]; + const float32x2_t *v1244 = &v5[v185]; + const float32x2_t *v1253 = &v5[v192]; + const float32x2_t *v1262 = &v5[v201]; + const float32x2_t *v1271 = &v5[v208]; + const float32x2_t *v1306 = &v5[v365]; + const float32x2_t *v1315 = &v5[v374]; + const float32x2_t *v1324 = &v5[v381]; + const float32x2_t *v1334 = &v5[v401]; + const float32x2_t *v1343 = &v5[v408]; + const float32x2_t *v1352 = &v5[v417]; + const float32x2_t *v1361 = &v5[v424]; + const float32x2_t *v1376 = &v5[v485]; + const float32x2_t *v1385 = &v5[v492]; + const float32x2_t *v1394 = &v5[v501]; + const float32x2_t *v1403 = &v5[v508]; + const 
float32x2_t *v1413 = &v5[v528]; + const float32x2_t *v1422 = &v5[v535]; + const float32x2_t *v1431 = &v5[v544]; + const float32x2_t *v1440 = &v5[v551]; + float32x2_t *v1466 = &v6[v632]; + float32x2_t *v1475 = &v6[v639]; + float32x2_t *v1484 = &v6[v646]; + float32x2_t *v1507 = &v6[v699]; + float32x2_t *v1516 = &v6[v706]; + float32x2_t *v1525 = &v6[v713]; + svfloat32_t v1529 = svdup_n_f32(v728); + float32x2_t *v1539 = &v6[v759]; + float32x2_t *v1548 = &v6[v766]; + float32x2_t *v1557 = &v6[v773]; + float32x2_t *v1566 = &v6[v780]; + svfloat32_t v1570 = svdup_n_f32(v795); + float32x2_t *v1580 = &v6[v826]; + float32x2_t *v1589 = &v6[v833]; + float32x2_t *v1598 = &v6[v840]; + float32x2_t *v1607 = &v6[v847]; + svfloat32_t v1613 = svdup_n_f32(v874); + float32x2_t *v1621 = &v6[v893]; + float32x2_t *v1630 = &v6[v900]; + float32x2_t *v1639 = &v6[v907]; + float32x2_t *v1648 = &v6[v914]; + svfloat32_t v1652 = svdup_n_f32(v929); + svfloat32_t v1654 = svdup_n_f32(v941); + float32x2_t *v1662 = &v6[v960]; + float32x2_t *v1671 = &v6[v967]; + float32x2_t *v1680 = &v6[v974]; + float32x2_t *v1689 = &v6[v981]; + svfloat32_t v1693 = svdup_n_f32(v996); + svfloat32_t v1695 = svdup_n_f32(v1008); + float32x2_t *v1703 = &v6[v1027]; + float32x2_t *v1712 = &v6[v1034]; + float32x2_t *v1721 = &v6[v1041]; + float32x2_t *v1730 = &v6[v1048]; + svfloat32_t v1734 = svdup_n_f32(v1063); + svfloat32_t v1736 = svdup_n_f32(v1075); + float32x2_t *v1744 = &v6[v1094]; + float32x2_t *v1753 = &v6[v1101]; + float32x2_t *v1762 = &v6[v1108]; + float32x2_t *v1771 = &v6[v1115]; + svfloat32_t v1775 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1128)[0])); + svfloat32_t v1777 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1137)[0])); + svfloat32_t v1779 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1146)[0])); + svfloat32_t v1781 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1155)[0])); + svfloat32_t v1783 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1165)[0])); + svfloat32_t v1785 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1174)[0])); + svfloat32_t v1787 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1183)[0])); + svfloat32_t v1789 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1192)[0])); + svfloat32_t v1791 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1207)[0])); + svfloat32_t v1793 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1216)[0])); + svfloat32_t v1795 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1225)[0])); + svfloat32_t v1797 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1234)[0])); + svfloat32_t v1799 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1244)[0])); + svfloat32_t v1801 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1253)[0])); + svfloat32_t v1803 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1262)[0])); + svfloat32_t v1805 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1271)[0])); + svfloat32_t v1809 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1306)[0])); + svfloat32_t v1811 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1315)[0])); + svfloat32_t v1813 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1324)[0])); + svfloat32_t v1815 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1334)[0])); + svfloat32_t v1817 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1343)[0])); + svfloat32_t v1819 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1352)[0])); + svfloat32_t v1821 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1361)[0])); + svfloat32_t v1823 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1376)[0])); + svfloat32_t v1825 = + 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1385)[0])); + svfloat32_t v1827 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1394)[0])); + svfloat32_t v1829 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1403)[0])); + svfloat32_t v1831 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1413)[0])); + svfloat32_t v1833 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1422)[0])); + svfloat32_t v1835 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1431)[0])); + svfloat32_t v1837 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v1440)[0])); + svfloat32_t v28 = svadd_f32_x(svptrue_b32(), v1775, v1777); + svfloat32_t v29 = svsub_f32_x(svptrue_b32(), v1775, v1777); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v1779, v1781); + svfloat32_t v45 = svsub_f32_x(svptrue_b32(), v1779, v1781); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v1783, v1785); + svfloat32_t v72 = svsub_f32_x(svptrue_b32(), v1783, v1785); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v1787, v1789); + svfloat32_t v88 = svsub_f32_x(svptrue_b32(), v1787, v1789); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v1791, v1793); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v1791, v1793); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v1795, v1797); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v1795, v1797); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v1799, v1801); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v1799, v1801); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v1803, v1805); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v1803, v1805); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v1807, v1809); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v1807, v1809); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v1811, v1813); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v1811, v1813); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v1815, v1817); + svfloat32_t v415 = 
svsub_f32_x(svptrue_b32(), v1815, v1817); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v1819, v1821); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v1819, v1821); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v1823, v1825); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v1823, v1825); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v1827, v1829); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v1827, v1829); + svfloat32_t v541 = svadd_f32_x(svptrue_b32(), v1831, v1833); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v1831, v1833); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v1835, v1837); + svfloat32_t v558 = svsub_f32_x(svptrue_b32(), v1835, v1837); + svfloat32_t zero52 = svdup_n_f32(0); + svfloat32_t v52 = svcmla_f32_x(pred_full, zero52, v1613, v45, 90); + svfloat32_t v53 = svadd_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v54 = svsub_f32_x(svptrue_b32(), v28, v44); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v71, v87); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v71, v87); + svfloat32_t v106 = svmul_f32_x(svptrue_b32(), v72, v1610); + svfloat32_t v118 = svmul_f32_x(svptrue_b32(), v88, v1612); + svfloat32_t zero179 = svdup_n_f32(0); + svfloat32_t v179 = svcmla_f32_x(pred_full, zero179, v1613, v172, 90); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v155, v171); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v155, v171); + svfloat32_t zero222 = svdup_n_f32(0); + svfloat32_t v222 = svcmla_f32_x(pred_full, zero222, v1613, v215, 90); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v198, v214); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v198, v214); + svfloat32_t zero395 = svdup_n_f32(0); + svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v1613, v388, 90); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v371, v387); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v371, v387); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v414, v430); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v414, v430); + svfloat32_t v449 = 
svmul_f32_x(svptrue_b32(), v415, v1610); + svfloat32_t v461 = svmul_f32_x(svptrue_b32(), v431, v1612); + svfloat32_t zero522 = svdup_n_f32(0); + svfloat32_t v522 = svcmla_f32_x(pred_full, zero522, v1613, v515, 90); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v498, v514); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v498, v514); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v541, v557); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v541, v557); + svfloat32_t v576 = svmul_f32_x(svptrue_b32(), v542, v1610); + svfloat32_t v588 = svmul_f32_x(svptrue_b32(), v558, v1612); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v29, v52); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v29, v52); + svfloat32_t zero97 = svdup_n_f32(0); + svfloat32_t v97 = svcmla_f32_x(pred_full, zero97, v1613, v90, 90); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v182 = svsub_f32_x(svptrue_b32(), v156, v179); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v156, v179); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v199, v222); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v199, v222); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v180, v223); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v180, v223); + svfloat32_t v283 = svmul_f32_x(svptrue_b32(), v181, v1610); + svfloat32_t v295 = svmul_f32_x(svptrue_b32(), v224, v1612); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v372, v395); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v372, v395); + svfloat32_t zero440 = svdup_n_f32(0); + svfloat32_t v440 = svcmla_f32_x(pred_full, zero440, v1613, v433, 90); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v396, v432); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v396, v432); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v499, v522); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v499, v522); + svfloat32_t zero567 = svdup_n_f32(0); + svfloat32_t v567 = svcmla_f32_x(pred_full, zero567, v1613, v560, 
90); + svfloat32_t v568 = svadd_f32_x(svptrue_b32(), v523, v559); + svfloat32_t v569 = svsub_f32_x(svptrue_b32(), v523, v559); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v54, v97); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v54, v97); + svfloat32_t v126 = svcmla_f32_x(pred_full, v106, v1737, v106, 90); + svfloat32_t v127 = svcmla_f32_x(pred_full, v118, v1613, v118, 90); + svfloat32_t zero235 = svdup_n_f32(0); + svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v1613, v228, 90); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v98, v227); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v98, v227); + svfloat32_t v244 = svmul_f32_x(svptrue_b32(), v182, v1528); + svfloat32_t v256 = svmul_f32_x(svptrue_b32(), v225, v1692); + svfloat32_t v322 = svmul_f32_x(svptrue_b32(), v183, v1692); + svfloat32_t v334 = svmul_f32_x(svptrue_b32(), v226, v1694); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v397, v440); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v397, v440); + svfloat32_t v469 = svcmla_f32_x(pred_full, v449, v1737, v449, 90); + svfloat32_t v470 = svcmla_f32_x(pred_full, v461, v1613, v461, 90); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v524, v567); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v524, v567); + svfloat32_t v596 = svcmla_f32_x(pred_full, v576, v1737, v576, 90); + svfloat32_t v597 = svcmla_f32_x(pred_full, v588, v1613, v588, 90); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v441, v568); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v441, v568); + svfloat32_t v857 = svmul_f32_x(svptrue_b32(), v442, v1610); + svfloat32_t v869 = svmul_f32_x(svptrue_b32(), v569, v1612); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v126, v127); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v127, v126); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v99, v235); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v99, v235); + svfloat32_t v264 = svcmla_f32_x(pred_full, v244, v1529, v182, 90); + svfloat32_t v265 = svcmla_f32_x(pred_full, v256, v1693, 
v225, 90); + svfloat32_t v303 = svcmla_f32_x(pred_full, v283, v1737, v283, 90); + svfloat32_t v304 = svcmla_f32_x(pred_full, v295, v1613, v295, 90); + svfloat32_t v342 = svcmla_f32_x(pred_full, v322, v1693, v183, 90); + svfloat32_t v343 = svcmla_f32_x(pred_full, v334, v1695, v226, 90); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v469, v470); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v470, v469); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v596, v597); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v597, v596); + svfloat32_t zero619 = svdup_n_f32(0); + svfloat32_t v619 = svcmla_f32_x(pred_full, zero619, v1613, v612, 90); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v236, v611); + svfloat32_t v621 = svsub_f32_x(svptrue_b32(), v236, v611); + svfloat32_t v723 = svmul_f32_x(svptrue_b32(), v443, v1528); + svfloat32_t v735 = svmul_f32_x(svptrue_b32(), v570, v1692); + svfloat32_t v991 = svmul_f32_x(svptrue_b32(), v444, v1692); + svfloat32_t v1003 = svmul_f32_x(svptrue_b32(), v571, v1694); + svfloat32_t zero136 = svdup_n_f32(0); + svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v1737, v129, 90); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v55, v128); + svfloat32_t v138 = svsub_f32_x(svptrue_b32(), v55, v128); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v264, v265); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v265, v264); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v303, v304); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v304, v303); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v342, v343); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v343, v342); + svfloat32_t zero479 = svdup_n_f32(0); + svfloat32_t v479 = svcmla_f32_x(pred_full, zero479, v1737, v472, 90); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v398, v471); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v398, v471); + svfloat32_t zero606 = svdup_n_f32(0); + svfloat32_t v606 = svcmla_f32_x(pred_full, zero606, v1737, v599, 90); + svfloat32_t v607 = 
svadd_f32_x(svptrue_b32(), v525, v598); + svfloat32_t v608 = svsub_f32_x(svptrue_b32(), v525, v598); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v237, v619); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v237, v619); + svfloat32_t v743 = svcmla_f32_x(pred_full, v723, v1529, v443, 90); + svfloat32_t v744 = svcmla_f32_x(pred_full, v735, v1693, v570, 90); + svfloat32_t v877 = svcmla_f32_x(pred_full, v857, v1737, v857, 90); + svfloat32_t v878 = svcmla_f32_x(pred_full, v869, v1613, v869, 90); + svfloat32_t v1011 = svcmla_f32_x(pred_full, v991, v1693, v444, 90); + svfloat32_t v1012 = svcmla_f32_x(pred_full, v1003, v1695, v571, 90); + svst1_f64(pred_full, (double *)(v1457), svreinterpret_f64_f32(v620)); + svst1_f64(pred_full, (double *)(v1475), svreinterpret_f64_f32(v621)); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v56, v136); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v56, v136); + svfloat32_t zero274 = svdup_n_f32(0); + svfloat32_t v274 = svcmla_f32_x(pred_full, zero274, v1737, v267, 90); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v137, v266); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v137, v266); + svfloat32_t zero313 = svdup_n_f32(0); + svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v1737, v306, 90); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v100, v305); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v100, v305); + svfloat32_t zero352 = svdup_n_f32(0); + svfloat32_t v352 = svcmla_f32_x(pred_full, zero352, v1737, v345, 90); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v399, v479); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v399, v479); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v526, v606); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v526, v606); + svfloat32_t v656 = svmul_f32_x(svptrue_b32(), v480, v1487); + svfloat32_t v668 = svmul_f32_x(svptrue_b32(), v607, v1569); + svfloat32_t v745 = svadd_f32_x(svptrue_b32(), v743, v744); + svfloat32_t v746 = svsub_f32_x(svptrue_b32(), v744, v743); + svfloat32_t v879 = 
svadd_f32_x(svptrue_b32(), v877, v878); + svfloat32_t v880 = svsub_f32_x(svptrue_b32(), v878, v877); + svfloat32_t v924 = svmul_f32_x(svptrue_b32(), v481, v1651); + svfloat32_t v936 = svmul_f32_x(svptrue_b32(), v608, v1653); + svfloat32_t v1013 = svadd_f32_x(svptrue_b32(), v1011, v1012); + svfloat32_t v1014 = svsub_f32_x(svptrue_b32(), v1012, v1011); + svst1_f64(pred_full, (double *)(v1466), svreinterpret_f64_f32(v622)); + svst1_f64(pred_full, (double *)(v1484), svreinterpret_f64_f32(v623)); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v138, v274); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v138, v274); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v101, v313); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v101, v313); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v139, v344); + svfloat32_t v354 = svsub_f32_x(svptrue_b32(), v139, v344); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v140, v352); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v140, v352); + svfloat32_t v676 = svcmla_f32_x(pred_full, v656, v1654, v480, 90); + svfloat32_t v677 = svcmla_f32_x(pred_full, v668, v1570, v607, 90); + svfloat32_t zero753 = svdup_n_f32(0); + svfloat32_t v753 = svcmla_f32_x(pred_full, zero753, v1737, v746, 90); + svfloat32_t v754 = svadd_f32_x(svptrue_b32(), v314, v745); + svfloat32_t v755 = svsub_f32_x(svptrue_b32(), v314, v745); + svfloat32_t v790 = svmul_f32_x(svptrue_b32(), v482, v1569); + svfloat32_t v802 = svmul_f32_x(svptrue_b32(), v609, v1571); + svfloat32_t zero887 = svdup_n_f32(0); + svfloat32_t v887 = svcmla_f32_x(pred_full, zero887, v1737, v880, 90); + svfloat32_t v888 = svadd_f32_x(svptrue_b32(), v238, v879); + svfloat32_t v889 = svsub_f32_x(svptrue_b32(), v238, v879); + svfloat32_t v944 = svcmla_f32_x(pred_full, v924, v1652, v481, 90); + svfloat32_t v945 = svcmla_f32_x(pred_full, v936, v1654, v608, 90); + svfloat32_t zero1021 = svdup_n_f32(0); + svfloat32_t v1021 = svcmla_f32_x(pred_full, zero1021, v1737, v1014, 90); + svfloat32_t v1058 = 
svmul_f32_x(svptrue_b32(), v483, v1733); + svfloat32_t v1070 = svmul_f32_x(svptrue_b32(), v610, v1735); + svfloat32_t v678 = svadd_f32_x(svptrue_b32(), v676, v677); + svfloat32_t v679 = svsub_f32_x(svptrue_b32(), v677, v676); + svfloat32_t v756 = svsub_f32_x(svptrue_b32(), v315, v753); + svfloat32_t v757 = svadd_f32_x(svptrue_b32(), v315, v753); + svfloat32_t v810 = svcmla_f32_x(pred_full, v790, v1570, v482, 90); + svfloat32_t v811 = svcmla_f32_x(pred_full, v802, v1734, v609, 90); + svfloat32_t v890 = svsub_f32_x(svptrue_b32(), v239, v887); + svfloat32_t v891 = svadd_f32_x(svptrue_b32(), v239, v887); + svfloat32_t v946 = svadd_f32_x(svptrue_b32(), v944, v945); + svfloat32_t v947 = svsub_f32_x(svptrue_b32(), v945, v944); + svfloat32_t v1022 = svadd_f32_x(svptrue_b32(), v316, v1013); + svfloat32_t v1023 = svsub_f32_x(svptrue_b32(), v316, v1013); + svfloat32_t v1024 = svsub_f32_x(svptrue_b32(), v317, v1021); + svfloat32_t v1025 = svadd_f32_x(svptrue_b32(), v317, v1021); + svfloat32_t v1078 = svcmla_f32_x(pred_full, v1058, v1734, v483, 90); + svfloat32_t v1079 = svcmla_f32_x(pred_full, v1070, v1736, v610, 90); + svst1_f64(pred_full, (double *)(v1539), svreinterpret_f64_f32(v754)); + svst1_f64(pred_full, (double *)(v1557), svreinterpret_f64_f32(v755)); + svst1_f64(pred_full, (double *)(v1621), svreinterpret_f64_f32(v888)); + svst1_f64(pred_full, (double *)(v1639), svreinterpret_f64_f32(v889)); + svfloat32_t zero686 = svdup_n_f32(0); + svfloat32_t v686 = svcmla_f32_x(pred_full, zero686, v1737, v679, 90); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v275, v678); + svfloat32_t v688 = svsub_f32_x(svptrue_b32(), v275, v678); + svfloat32_t v812 = svadd_f32_x(svptrue_b32(), v810, v811); + svfloat32_t v813 = svsub_f32_x(svptrue_b32(), v811, v810); + svfloat32_t zero954 = svdup_n_f32(0); + svfloat32_t v954 = svcmla_f32_x(pred_full, zero954, v1737, v947, 90); + svfloat32_t v955 = svadd_f32_x(svptrue_b32(), v277, v946); + svfloat32_t v956 = svsub_f32_x(svptrue_b32(), v277, 
v946); + svfloat32_t v1080 = svadd_f32_x(svptrue_b32(), v1078, v1079); + svfloat32_t v1081 = svsub_f32_x(svptrue_b32(), v1079, v1078); + svst1_f64(pred_full, (double *)(v1548), svreinterpret_f64_f32(v756)); + svst1_f64(pred_full, (double *)(v1566), svreinterpret_f64_f32(v757)); + svst1_f64(pred_full, (double *)(v1630), svreinterpret_f64_f32(v890)); + svst1_f64(pred_full, (double *)(v1648), svreinterpret_f64_f32(v891)); + svst1_f64(pred_full, (double *)(v1703), svreinterpret_f64_f32(v1022)); + svst1_f64(pred_full, (double *)(v1712), svreinterpret_f64_f32(v1024)); + svst1_f64(pred_full, (double *)(v1721), svreinterpret_f64_f32(v1023)); + svst1_f64(pred_full, (double *)(v1730), svreinterpret_f64_f32(v1025)); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v276, v686); + svfloat32_t v690 = svadd_f32_x(svptrue_b32(), v276, v686); + svfloat32_t zero820 = svdup_n_f32(0); + svfloat32_t v820 = svcmla_f32_x(pred_full, zero820, v1737, v813, 90); + svfloat32_t v821 = svadd_f32_x(svptrue_b32(), v353, v812); + svfloat32_t v822 = svsub_f32_x(svptrue_b32(), v353, v812); + svfloat32_t v957 = svsub_f32_x(svptrue_b32(), v278, v954); + svfloat32_t v958 = svadd_f32_x(svptrue_b32(), v278, v954); + svfloat32_t zero1088 = svdup_n_f32(0); + svfloat32_t v1088 = svcmla_f32_x(pred_full, zero1088, v1737, v1081, 90); + svfloat32_t v1089 = svadd_f32_x(svptrue_b32(), v355, v1080); + svfloat32_t v1090 = svsub_f32_x(svptrue_b32(), v355, v1080); + svst1_f64(pred_full, (double *)(v1498), svreinterpret_f64_f32(v687)); + svst1_f64(pred_full, (double *)(v1516), svreinterpret_f64_f32(v688)); + svst1_f64(pred_full, (double *)(v1662), svreinterpret_f64_f32(v955)); + svst1_f64(pred_full, (double *)(v1680), svreinterpret_f64_f32(v956)); + svfloat32_t v823 = svsub_f32_x(svptrue_b32(), v354, v820); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v354, v820); + svfloat32_t v1091 = svsub_f32_x(svptrue_b32(), v356, v1088); + svfloat32_t v1092 = svadd_f32_x(svptrue_b32(), v356, v1088); + svst1_f64(pred_full, 
(double *)(v1507), svreinterpret_f64_f32(v689)); + svst1_f64(pred_full, (double *)(v1525), svreinterpret_f64_f32(v690)); + svst1_f64(pred_full, (double *)(v1580), svreinterpret_f64_f32(v821)); + svst1_f64(pred_full, (double *)(v1598), svreinterpret_f64_f32(v822)); + svst1_f64(pred_full, (double *)(v1671), svreinterpret_f64_f32(v957)); + svst1_f64(pred_full, (double *)(v1689), svreinterpret_f64_f32(v958)); + svst1_f64(pred_full, (double *)(v1744), svreinterpret_f64_f32(v1089)); + svst1_f64(pred_full, (double *)(v1762), svreinterpret_f64_f32(v1090)); + svst1_f64(pred_full, (double *)(v1589), svreinterpret_f64_f32(v823)); + svst1_f64(pred_full, (double *)(v1607), svreinterpret_f64_f32(v824)); + svst1_f64(pred_full, (double *)(v1753), svreinterpret_f64_f32(v1091)); + svst1_f64(pred_full, (double *)(v1771), svreinterpret_f64_f32(v1092)); +} +#endif diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uun.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uun.h new file mode 100644 index 0000000000000000000000000000000000000000..62f2af348979867376a0331df07e29f07c46fa03 --- /dev/null +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uun.h @@ -0,0 +1,48 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "fft_helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(cf32_cf32_cf32_ac_n_uun_fft_t)(const armral_cmplx_f32_t *x, + armral_cmplx_f32_t *y, int istride, + int ostride, int howmany, + float dir); + +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun2; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun3; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun4; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun5; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun6; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun7; 
+cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun8; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun9; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun10; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun11; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun12; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun13; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun14; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun15; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun16; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun17; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun18; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun19; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun20; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun21; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun22; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun24; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun25; +cf32_cf32_cf32_ac_n_uun_fft_t armral_fft_cf32_cf32_cf32_ac_n_uun32; + +#ifdef __cplusplus +} // extern "C" +#endif \ No newline at end of file diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c index 2f58b4cfaf0cd876bbc4a7e4a438a1f35491c565..627629d204eb3b6c63c389d9307716aa6eda749b 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_cf32_cf32_ac_t_uu.h" @@ -9,1170 +11,6 @@ #include #endif -#ifndef ARMRAL_ARCH_SVE -void 
armral_fft_cf32_cf32_cf32_ac_t_uu2(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v85 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - const float32x2_t *v153 = &v5[istride]; - float32x2_t *v182 = &v6[ostride]; - const float32x2_t *v163 = &v5[0]; - float32x2_t *v173 = &v6[0]; - float32x4_t v186 = vld1q_f32((const float32_t *)v153); - float32x4_t v42 = vtrn1q_f32(v186, v186); - float32x4_t v43 = vtrn2q_f32(v186, v186); - float32x4_t v47 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v49 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v188 = vld1q_f32((const float32_t *)v163); - float32x4_t v48 = vmulq_f32(v42, v47); - float32x4_t v51 = vfmaq_f32(v48, v43, v49); - float32x4_t v59 = vaddq_f32(v188, v51); - float32x4_t v60 = vsubq_f32(v188, v51); - vst1q_f32((float32_t *)v173, v59); - vst1q_f32((float32_t *)v182, v60); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v85 * 2; j < howmany; j += 1) { - float32x2_t v97 = v5[istride]; - float32x2_t v109 = v7[0]; - float32x2_t v110 = vtrn1_f32(v97, v97); - float32x2_t v111 = vtrn2_f32(v97, v97); - float32x2_t v114 = v7[1]; - float32x2_t v122 = v5[0]; - float32x2_t v115 = vmul_f32(v110, v109); - float32x2_t v117 = vfma_f32(v115, v111, v114); - float32x2_t v123 = vadd_f32(v122, v117); - float32x2_t v124 = vsub_f32(v122, v117); - v6[0] = v123; - v6[ostride] = v124; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu2(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = 
istride; - int64_t v2 = ostride; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - const float32x2_t *v72 = &v5[v0]; - float32x2_t *v103 = &v6[v2]; - float32x2_t v30 = v7[0]; - const float32x2_t *v82 = &v5[0]; - float32x2_t *v94 = &v6[0]; - svfloat32_t v107 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v72)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v109 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v82)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); - svfloat32_t v32 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v107, v31, 0), - v107, v31, 90); - svfloat32_t v40; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v40) : "w"(v109), "w"(v32)); - svfloat32_t v41; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v109), "w"(v32)); - svst1_f64(pred_full, (double *)(v94), svreinterpret_f64_f32(v40)); - svst1_f64(pred_full, (double *)(v103), svreinterpret_f64_f32(v41)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu3(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v135 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v99 = -1.4999999999999998e+00F; - float v103 = 8.6602540378443871e-01F; - float v104 = -8.6602540378443871e-01F; - float32x2_t v106 = (float32x2_t){v4, v4}; - const float32x2_t *v244 = 
&v5[istride]; - float32x2_t *v284 = &v6[ostride]; - float32x2_t v100 = (float32x2_t){v99, v99}; - float32x2_t v105 = (float32x2_t){v103, v104}; - const float32x2_t *v265 = &v5[0]; - float32x2_t *v275 = &v6[0]; - float32x4_t v297 = vld1q_f32((const float32_t *)v244); - float32x4_t v61 = vtrn1q_f32(v297, v297); - float32x4_t v62 = vtrn2q_f32(v297, v297); - float32x4_t v66 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v68 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v78 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); - float32x4_t v80 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[3])); - float32x4_t v101 = vcombine_f32(v100, v100); - float32x2_t v107 = vmul_f32(v106, v105); - const float32x2_t *v253 = &v5[istride * 2]; - float32x2_t *v293 = &v6[ostride * 2]; - float32x4_t v301 = vld1q_f32((const float32_t *)v265); - float32x4_t v67 = vmulq_f32(v61, v66); - float32x4_t v109 = vcombine_f32(v107, v107); - float32x4_t v299 = vld1q_f32((const float32_t *)v253); - float32x4_t v70 = vfmaq_f32(v67, v62, v68); - float32x4_t v73 = vtrn1q_f32(v299, v299); - float32x4_t v74 = vtrn2q_f32(v299, v299); - float32x4_t v79 = vmulq_f32(v73, v78); - float32x4_t v82 = vfmaq_f32(v79, v74, v80); - float32x4_t v83 = vaddq_f32(v70, v82); - float32x4_t v84 = vsubq_f32(v70, v82); - float32x4_t v92 = vaddq_f32(v83, v301); - float32x4_t v102 = vmulq_f32(v83, v101); - float32x4_t v108 = vrev64q_f32(v84); - float32x4_t v110 = vmulq_f32(v108, v109); - float32x4_t v111 = vaddq_f32(v92, v102); - vst1q_f32((float32_t *)v275, v92); - float32x4_t v112 = vaddq_f32(v111, v110); - float32x4_t v113 = vsubq_f32(v111, v110); - vst1q_f32((float32_t *)v284, v113); - vst1q_f32((float32_t *)v293, v112); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v135 * 2; j < howmany; j += 1) { - float32x2_t v147 = v5[istride]; - float v206 = -1.4999999999999998e+00F; - float v209 = 8.6602540378443871e-01F; - float 
v210 = -8.6602540378443871e-01F; - float32x2_t v212 = (float32x2_t){v4, v4}; - float32x2_t v174 = v7[0]; - float32x2_t v175 = vtrn1_f32(v147, v147); - float32x2_t v176 = vtrn2_f32(v147, v147); - float32x2_t v179 = v7[1]; - float32x2_t v184 = v7[2]; - float32x2_t v189 = v7[3]; - float32x2_t v199 = v5[0]; - float32x2_t v207 = (float32x2_t){v206, v206}; - float32x2_t v211 = (float32x2_t){v209, v210}; - float32x2_t v162 = v5[istride * 2]; - float32x2_t v180 = vmul_f32(v175, v174); - float32x2_t v213 = vmul_f32(v212, v211); - float32x2_t v182 = vfma_f32(v180, v176, v179); - float32x2_t v185 = vtrn1_f32(v162, v162); - float32x2_t v186 = vtrn2_f32(v162, v162); - float32x2_t v190 = vmul_f32(v185, v184); - float32x2_t v192 = vfma_f32(v190, v186, v189); - float32x2_t v193 = vadd_f32(v182, v192); - float32x2_t v194 = vsub_f32(v182, v192); - float32x2_t v200 = vadd_f32(v193, v199); - float32x2_t v208 = vmul_f32(v193, v207); - float32x2_t v214 = vrev64_f32(v194); - float32x2_t v215 = vmul_f32(v214, v213); - float32x2_t v216 = vadd_f32(v200, v208); - v6[0] = v200; - float32x2_t v217 = vadd_f32(v216, v215); - float32x2_t v218 = vsub_f32(v216, v215); - v6[ostride] = v218; - v6[ostride * 2] = v217; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu3(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v64 = -1.4999999999999998e+00F; - float v69 = -8.6602540378443871e-01F; - const float32x2_t *v105 = &v5[v0]; - 
float32x2_t *v146 = &v6[v2]; - int64_t v30 = v0 * 2; - float32x2_t v41 = v7[0]; - float32x2_t v45 = v7[1]; - float v72 = v4 * v69; - int64_t v93 = v2 * 2; - const float32x2_t *v124 = &v5[0]; - svfloat32_t v128 = svdup_n_f32(v64); - float32x2_t *v137 = &v6[0]; - svfloat32_t v159 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v105)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - const float32x2_t *v114 = &v5[v30]; - svfloat32_t v129 = svdup_n_f32(v72); - float32x2_t *v155 = &v6[v93]; - svfloat32_t v163 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v124)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); - svfloat32_t v43 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v159, v42, 0), - v159, v42, 90); - svfloat32_t v161 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v114)[0])); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); - svfloat32_t v47 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v161, v46, 0), - v161, v46, 90); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v43), "w"(v47)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v43), "w"(v47)); - svfloat32_t v57; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v48), "w"(v163)); - svfloat32_t zero74; - asm volatile("mov %0.s, #0" : "=w"(zero74)); - svfloat32_t v74 = svcmla_f32_x(pred_full, zero74, v129, v49, 90); - svfloat32_t v75 = svmla_f32_x(pred_full, v57, v48, v128); - svst1_f64(pred_full, (double *)(v137), svreinterpret_f64_f32(v57)); - svfloat32_t v76; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v76) : "w"(v75), "w"(v74)); - svfloat32_t v77; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v75), "w"(v74)); - svst1_f64(pred_full, (double *)(v146), svreinterpret_f64_f32(v77)); - svst1_f64(pred_full, (double *)(v155), svreinterpret_f64_f32(v76)); - v5 += v11; - v6 += 
v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu4(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v180 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v142 = 1.0000000000000000e+00F; - float v143 = -1.0000000000000000e+00F; - float32x2_t v145 = (float32x2_t){v4, v4}; - const float32x2_t *v336 = &v5[istride]; - float32x2_t *v376 = &v6[ostride]; - float32x2_t v144 = (float32x2_t){v142, v143}; - const float32x2_t *v357 = &v5[0]; - float32x2_t *v367 = &v6[0]; - float32x4_t v400 = vld1q_f32((const float32_t *)v336); - float32x4_t v47 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); - float32x4_t v49 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[3])); - float32x4_t v92 = vtrn1q_f32(v400, v400); - float32x4_t v93 = vtrn2q_f32(v400, v400); - float32x4_t v97 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v99 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v109 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[4])); - float32x4_t v111 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[5])); - float32x2_t v146 = vmul_f32(v145, v144); - const float32x2_t *v325 = &v5[istride * 2]; - const float32x2_t *v345 = &v5[istride * 3]; - float32x2_t *v385 = &v6[ostride * 2]; - float32x2_t *v394 = &v6[ostride * 3]; - float32x4_t v404 = vld1q_f32((const float32_t *)v357); - float32x4_t v98 = vmulq_f32(v92, v97); - float32x4_t v148 = vcombine_f32(v146, v146); - float32x4_t v398 = vld1q_f32((const float32_t *)v325); - float32x4_t v402 = vld1q_f32((const float32_t *)v345); - float32x4_t v42 = vtrn1q_f32(v398, 
v398); - float32x4_t v43 = vtrn2q_f32(v398, v398); - float32x4_t v101 = vfmaq_f32(v98, v93, v99); - float32x4_t v104 = vtrn1q_f32(v402, v402); - float32x4_t v105 = vtrn2q_f32(v402, v402); - float32x4_t v48 = vmulq_f32(v42, v47); - float32x4_t v110 = vmulq_f32(v104, v109); - float32x4_t v51 = vfmaq_f32(v48, v43, v49); - float32x4_t v113 = vfmaq_f32(v110, v105, v111); - float32x4_t v121 = vaddq_f32(v404, v51); - float32x4_t v122 = vsubq_f32(v404, v51); - float32x4_t v123 = vaddq_f32(v101, v113); - float32x4_t v124 = vsubq_f32(v101, v113); - float32x4_t v125 = vaddq_f32(v121, v123); - float32x4_t v126 = vsubq_f32(v121, v123); - float32x4_t v147 = vrev64q_f32(v124); - float32x4_t v149 = vmulq_f32(v147, v148); - vst1q_f32((float32_t *)v367, v125); - vst1q_f32((float32_t *)v385, v126); - float32x4_t v150 = vaddq_f32(v122, v149); - float32x4_t v151 = vsubq_f32(v122, v149); - vst1q_f32((float32_t *)v376, v151); - vst1q_f32((float32_t *)v394, v150); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v180 * 2; j < howmany; j += 1) { - float32x2_t v217 = v5[istride]; - float v286 = 1.0000000000000000e+00F; - float v287 = -1.0000000000000000e+00F; - float32x2_t v289 = (float32x2_t){v4, v4}; - float32x2_t v204 = v7[2]; - float32x2_t v209 = v7[3]; - float32x2_t v244 = v7[0]; - float32x2_t v245 = vtrn1_f32(v217, v217); - float32x2_t v246 = vtrn2_f32(v217, v217); - float32x2_t v249 = v7[1]; - float32x2_t v254 = v7[4]; - float32x2_t v259 = v7[5]; - float32x2_t v267 = v5[0]; - float32x2_t v288 = (float32x2_t){v286, v287}; - float32x2_t v192 = v5[istride * 2]; - float32x2_t v232 = v5[istride * 3]; - float32x2_t v250 = vmul_f32(v245, v244); - float32x2_t v290 = vmul_f32(v289, v288); - float32x2_t v205 = vtrn1_f32(v192, v192); - float32x2_t v206 = vtrn2_f32(v192, v192); - float32x2_t v252 = vfma_f32(v250, v246, v249); - float32x2_t v255 = vtrn1_f32(v232, v232); - float32x2_t v256 = vtrn2_f32(v232, v232); - float32x2_t v210 = vmul_f32(v205, v204); - float32x2_t v260 = vmul_f32(v255, 
v254); - float32x2_t v212 = vfma_f32(v210, v206, v209); - float32x2_t v262 = vfma_f32(v260, v256, v259); - float32x2_t v268 = vadd_f32(v267, v212); - float32x2_t v269 = vsub_f32(v267, v212); - float32x2_t v270 = vadd_f32(v252, v262); - float32x2_t v271 = vsub_f32(v252, v262); - float32x2_t v272 = vadd_f32(v268, v270); - float32x2_t v273 = vsub_f32(v268, v270); - float32x2_t v291 = vrev64_f32(v271); - float32x2_t v292 = vmul_f32(v291, v290); - v6[0] = v272; - v6[ostride * 2] = v273; - float32x2_t v293 = vadd_f32(v269, v292); - float32x2_t v294 = vsub_f32(v269, v292); - v6[ostride] = v294; - v6[ostride * 3] = v293; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu4(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v92 = -1.0000000000000000e+00F; - const float32x2_t *v143 = &v5[v0]; - float32x2_t *v185 = &v6[v2]; - int64_t v19 = v0 * 2; - float32x2_t v30 = v7[1]; - int64_t v45 = v0 * 3; - float32x2_t v56 = v7[0]; - float32x2_t v60 = v7[2]; - float v95 = v4 * v92; - int64_t v115 = v2 * 2; - int64_t v122 = v2 * 3; - const float32x2_t *v162 = &v5[0]; - float32x2_t *v176 = &v6[0]; - svfloat32_t v209 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v143)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - const float32x2_t *v134 = 
&v5[v19]; - const float32x2_t *v152 = &v5[v45]; - svfloat32_t v168 = svdup_n_f32(v95); - float32x2_t *v194 = &v6[v115]; - float32x2_t *v203 = &v6[v122]; - svfloat32_t v213 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v162)[0])); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); - svfloat32_t v58 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v209, v57, 0), - v209, v57, 90); - svfloat32_t v207 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v134)[0])); - svfloat32_t v211 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v152)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); - svfloat32_t v32 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v207, v31, 0), - v207, v31, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); - svfloat32_t v62 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v211, v61, 0), - v211, v61, 90); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v213), "w"(v32)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v213), "w"(v32)); - svfloat32_t v72; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v72) : "w"(v58), "w"(v62)); - svfloat32_t v73; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v73) : "w"(v58), "w"(v62)); - svfloat32_t v74; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v74) : "w"(v70), "w"(v72)); - svfloat32_t v75; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v70), "w"(v72)); - svfloat32_t zero97; - asm volatile("mov %0.s, #0" : "=w"(zero97)); - svfloat32_t v97 = svcmla_f32_x(pred_full, zero97, v168, v73, 90); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v71), "w"(v97)); - svfloat32_t v99; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v71), "w"(v97)); - svst1_f64(pred_full, (double *)(v176), svreinterpret_f64_f32(v74)); - svst1_f64(pred_full, (double *)(v194), svreinterpret_f64_f32(v75)); - svst1_f64(pred_full, (double *)(v185), 
svreinterpret_f64_f32(v99)); - svst1_f64(pred_full, (double *)(v203), svreinterpret_f64_f32(v98)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu5(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v243 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v166 = -1.2500000000000000e+00F; - float v171 = 5.5901699437494745e-01F; - float v175 = 1.5388417685876268e+00F; - float v176 = -1.5388417685876268e+00F; - float v183 = 5.8778525229247325e-01F; - float v184 = -5.8778525229247325e-01F; - float v191 = 3.6327126400268028e-01F; - float v192 = -3.6327126400268028e-01F; - float32x2_t v194 = (float32x2_t){v4, v4}; - const float32x2_t *v441 = &v5[istride]; - float32x2_t *v503 = &v6[ostride]; - float32x2_t v167 = (float32x2_t){v166, v166}; - float32x2_t v172 = (float32x2_t){v171, v171}; - float32x2_t v177 = (float32x2_t){v175, v176}; - float32x2_t v185 = (float32x2_t){v183, v184}; - float32x2_t v193 = (float32x2_t){v191, v192}; - const float32x2_t *v484 = &v5[0]; - float32x2_t *v494 = &v6[0]; - float32x4_t v534 = vld1q_f32((const float32_t *)v441); - float32x4_t v61 = vtrn1q_f32(v534, v534); - float32x4_t v62 = vtrn2q_f32(v534, v534); - float32x4_t v66 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v68 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v78 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[6])); - float32x4_t v80 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[7])); - float32x4_t v128 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[4])); - float32x4_t v130 = - 
vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[5])); - float32x4_t v140 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); - float32x4_t v142 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[3])); - float32x4_t v168 = vcombine_f32(v167, v167); - float32x4_t v173 = vcombine_f32(v172, v172); - float32x2_t v179 = vmul_f32(v194, v177); - float32x2_t v187 = vmul_f32(v194, v185); - float32x2_t v195 = vmul_f32(v194, v193); - const float32x2_t *v450 = &v5[istride * 4]; - const float32x2_t *v461 = &v5[istride * 3]; - const float32x2_t *v471 = &v5[istride * 2]; - float32x2_t *v512 = &v6[ostride * 2]; - float32x2_t *v521 = &v6[ostride * 3]; - float32x2_t *v530 = &v6[ostride * 4]; - float32x4_t v542 = vld1q_f32((const float32_t *)v484); - float32x4_t v67 = vmulq_f32(v61, v66); - float32x4_t v181 = vcombine_f32(v179, v179); - float32x4_t v189 = vcombine_f32(v187, v187); - float32x4_t v197 = vcombine_f32(v195, v195); - float32x4_t v536 = vld1q_f32((const float32_t *)v450); - float32x4_t v538 = vld1q_f32((const float32_t *)v461); - float32x4_t v540 = vld1q_f32((const float32_t *)v471); - float32x4_t v70 = vfmaq_f32(v67, v62, v68); - float32x4_t v73 = vtrn1q_f32(v536, v536); - float32x4_t v74 = vtrn2q_f32(v536, v536); - float32x4_t v123 = vtrn1q_f32(v538, v538); - float32x4_t v124 = vtrn2q_f32(v538, v538); - float32x4_t v135 = vtrn1q_f32(v540, v540); - float32x4_t v136 = vtrn2q_f32(v540, v540); - float32x4_t v79 = vmulq_f32(v73, v78); - float32x4_t v129 = vmulq_f32(v123, v128); - float32x4_t v141 = vmulq_f32(v135, v140); - float32x4_t v82 = vfmaq_f32(v79, v74, v80); - float32x4_t v132 = vfmaq_f32(v129, v124, v130); - float32x4_t v144 = vfmaq_f32(v141, v136, v142); - float32x4_t v145 = vaddq_f32(v70, v82); - float32x4_t v146 = vsubq_f32(v70, v82); - float32x4_t v147 = vaddq_f32(v132, v144); - float32x4_t v148 = vsubq_f32(v132, v144); - float32x4_t v149 = vaddq_f32(v145, v147); - float32x4_t v150 = vsubq_f32(v145, v147); - float32x4_t v151 = 
vaddq_f32(v146, v148); - float32x4_t v180 = vrev64q_f32(v146); - float32x4_t v196 = vrev64q_f32(v148); - float32x4_t v159 = vaddq_f32(v149, v542); - float32x4_t v169 = vmulq_f32(v149, v168); - float32x4_t v174 = vmulq_f32(v150, v173); - float32x4_t v182 = vmulq_f32(v180, v181); - float32x4_t v188 = vrev64q_f32(v151); - float32x4_t v198 = vmulq_f32(v196, v197); - float32x4_t v190 = vmulq_f32(v188, v189); - float32x4_t v199 = vaddq_f32(v159, v169); - vst1q_f32((float32_t *)v494, v159); - float32x4_t v200 = vaddq_f32(v199, v174); - float32x4_t v201 = vsubq_f32(v199, v174); - float32x4_t v202 = vsubq_f32(v182, v190); - float32x4_t v203 = vaddq_f32(v190, v198); - float32x4_t v204 = vaddq_f32(v200, v202); - float32x4_t v205 = vsubq_f32(v200, v202); - float32x4_t v206 = vaddq_f32(v201, v203); - float32x4_t v207 = vsubq_f32(v201, v203); - vst1q_f32((float32_t *)v503, v205); - vst1q_f32((float32_t *)v512, v207); - vst1q_f32((float32_t *)v521, v206); - vst1q_f32((float32_t *)v530, v204); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v243 * 2; j < howmany; j += 1) { - float32x2_t v255 = v5[istride]; - float v369 = -1.2500000000000000e+00F; - float v373 = 5.5901699437494745e-01F; - float v376 = 1.5388417685876268e+00F; - float v377 = -1.5388417685876268e+00F; - float v383 = 5.8778525229247325e-01F; - float v384 = -5.8778525229247325e-01F; - float v390 = 3.6327126400268028e-01F; - float v391 = -3.6327126400268028e-01F; - float32x2_t v393 = (float32x2_t){v4, v4}; - float32x2_t v282 = v7[0]; - float32x2_t v283 = vtrn1_f32(v255, v255); - float32x2_t v284 = vtrn2_f32(v255, v255); - float32x2_t v287 = v7[1]; - float32x2_t v292 = v7[6]; - float32x2_t v297 = v7[7]; - float32x2_t v332 = v7[4]; - float32x2_t v337 = v7[5]; - float32x2_t v342 = v7[2]; - float32x2_t v347 = v7[3]; - float32x2_t v362 = v5[0]; - float32x2_t v370 = (float32x2_t){v369, v369}; - float32x2_t v374 = (float32x2_t){v373, v373}; - float32x2_t v378 = (float32x2_t){v376, v377}; - float32x2_t v385 = 
(float32x2_t){v383, v384}; - float32x2_t v392 = (float32x2_t){v390, v391}; - float32x2_t v270 = v5[istride * 4]; - float32x2_t v288 = vmul_f32(v283, v282); - float32x2_t v305 = v5[istride * 3]; - float32x2_t v320 = v5[istride * 2]; - float32x2_t v380 = vmul_f32(v393, v378); - float32x2_t v387 = vmul_f32(v393, v385); - float32x2_t v394 = vmul_f32(v393, v392); - float32x2_t v290 = vfma_f32(v288, v284, v287); - float32x2_t v293 = vtrn1_f32(v270, v270); - float32x2_t v294 = vtrn2_f32(v270, v270); - float32x2_t v333 = vtrn1_f32(v305, v305); - float32x2_t v334 = vtrn2_f32(v305, v305); - float32x2_t v343 = vtrn1_f32(v320, v320); - float32x2_t v344 = vtrn2_f32(v320, v320); - float32x2_t v298 = vmul_f32(v293, v292); - float32x2_t v338 = vmul_f32(v333, v332); - float32x2_t v348 = vmul_f32(v343, v342); - float32x2_t v300 = vfma_f32(v298, v294, v297); - float32x2_t v340 = vfma_f32(v338, v334, v337); - float32x2_t v350 = vfma_f32(v348, v344, v347); - float32x2_t v351 = vadd_f32(v290, v300); - float32x2_t v352 = vsub_f32(v290, v300); - float32x2_t v353 = vadd_f32(v340, v350); - float32x2_t v354 = vsub_f32(v340, v350); - float32x2_t v355 = vadd_f32(v351, v353); - float32x2_t v356 = vsub_f32(v351, v353); - float32x2_t v357 = vadd_f32(v352, v354); - float32x2_t v381 = vrev64_f32(v352); - float32x2_t v395 = vrev64_f32(v354); - float32x2_t v363 = vadd_f32(v355, v362); - float32x2_t v371 = vmul_f32(v355, v370); - float32x2_t v375 = vmul_f32(v356, v374); - float32x2_t v382 = vmul_f32(v381, v380); - float32x2_t v388 = vrev64_f32(v357); - float32x2_t v396 = vmul_f32(v395, v394); - float32x2_t v389 = vmul_f32(v388, v387); - float32x2_t v397 = vadd_f32(v363, v371); - v6[0] = v363; - float32x2_t v398 = vadd_f32(v397, v375); - float32x2_t v399 = vsub_f32(v397, v375); - float32x2_t v400 = vsub_f32(v382, v389); - float32x2_t v401 = vadd_f32(v389, v396); - float32x2_t v402 = vadd_f32(v398, v400); - float32x2_t v403 = vsub_f32(v398, v400); - float32x2_t v404 = vadd_f32(v399, v401); - float32x2_t 
v405 = vsub_f32(v399, v401); - v6[ostride] = v403; - v6[ostride * 2] = v405; - v6[ostride * 3] = v404; - v6[ostride * 4] = v402; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu5(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v99 = -1.2500000000000000e+00F; - float v104 = 5.5901699437494745e-01F; - float v109 = -1.5388417685876268e+00F; - float v116 = -5.8778525229247325e-01F; - float v123 = -3.6327126400268028e-01F; - const float32x2_t *v179 = &v5[v0]; - float32x2_t *v241 = &v6[v2]; - int64_t v30 = v0 * 4; - float32x2_t v41 = v7[0]; - float32x2_t v45 = v7[3]; - int64_t v49 = v0 * 3; - int64_t v60 = v0 * 2; - float32x2_t v71 = v7[2]; - float32x2_t v75 = v7[1]; - float v112 = v4 * v109; - float v119 = v4 * v116; - float v126 = v4 * v123; - int64_t v153 = v2 * 2; - int64_t v160 = v2 * 3; - int64_t v167 = v2 * 4; - const float32x2_t *v216 = &v5[0]; - svfloat32_t v220 = svdup_n_f32(v99); - svfloat32_t v221 = svdup_n_f32(v104); - float32x2_t *v232 = &v6[0]; - svfloat32_t v272 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v179)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - const float32x2_t *v188 = &v5[v30]; - const float32x2_t *v197 = &v5[v49]; - 
const float32x2_t *v206 = &v5[v60]; - svfloat32_t v222 = svdup_n_f32(v112); - svfloat32_t v223 = svdup_n_f32(v119); - svfloat32_t v224 = svdup_n_f32(v126); - float32x2_t *v250 = &v6[v153]; - float32x2_t *v259 = &v6[v160]; - float32x2_t *v268 = &v6[v167]; - svfloat32_t v280 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v216)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); - svfloat32_t v43 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v272, v42, 0), - v272, v42, 90); - svfloat32_t v274 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v188)[0])); - svfloat32_t v276 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v197)[0])); - svfloat32_t v278 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v206)[0])); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); - svfloat32_t v47 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v274, v46, 0), - v274, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); - svfloat32_t v73 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v276, v72, 0), - v276, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); - svfloat32_t v77 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v278, v76, 0), - v278, v76, 90); - svfloat32_t v78; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v78) : "w"(v43), "w"(v47)); - svfloat32_t v79; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v79) : "w"(v43), "w"(v47)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v73), "w"(v77)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v73), "w"(v77)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v78), "w"(v80)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v78), "w"(v80)); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v79), "w"(v81)); - svfloat32_t zero114; - asm volatile("mov %0.s, #0" 
: "=w"(zero114)); - svfloat32_t v114 = svcmla_f32_x(pred_full, zero114, v222, v79, 90); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v82), "w"(v280)); - svfloat32_t zero121; - asm volatile("mov %0.s, #0" : "=w"(zero121)); - svfloat32_t v121 = svcmla_f32_x(pred_full, zero121, v223, v84, 90); - svfloat32_t v129 = svmla_f32_x(pred_full, v92, v82, v220); - svfloat32_t v132; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v114), "w"(v121)); - svfloat32_t v133 = svcmla_f32_x(pred_full, v121, v224, v81, 90); - svst1_f64(pred_full, (double *)(v232), svreinterpret_f64_f32(v92)); - svfloat32_t v130 = svmla_f32_x(pred_full, v129, v83, v221); - svfloat32_t v131 = svmls_f32_x(pred_full, v129, v83, v221); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v130), "w"(v132)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v130), "w"(v132)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v131), "w"(v133)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v131), "w"(v133)); - svst1_f64(pred_full, (double *)(v241), svreinterpret_f64_f32(v135)); - svst1_f64(pred_full, (double *)(v250), svreinterpret_f64_f32(v137)); - svst1_f64(pred_full, (double *)(v259), svreinterpret_f64_f32(v136)); - svst1_f64(pred_full, (double *)(v268), svreinterpret_f64_f32(v134)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu6(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v279 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v222 = -1.4999999999999998e+00F; - float v226 = 8.6602540378443871e-01F; - float v227 = 
-8.6602540378443871e-01F; - float32x2_t v229 = (float32x2_t){v4, v4}; - const float32x2_t *v548 = &v5[istride]; - float32x2_t *v596 = &v6[ostride]; - float32x2_t v223 = (float32x2_t){v222, v222}; - float32x2_t v228 = (float32x2_t){v226, v227}; - const float32x2_t *v559 = &v5[0]; - float32x2_t *v569 = &v6[0]; - float32x4_t v626 = vld1q_f32((const float32_t *)v548); - float32x4_t v47 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[4])); - float32x4_t v49 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[5])); - float32x4_t v97 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); - float32x4_t v99 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[3])); - float32x4_t v109 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); - float32x4_t v111 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[9])); - float32x4_t v159 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[6])); - float32x4_t v161 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[7])); - float32x4_t v166 = vtrn1q_f32(v626, v626); - float32x4_t v167 = vtrn2q_f32(v626, v626); - float32x4_t v171 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v173 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v224 = vcombine_f32(v223, v223); - float32x2_t v230 = vmul_f32(v229, v228); - const float32x2_t *v505 = &v5[istride * 3]; - const float32x2_t *v516 = &v5[istride * 2]; - const float32x2_t *v526 = &v5[istride * 5]; - const float32x2_t *v538 = &v5[istride * 4]; - float32x2_t *v578 = &v6[ostride * 3]; - float32x2_t *v587 = &v6[ostride * 4]; - float32x2_t *v605 = &v6[ostride * 2]; - float32x2_t *v614 = &v6[ostride * 5]; - float32x4_t v628 = vld1q_f32((const float32_t *)v559); - float32x4_t v172 = vmulq_f32(v166, v171); - float32x4_t v232 = vcombine_f32(v230, v230); - float32x4_t v618 = vld1q_f32((const float32_t *)v505); - float32x4_t v620 = vld1q_f32((const 
float32_t *)v516); - float32x4_t v622 = vld1q_f32((const float32_t *)v526); - float32x4_t v624 = vld1q_f32((const float32_t *)v538); - float32x4_t v42 = vtrn1q_f32(v618, v618); - float32x4_t v43 = vtrn2q_f32(v618, v618); - float32x4_t v92 = vtrn1q_f32(v620, v620); - float32x4_t v93 = vtrn2q_f32(v620, v620); - float32x4_t v104 = vtrn1q_f32(v622, v622); - float32x4_t v105 = vtrn2q_f32(v622, v622); - float32x4_t v154 = vtrn1q_f32(v624, v624); - float32x4_t v155 = vtrn2q_f32(v624, v624); - float32x4_t v175 = vfmaq_f32(v172, v167, v173); - float32x4_t v48 = vmulq_f32(v42, v47); - float32x4_t v98 = vmulq_f32(v92, v97); - float32x4_t v110 = vmulq_f32(v104, v109); - float32x4_t v160 = vmulq_f32(v154, v159); - float32x4_t v51 = vfmaq_f32(v48, v43, v49); - float32x4_t v101 = vfmaq_f32(v98, v93, v99); - float32x4_t v113 = vfmaq_f32(v110, v105, v111); - float32x4_t v163 = vfmaq_f32(v160, v155, v161); - float32x4_t v183 = vaddq_f32(v628, v51); - float32x4_t v184 = vsubq_f32(v628, v51); - float32x4_t v185 = vaddq_f32(v101, v113); - float32x4_t v186 = vsubq_f32(v101, v113); - float32x4_t v187 = vaddq_f32(v163, v175); - float32x4_t v188 = vsubq_f32(v163, v175); - float32x4_t v189 = vaddq_f32(v185, v187); - float32x4_t v190 = vsubq_f32(v185, v187); - float32x4_t v213 = vaddq_f32(v186, v188); - float32x4_t v214 = vsubq_f32(v186, v188); - float32x4_t v191 = vaddq_f32(v189, v183); - float32x4_t v201 = vmulq_f32(v189, v224); - float32x4_t v207 = vrev64q_f32(v190); - float32x4_t v215 = vaddq_f32(v213, v184); - float32x4_t v225 = vmulq_f32(v213, v224); - float32x4_t v231 = vrev64q_f32(v214); - float32x4_t v209 = vmulq_f32(v207, v232); - float32x4_t v210 = vaddq_f32(v191, v201); - float32x4_t v233 = vmulq_f32(v231, v232); - float32x4_t v234 = vaddq_f32(v215, v225); - vst1q_f32((float32_t *)v569, v191); - vst1q_f32((float32_t *)v578, v215); - float32x4_t v211 = vaddq_f32(v210, v209); - float32x4_t v212 = vsubq_f32(v210, v209); - float32x4_t v235 = vaddq_f32(v234, v233); - float32x4_t v236 
= vsubq_f32(v234, v233); - vst1q_f32((float32_t *)v587, v212); - vst1q_f32((float32_t *)v596, v236); - vst1q_f32((float32_t *)v605, v211); - vst1q_f32((float32_t *)v614, v235); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v279 * 2; j < howmany; j += 1) { - float32x2_t v381 = v5[istride]; - float v452 = -1.4999999999999998e+00F; - float v455 = 8.6602540378443871e-01F; - float v456 = -8.6602540378443871e-01F; - float32x2_t v458 = (float32x2_t){v4, v4}; - float32x2_t v303 = v7[4]; - float32x2_t v308 = v7[5]; - float32x2_t v343 = v7[2]; - float32x2_t v348 = v7[3]; - float32x2_t v353 = v7[8]; - float32x2_t v358 = v7[9]; - float32x2_t v393 = v7[6]; - float32x2_t v398 = v7[7]; - float32x2_t v403 = v7[0]; - float32x2_t v404 = vtrn1_f32(v381, v381); - float32x2_t v405 = vtrn2_f32(v381, v381); - float32x2_t v408 = v7[1]; - float32x2_t v416 = v5[0]; - float32x2_t v453 = (float32x2_t){v452, v452}; - float32x2_t v457 = (float32x2_t){v455, v456}; - float32x2_t v291 = v5[istride * 3]; - float32x2_t v316 = v5[istride * 2]; - float32x2_t v331 = v5[istride * 5]; - float32x2_t v366 = v5[istride * 4]; - float32x2_t v409 = vmul_f32(v404, v403); - float32x2_t v459 = vmul_f32(v458, v457); - float32x2_t v304 = vtrn1_f32(v291, v291); - float32x2_t v305 = vtrn2_f32(v291, v291); - float32x2_t v344 = vtrn1_f32(v316, v316); - float32x2_t v345 = vtrn2_f32(v316, v316); - float32x2_t v354 = vtrn1_f32(v331, v331); - float32x2_t v355 = vtrn2_f32(v331, v331); - float32x2_t v394 = vtrn1_f32(v366, v366); - float32x2_t v395 = vtrn2_f32(v366, v366); - float32x2_t v411 = vfma_f32(v409, v405, v408); - float32x2_t v309 = vmul_f32(v304, v303); - float32x2_t v349 = vmul_f32(v344, v343); - float32x2_t v359 = vmul_f32(v354, v353); - float32x2_t v399 = vmul_f32(v394, v393); - float32x2_t v311 = vfma_f32(v309, v305, v308); - float32x2_t v351 = vfma_f32(v349, v345, v348); - float32x2_t v361 = vfma_f32(v359, v355, v358); - float32x2_t v401 = vfma_f32(v399, v395, v398); - float32x2_t v417 = vadd_f32(v416, 
v311); - float32x2_t v418 = vsub_f32(v416, v311); - float32x2_t v419 = vadd_f32(v351, v361); - float32x2_t v420 = vsub_f32(v351, v361); - float32x2_t v421 = vadd_f32(v401, v411); - float32x2_t v422 = vsub_f32(v401, v411); - float32x2_t v423 = vadd_f32(v419, v421); - float32x2_t v424 = vsub_f32(v419, v421); - float32x2_t v444 = vadd_f32(v420, v422); - float32x2_t v445 = vsub_f32(v420, v422); - float32x2_t v425 = vadd_f32(v423, v417); - float32x2_t v433 = vmul_f32(v423, v453); - float32x2_t v439 = vrev64_f32(v424); - float32x2_t v446 = vadd_f32(v444, v418); - float32x2_t v454 = vmul_f32(v444, v453); - float32x2_t v460 = vrev64_f32(v445); - float32x2_t v440 = vmul_f32(v439, v459); - float32x2_t v441 = vadd_f32(v425, v433); - float32x2_t v461 = vmul_f32(v460, v459); - float32x2_t v462 = vadd_f32(v446, v454); - v6[0] = v425; - v6[ostride * 3] = v446; - float32x2_t v442 = vadd_f32(v441, v440); - float32x2_t v443 = vsub_f32(v441, v440); - float32x2_t v463 = vadd_f32(v462, v461); - float32x2_t v464 = vsub_f32(v462, v461); - v6[ostride * 4] = v443; - v6[ostride] = v464; - v6[ostride * 2] = v442; - v6[ostride * 5] = v463; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu6(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v138 = -1.4999999999999998e+00F; - float v143 = -8.6602540378443871e-01F; - const float32x2_t *v236 = &v5[v0]; - float32x2_t *v289 = &v6[v2]; - int64_t v19 = v0 * 3; - float32x2_t 
v30 = v7[2]; - int64_t v34 = v0 * 2; - int64_t v45 = v0 * 5; - float32x2_t v56 = v7[1]; - float32x2_t v60 = v7[4]; - int64_t v64 = v0 * 4; - float32x2_t v86 = v7[3]; - float32x2_t v90 = v7[0]; - float v146 = v4 * v143; - int64_t v160 = v2 * 3; - int64_t v167 = v2 * 4; - int64_t v181 = v2 * 2; - int64_t v188 = v2 * 5; - const float32x2_t *v246 = &v5[0]; - svfloat32_t v253 = svdup_n_f32(v138); - float32x2_t *v262 = &v6[0]; - svfloat32_t v319 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v236)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - const float32x2_t *v200 = &v5[v19]; - const float32x2_t *v209 = &v5[v34]; - const float32x2_t *v218 = &v5[v45]; - const float32x2_t *v227 = &v5[v64]; - svfloat32_t v254 = svdup_n_f32(v146); - float32x2_t *v271 = &v6[v160]; - float32x2_t *v280 = &v6[v167]; - float32x2_t *v298 = &v6[v181]; - float32x2_t *v307 = &v6[v188]; - svfloat32_t v321 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v246)[0])); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); - svfloat32_t v92 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v319, v91, 0), - v319, v91, 90); - svfloat32_t v311 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v200)[0])); - svfloat32_t v313 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v209)[0])); - svfloat32_t v315 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v218)[0])); - svfloat32_t v317 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v227)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); - svfloat32_t v32 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v311, v31, 0), - 
v311, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); - svfloat32_t v58 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v313, v57, 0), - v313, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); - svfloat32_t v62 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v315, v61, 0), - v315, v61, 90); - svfloat32_t zero88; - asm volatile("mov %0.s, #0" : "=w"(zero88)); - svfloat32_t v88 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v317, v87, 0), - v317, v87, 90); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v321), "w"(v32)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v321), "w"(v32)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v58), "w"(v62)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v58), "w"(v62)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v88), "w"(v92)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v88), "w"(v92)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v102), "w"(v104)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v102), "w"(v104)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v103), "w"(v105)); - svfloat32_t v130; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v103), "w"(v105)); - svfloat32_t v108; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v106), "w"(v100)); - svfloat32_t zero125; - asm volatile("mov %0.s, #0" : "=w"(zero125)); - svfloat32_t v125 = svcmla_f32_x(pred_full, zero125, v254, v107, 90); - svfloat32_t v131; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v129), "w"(v101)); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); - svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v254, v130, 90); - svfloat32_t v126 = svmla_f32_x(pred_full, v108, v106, v253); - svfloat32_t v149 = svmla_f32_x(pred_full, v131, 
v129, v253); - svst1_f64(pred_full, (double *)(v262), svreinterpret_f64_f32(v108)); - svst1_f64(pred_full, (double *)(v271), svreinterpret_f64_f32(v131)); - svfloat32_t v127; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v126), "w"(v125)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v126), "w"(v125)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v149), "w"(v148)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v149), "w"(v148)); - svst1_f64(pred_full, (double *)(v280), svreinterpret_f64_f32(v128)); - svst1_f64(pred_full, (double *)(v289), svreinterpret_f64_f32(v151)); - svst1_f64(pred_full, (double *)(v298), svreinterpret_f64_f32(v127)); - svst1_f64(pred_full, (double *)(v307), svreinterpret_f64_f32(v150)); - v5 += v11; - v6 += v12; - } -} -#endif - #ifndef ARMRAL_ARCH_SVE void armral_fft_cf32_cf32_cf32_ac_t_uu7(const armral_cmplx_f32_t *restrict x, armral_cmplx_f32_t *restrict y, @@ -1505,16 +343,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu7(const armral_cmplx_f32_t *restrict x, const float32x2_t *v259 = &v5[v0]; float32x2_t *v342 = &v6[v2]; int64_t v30 = v0 * 6; - float32x2_t v41 = v7[0]; - float32x2_t v45 = v7[5]; int64_t v49 = v0 * 4; int64_t v60 = v0 * 3; - float32x2_t v71 = v7[3]; - float32x2_t v75 = v7[2]; int64_t v79 = v0 * 2; int64_t v90 = v0 * 5; - float32x2_t v101 = v7[1]; - float32x2_t v105 = v7[4]; float v161 = v4 * v158; float v168 = v4 * v165; float v175 = v4 * v172; @@ -1532,615 +364,119 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu7(const armral_cmplx_f32_t *restrict x, float32x2_t *v333 = &v6[0]; svfloat32_t v391 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v259)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v102; - asm("mov 
%0.d, %d1" : "=w"(v102) : "w"(v101)); - svfloat32_t v106; - asm("mov %0.d, %d1" : "=w"(v106) : "w"(v105)); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v46 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v72 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v76 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v102 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v106 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); const float32x2_t *v268 = &v5[v30]; const float32x2_t *v277 = &v5[v49]; const float32x2_t *v286 = &v5[v60]; - const float32x2_t *v295 = &v5[v79]; - const float32x2_t *v304 = &v5[v90]; - svfloat32_t v322 = svdup_n_f32(v161); - svfloat32_t v323 = svdup_n_f32(v168); - svfloat32_t v324 = svdup_n_f32(v175); - svfloat32_t v325 = svdup_n_f32(v182); - float32x2_t *v351 = &v6[v219]; - float32x2_t *v360 = &v6[v226]; - float32x2_t *v369 = &v6[v233]; - float32x2_t *v378 = &v6[v240]; - float32x2_t *v387 = &v6[v247]; - svfloat32_t v403 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v314)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); - svfloat32_t v43 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v391, v42, 0), - v391, v42, 90); - svfloat32_t v393 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v268)[0])); - svfloat32_t v395 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v277)[0])); - svfloat32_t v397 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v286)[0])); - svfloat32_t v399 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v295)[0])); - svfloat32_t v401 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v304)[0])); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); - svfloat32_t v47 = - svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero47, v393, v46, 0), - v393, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); - svfloat32_t v73 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v395, v72, 0), - v395, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); - svfloat32_t v77 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v397, v76, 0), - v397, v76, 90); - svfloat32_t zero103; - asm volatile("mov %0.s, #0" : "=w"(zero103)); - svfloat32_t v103 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero103, v399, v102, 0), - v399, v102, 90); - svfloat32_t zero107; - asm volatile("mov %0.s, #0" : "=w"(zero107)); - svfloat32_t v107 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero107, v401, v106, 0), - v401, v106, 90); - svfloat32_t v108; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v43), "w"(v47)); - svfloat32_t v109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v43), "w"(v47)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v73), "w"(v77)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v73), "w"(v77)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v103), "w"(v107)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v103), "w"(v107)); - svfloat32_t v114; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v108), "w"(v110)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v108), "w"(v110)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v110), "w"(v112)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v112), "w"(v108)); - svfloat32_t v127; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v109), "w"(v111)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v109), "w"(v111)); - svfloat32_t v130; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v111), "w"(v113)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : 
"w"(v113), "w"(v109)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v114), "w"(v112)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v127), "w"(v113)); - svfloat32_t zero170; - asm volatile("mov %0.s, #0" : "=w"(zero170)); - svfloat32_t v170 = svcmla_f32_x(pred_full, zero170, v323, v129, 90); - svfloat32_t zero177; - asm volatile("mov %0.s, #0" : "=w"(zero177)); - svfloat32_t v177 = svcmla_f32_x(pred_full, zero177, v324, v130, 90); - svfloat32_t zero184; - asm volatile("mov %0.s, #0" : "=w"(zero184)); - svfloat32_t v184 = svcmla_f32_x(pred_full, zero184, v325, v131, 90); - svfloat32_t v123; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v115), "w"(v403)); - svfloat32_t zero163; - asm volatile("mov %0.s, #0" : "=w"(zero163)); - svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v322, v128, 90); - svfloat32_t v185 = svmla_f32_x(pred_full, v123, v115, v318); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v163), "w"(v170)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v163), "w"(v170)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v163), "w"(v177)); - svst1_f64(pred_full, (double *)(v333), svreinterpret_f64_f32(v123)); - svfloat32_t v186 = svmla_f32_x(pred_full, v185, v124, v319); - svfloat32_t v188 = svmls_f32_x(pred_full, v185, v124, v319); - svfloat32_t v190 = svmls_f32_x(pred_full, v185, v125, v320); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v177)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v184)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v196), "w"(v184)); - svfloat32_t v187 = svmla_f32_x(pred_full, v186, v125, v320); - svfloat32_t v189 = svmls_f32_x(pred_full, v188, v126, v321); - svfloat32_t v191 = svmla_f32_x(pred_full, v190, v126, v321); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v187), "w"(v193)); - svfloat32_t 
v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v187), "w"(v193)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v189), "w"(v195)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v189), "w"(v195)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v191), "w"(v197)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v191), "w"(v197)); - svst1_f64(pred_full, (double *)(v342), svreinterpret_f64_f32(v199)); - svst1_f64(pred_full, (double *)(v351), svreinterpret_f64_f32(v201)); - svst1_f64(pred_full, (double *)(v360), svreinterpret_f64_f32(v202)); - svst1_f64(pred_full, (double *)(v369), svreinterpret_f64_f32(v203)); - svst1_f64(pred_full, (double *)(v378), svreinterpret_f64_f32(v200)); - svst1_f64(pred_full, (double *)(v387), svreinterpret_f64_f32(v198)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu8(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v376 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v289 = 1.0000000000000000e+00F; - float v290 = -1.0000000000000000e+00F; - float v298 = -7.0710678118654746e-01F; - float32x2_t v300 = (float32x2_t){v4, v4}; - float v306 = 7.0710678118654757e-01F; - const float32x2_t *v714 = &v5[istride]; - float32x2_t *v776 = &v6[ostride]; - float32x2_t v291 = (float32x2_t){v289, v290}; - float32x2_t v299 = (float32x2_t){v306, v298}; - float32x2_t v307 = (float32x2_t){v306, v306}; - const float32x2_t *v757 = &v5[0]; - float32x2_t *v767 = &v6[0]; - float32x4_t v840 = vld1q_f32((const float32_t *)v714); - float32x4_t v47 = - vreinterpretq_f32_u64(vld1q_dup_u64((const 
uint64_t *)&v7[6])); - float32x4_t v49 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[7])); - float32x4_t v97 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); - float32x4_t v99 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[3])); - float32x4_t v109 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[10])); - float32x4_t v111 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[11])); - float32x4_t v154 = vtrn1q_f32(v840, v840); - float32x4_t v155 = vtrn2q_f32(v840, v840); - float32x4_t v159 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v161 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v171 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); - float32x4_t v173 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[9])); - float32x4_t v221 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[4])); - float32x4_t v223 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[5])); - float32x4_t v233 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[12])); - float32x4_t v235 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[13])); - float32x2_t v293 = vmul_f32(v300, v291); - float32x2_t v301 = vmul_f32(v300, v299); - float32x4_t v308 = vcombine_f32(v307, v307); - const float32x2_t *v681 = &v5[istride * 4]; - const float32x2_t *v692 = &v5[istride * 2]; - const float32x2_t *v702 = &v5[istride * 6]; - const float32x2_t *v723 = &v5[istride * 5]; - const float32x2_t *v734 = &v5[istride * 3]; - const float32x2_t *v744 = &v5[istride * 7]; - float32x2_t *v785 = &v6[ostride * 2]; - float32x2_t *v794 = &v6[ostride * 3]; - float32x2_t *v803 = &v6[ostride * 4]; - float32x2_t *v812 = &v6[ostride * 5]; - float32x2_t *v821 = &v6[ostride * 6]; - float32x2_t *v830 = &v6[ostride * 7]; - float32x4_t v848 = vld1q_f32((const float32_t *)v757); - float32x4_t v160 = vmulq_f32(v154, v159); - 
float32x4_t v295 = vcombine_f32(v293, v293); - float32x4_t v303 = vcombine_f32(v301, v301); - float32x4_t v834 = vld1q_f32((const float32_t *)v681); - float32x4_t v836 = vld1q_f32((const float32_t *)v692); - float32x4_t v838 = vld1q_f32((const float32_t *)v702); - float32x4_t v842 = vld1q_f32((const float32_t *)v723); - float32x4_t v844 = vld1q_f32((const float32_t *)v734); - float32x4_t v846 = vld1q_f32((const float32_t *)v744); - float32x4_t v42 = vtrn1q_f32(v834, v834); - float32x4_t v43 = vtrn2q_f32(v834, v834); - float32x4_t v92 = vtrn1q_f32(v836, v836); - float32x4_t v93 = vtrn2q_f32(v836, v836); - float32x4_t v104 = vtrn1q_f32(v838, v838); - float32x4_t v105 = vtrn2q_f32(v838, v838); - float32x4_t v163 = vfmaq_f32(v160, v155, v161); - float32x4_t v166 = vtrn1q_f32(v842, v842); - float32x4_t v167 = vtrn2q_f32(v842, v842); - float32x4_t v216 = vtrn1q_f32(v844, v844); - float32x4_t v217 = vtrn2q_f32(v844, v844); - float32x4_t v228 = vtrn1q_f32(v846, v846); - float32x4_t v229 = vtrn2q_f32(v846, v846); - float32x4_t v48 = vmulq_f32(v42, v47); - float32x4_t v98 = vmulq_f32(v92, v97); - float32x4_t v110 = vmulq_f32(v104, v109); - float32x4_t v172 = vmulq_f32(v166, v171); - float32x4_t v222 = vmulq_f32(v216, v221); - float32x4_t v234 = vmulq_f32(v228, v233); - float32x4_t v51 = vfmaq_f32(v48, v43, v49); - float32x4_t v101 = vfmaq_f32(v98, v93, v99); - float32x4_t v113 = vfmaq_f32(v110, v105, v111); - float32x4_t v175 = vfmaq_f32(v172, v167, v173); - float32x4_t v225 = vfmaq_f32(v222, v217, v223); - float32x4_t v237 = vfmaq_f32(v234, v229, v235); - float32x4_t v245 = vaddq_f32(v848, v51); - float32x4_t v246 = vsubq_f32(v848, v51); - float32x4_t v247 = vaddq_f32(v101, v113); - float32x4_t v248 = vsubq_f32(v101, v113); - float32x4_t v249 = vaddq_f32(v163, v175); - float32x4_t v250 = vsubq_f32(v163, v175); - float32x4_t v251 = vaddq_f32(v225, v237); - float32x4_t v252 = vsubq_f32(v225, v237); - float32x4_t v253 = vaddq_f32(v245, v247); - float32x4_t v254 = 
vsubq_f32(v245, v247); - float32x4_t v255 = vaddq_f32(v249, v251); - float32x4_t v256 = vsubq_f32(v249, v251); - float32x4_t v259 = vaddq_f32(v250, v252); - float32x4_t v260 = vsubq_f32(v250, v252); - float32x4_t v294 = vrev64q_f32(v248); - float32x4_t v257 = vaddq_f32(v253, v255); - float32x4_t v258 = vsubq_f32(v253, v255); - float32x4_t v281 = vrev64q_f32(v256); - float32x4_t v296 = vmulq_f32(v294, v295); - float32x4_t v302 = vrev64q_f32(v259); - float32x4_t v309 = vmulq_f32(v260, v308); - float32x4_t v283 = vmulq_f32(v281, v295); - float32x4_t v304 = vmulq_f32(v302, v303); - float32x4_t v312 = vaddq_f32(v246, v309); - float32x4_t v313 = vsubq_f32(v246, v309); - vst1q_f32((float32_t *)v767, v257); - vst1q_f32((float32_t *)v803, v258); - float32x4_t v310 = vaddq_f32(v254, v283); - float32x4_t v311 = vsubq_f32(v254, v283); - float32x4_t v314 = vaddq_f32(v296, v304); - float32x4_t v315 = vsubq_f32(v296, v304); - float32x4_t v316 = vaddq_f32(v312, v314); - float32x4_t v317 = vsubq_f32(v312, v314); - float32x4_t v318 = vaddq_f32(v313, v315); - float32x4_t v319 = vsubq_f32(v313, v315); - vst1q_f32((float32_t *)v785, v311); - vst1q_f32((float32_t *)v821, v310); - vst1q_f32((float32_t *)v776, v317); - vst1q_f32((float32_t *)v794, v318); - vst1q_f32((float32_t *)v812, v319); - vst1q_f32((float32_t *)v830, v316); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v376 * 2; j < howmany; j += 1) { - float32x2_t v463 = v5[istride]; - float v603 = 1.0000000000000000e+00F; - float v604 = -1.0000000000000000e+00F; - float v611 = -7.0710678118654746e-01F; - float32x2_t v613 = (float32x2_t){v4, v4}; - float v618 = 7.0710678118654757e-01F; - float32x2_t v400 = v7[6]; - float32x2_t v405 = v7[7]; - float32x2_t v440 = v7[2]; - float32x2_t v445 = v7[3]; - float32x2_t v450 = v7[10]; - float32x2_t v455 = v7[11]; - float32x2_t v490 = v7[0]; - float32x2_t v491 = vtrn1_f32(v463, v463); - float32x2_t v492 = vtrn2_f32(v463, v463); - float32x2_t v495 = v7[1]; - float32x2_t v500 = v7[8]; - 
float32x2_t v505 = v7[9]; - float32x2_t v540 = v7[4]; - float32x2_t v545 = v7[5]; - float32x2_t v550 = v7[12]; - float32x2_t v555 = v7[13]; - float32x2_t v563 = v5[0]; - float32x2_t v605 = (float32x2_t){v603, v604}; - float32x2_t v612 = (float32x2_t){v618, v611}; - float32x2_t v619 = (float32x2_t){v618, v618}; - float32x2_t v388 = v5[istride * 4]; - float32x2_t v413 = v5[istride * 2]; - float32x2_t v428 = v5[istride * 6]; - float32x2_t v478 = v5[istride * 5]; - float32x2_t v496 = vmul_f32(v491, v490); - float32x2_t v513 = v5[istride * 3]; - float32x2_t v528 = v5[istride * 7]; - float32x2_t v607 = vmul_f32(v613, v605); - float32x2_t v614 = vmul_f32(v613, v612); - float32x2_t v401 = vtrn1_f32(v388, v388); - float32x2_t v402 = vtrn2_f32(v388, v388); - float32x2_t v441 = vtrn1_f32(v413, v413); - float32x2_t v442 = vtrn2_f32(v413, v413); - float32x2_t v451 = vtrn1_f32(v428, v428); - float32x2_t v452 = vtrn2_f32(v428, v428); - float32x2_t v498 = vfma_f32(v496, v492, v495); - float32x2_t v501 = vtrn1_f32(v478, v478); - float32x2_t v502 = vtrn2_f32(v478, v478); - float32x2_t v541 = vtrn1_f32(v513, v513); - float32x2_t v542 = vtrn2_f32(v513, v513); - float32x2_t v551 = vtrn1_f32(v528, v528); - float32x2_t v552 = vtrn2_f32(v528, v528); - float32x2_t v406 = vmul_f32(v401, v400); - float32x2_t v446 = vmul_f32(v441, v440); - float32x2_t v456 = vmul_f32(v451, v450); - float32x2_t v506 = vmul_f32(v501, v500); - float32x2_t v546 = vmul_f32(v541, v540); - float32x2_t v556 = vmul_f32(v551, v550); - float32x2_t v408 = vfma_f32(v406, v402, v405); - float32x2_t v448 = vfma_f32(v446, v442, v445); - float32x2_t v458 = vfma_f32(v456, v452, v455); - float32x2_t v508 = vfma_f32(v506, v502, v505); - float32x2_t v548 = vfma_f32(v546, v542, v545); - float32x2_t v558 = vfma_f32(v556, v552, v555); - float32x2_t v564 = vadd_f32(v563, v408); - float32x2_t v565 = vsub_f32(v563, v408); - float32x2_t v566 = vadd_f32(v448, v458); - float32x2_t v567 = vsub_f32(v448, v458); - float32x2_t v568 = 
vadd_f32(v498, v508); - float32x2_t v569 = vsub_f32(v498, v508); - float32x2_t v570 = vadd_f32(v548, v558); - float32x2_t v571 = vsub_f32(v548, v558); - float32x2_t v572 = vadd_f32(v564, v566); - float32x2_t v573 = vsub_f32(v564, v566); - float32x2_t v574 = vadd_f32(v568, v570); - float32x2_t v575 = vsub_f32(v568, v570); - float32x2_t v578 = vadd_f32(v569, v571); - float32x2_t v579 = vsub_f32(v569, v571); - float32x2_t v608 = vrev64_f32(v567); - float32x2_t v576 = vadd_f32(v572, v574); - float32x2_t v577 = vsub_f32(v572, v574); - float32x2_t v597 = vrev64_f32(v575); - float32x2_t v609 = vmul_f32(v608, v607); - float32x2_t v615 = vrev64_f32(v578); - float32x2_t v620 = vmul_f32(v579, v619); - float32x2_t v598 = vmul_f32(v597, v607); - float32x2_t v616 = vmul_f32(v615, v614); - float32x2_t v623 = vadd_f32(v565, v620); - float32x2_t v624 = vsub_f32(v565, v620); - v6[0] = v576; - v6[ostride * 4] = v577; - float32x2_t v621 = vadd_f32(v573, v598); - float32x2_t v622 = vsub_f32(v573, v598); - float32x2_t v625 = vadd_f32(v609, v616); - float32x2_t v626 = vsub_f32(v609, v616); - float32x2_t v627 = vadd_f32(v623, v625); - float32x2_t v628 = vsub_f32(v623, v625); - float32x2_t v629 = vadd_f32(v624, v626); - float32x2_t v630 = vsub_f32(v624, v626); - v6[ostride * 2] = v622; - v6[ostride * 6] = v621; - v6[ostride] = v628; - v6[ostride * 3] = v629; - v6[ostride * 5] = v630; - v6[ostride * 7] = v627; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu8(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for 
(int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v174 = -1.0000000000000000e+00F; - float v181 = -7.0710678118654746e-01F; - float v188 = 7.0710678118654757e-01F; - const float32x2_t *v291 = &v5[v0]; - float32x2_t *v355 = &v6[v2]; - int64_t v19 = v0 * 4; - float32x2_t v30 = v7[3]; - int64_t v34 = v0 * 2; - int64_t v45 = v0 * 6; - float32x2_t v56 = v7[1]; - float32x2_t v60 = v7[5]; - int64_t v75 = v0 * 5; - float32x2_t v86 = v7[0]; - float32x2_t v90 = v7[4]; - int64_t v94 = v0 * 3; - int64_t v105 = v0 * 7; - float32x2_t v116 = v7[2]; - float32x2_t v120 = v7[6]; - float v177 = v4 * v174; - float v184 = v4 * v181; - int64_t v217 = v2 * 2; - int64_t v224 = v2 * 3; - int64_t v231 = v2 * 4; - int64_t v238 = v2 * 5; - int64_t v245 = v2 * 6; - int64_t v252 = v2 * 7; - const float32x2_t *v328 = &v5[0]; - svfloat32_t v338 = svdup_n_f32(v188); - float32x2_t *v346 = &v6[0]; - svfloat32_t v419 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v291)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v121; - asm("mov %0.d, %d1" : "=w"(v121) : "w"(v120)); - const float32x2_t *v264 = &v5[v19]; - const float32x2_t *v273 = &v5[v34]; - const float32x2_t *v282 = &v5[v45]; - const float32x2_t *v300 = &v5[v75]; - const float32x2_t *v309 = &v5[v94]; - const float32x2_t *v318 = &v5[v105]; - svfloat32_t v336 = svdup_n_f32(v177); - svfloat32_t v337 = svdup_n_f32(v184); - float32x2_t *v364 = &v6[v217]; - float32x2_t *v373 = &v6[v224]; - float32x2_t *v382 = &v6[v231]; - float32x2_t *v391 = &v6[v238]; - float32x2_t *v400 = &v6[v245]; - float32x2_t *v409 = &v6[v252]; - 
svfloat32_t v427 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v328)[0])); - svfloat32_t zero88; - asm volatile("mov %0.s, #0" : "=w"(zero88)); - svfloat32_t v88 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v419, v87, 0), - v419, v87, 90); - svfloat32_t v413 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v264)[0])); - svfloat32_t v415 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v273)[0])); - svfloat32_t v417 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v282)[0])); - svfloat32_t v421 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v300)[0])); - svfloat32_t v423 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v309)[0])); - svfloat32_t v425 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v318)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); - svfloat32_t v32 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v413, v31, 0), - v413, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); - svfloat32_t v58 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v415, v57, 0), - v415, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); - svfloat32_t v62 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v417, v61, 0), - v417, v61, 90); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); - svfloat32_t v92 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v421, v91, 0), - v421, v91, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); - svfloat32_t v118 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero118, v423, v117, 0), - v423, v117, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); - svfloat32_t v122 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v425, v121, 0), - v425, v121, 90); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v130) : "w"(v427), "w"(v32)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v427), "w"(v32)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v58), "w"(v62)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v58), "w"(v62)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v88), "w"(v92)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v88), "w"(v92)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v118), "w"(v122)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v118), "w"(v122)); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v130), "w"(v132)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v130), "w"(v132)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v134), "w"(v136)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v134), "w"(v136)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v135), "w"(v137)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v135), "w"(v137)); - svfloat32_t zero179; - asm volatile("mov %0.s, #0" : "=w"(zero179)); - svfloat32_t v179 = svcmla_f32_x(pred_full, zero179, v336, v133, 90); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v138), "w"(v140)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v138), "w"(v140)); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); - svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v336, v141, 90); - svfloat32_t zero186; - asm volatile("mov %0.s, #0" : "=w"(zero186)); - svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v337, v144, 90); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v139), "w"(v167)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v139), "w"(v167)); - svfloat32_t v194 = svmla_f32_x(pred_full, v131, 
v145, v338); - svfloat32_t v195 = svmls_f32_x(pred_full, v131, v145, v338); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v179), "w"(v186)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v179), "w"(v186)); - svst1_f64(pred_full, (double *)(v346), svreinterpret_f64_f32(v142)); - svst1_f64(pred_full, (double *)(v382), svreinterpret_f64_f32(v143)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v194), "w"(v196)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v194), "w"(v196)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v195), "w"(v197)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v195), "w"(v197)); - svst1_f64(pred_full, (double *)(v364), svreinterpret_f64_f32(v193)); - svst1_f64(pred_full, (double *)(v400), svreinterpret_f64_f32(v192)); - svst1_f64(pred_full, (double *)(v355), svreinterpret_f64_f32(v199)); - svst1_f64(pred_full, (double *)(v373), svreinterpret_f64_f32(v200)); - svst1_f64(pred_full, (double *)(v391), svreinterpret_f64_f32(v201)); - svst1_f64(pred_full, (double *)(v409), svreinterpret_f64_f32(v198)); + const float32x2_t *v295 = &v5[v79]; + const float32x2_t *v304 = &v5[v90]; + svfloat32_t v322 = svdup_n_f32(v161); + svfloat32_t v323 = svdup_n_f32(v168); + svfloat32_t v324 = svdup_n_f32(v175); + svfloat32_t v325 = svdup_n_f32(v182); + float32x2_t *v351 = &v6[v219]; + float32x2_t *v360 = &v6[v226]; + float32x2_t *v369 = &v6[v233]; + float32x2_t *v378 = &v6[v240]; + float32x2_t *v387 = &v6[v247]; + svfloat32_t v403 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v314)[0])); + svfloat32_t zero43 = svdup_n_f32(0); + svfloat32_t v43 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v391, v42, 0), + v391, v42, 90); + svfloat32_t v393 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v268)[0])); + svfloat32_t v395 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double 
*)v277)[0])); + svfloat32_t v397 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v286)[0])); + svfloat32_t v399 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v295)[0])); + svfloat32_t v401 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v304)[0])); + svfloat32_t zero47 = svdup_n_f32(0); + svfloat32_t v47 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v393, v46, 0), + v393, v46, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v395, v72, 0), + v395, v72, 90); + svfloat32_t zero77 = svdup_n_f32(0); + svfloat32_t v77 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v397, v76, 0), + v397, v76, 90); + svfloat32_t zero103 = svdup_n_f32(0); + svfloat32_t v103 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero103, v399, v102, 0), + v399, v102, 90); + svfloat32_t zero107 = svdup_n_f32(0); + svfloat32_t v107 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero107, v401, v106, 0), + v401, v106, 90); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v110, v112); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v112, v108); + svfloat32_t v127 = svadd_f32_x(svptrue_b32(), v109, v111); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v109, v111); + svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v111, v113); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v113, v109); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v114, v112); + svfloat32_t v128 = 
svadd_f32_x(svptrue_b32(), v127, v113); + svfloat32_t zero170 = svdup_n_f32(0); + svfloat32_t v170 = svcmla_f32_x(pred_full, zero170, v323, v129, 90); + svfloat32_t zero177 = svdup_n_f32(0); + svfloat32_t v177 = svcmla_f32_x(pred_full, zero177, v324, v130, 90); + svfloat32_t zero184 = svdup_n_f32(0); + svfloat32_t v184 = svcmla_f32_x(pred_full, zero184, v325, v131, 90); + svfloat32_t v123 = svadd_f32_x(svptrue_b32(), v115, v403); + svfloat32_t zero163 = svdup_n_f32(0); + svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v322, v128, 90); + svfloat32_t v185 = svmla_f32_x(pred_full, v123, v115, v318); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v163, v170); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v163, v170); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v163, v177); + svst1_f64(pred_full, (double *)(v333), svreinterpret_f64_f32(v123)); + svfloat32_t v186 = svmla_f32_x(pred_full, v185, v124, v319); + svfloat32_t v188 = svmls_f32_x(pred_full, v185, v124, v319); + svfloat32_t v190 = svmls_f32_x(pred_full, v185, v125, v320); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v177); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v194, v184); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v196, v184); + svfloat32_t v187 = svmla_f32_x(pred_full, v186, v125, v320); + svfloat32_t v189 = svmls_f32_x(pred_full, v188, v126, v321); + svfloat32_t v191 = svmla_f32_x(pred_full, v190, v126, v321); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v187, v193); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v187, v193); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v189, v195); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v189, v195); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v191, v197); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v191, v197); + svst1_f64(pred_full, (double *)(v342), svreinterpret_f64_f32(v199)); + svst1_f64(pred_full, (double *)(v351), svreinterpret_f64_f32(v201)); + svst1_f64(pred_full, (double *)(v360), 
svreinterpret_f64_f32(v202)); + svst1_f64(pred_full, (double *)(v369), svreinterpret_f64_f32(v203)); + svst1_f64(pred_full, (double *)(v378), svreinterpret_f64_f32(v200)); + svst1_f64(pred_full, (double *)(v387), svreinterpret_f64_f32(v198)); v5 += v11; v6 += v12; } @@ -2549,20 +885,12 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu9(const armral_cmplx_f32_t *restrict x, const float32x2_t *v324 = &v5[v0]; float32x2_t *v427 = &v6[v2]; int64_t v30 = v0 * 8; - float32x2_t v41 = v7[0]; - float32x2_t v45 = v7[7]; int64_t v49 = v0 * 7; int64_t v60 = v0 * 2; - float32x2_t v71 = v7[6]; - float32x2_t v75 = v7[1]; int64_t v79 = v0 * 3; int64_t v90 = v0 * 6; - float32x2_t v101 = v7[2]; - float32x2_t v105 = v7[5]; int64_t v109 = v0 * 4; int64_t v120 = v0 * 5; - float32x2_t v131 = v7[3]; - float32x2_t v135 = v7[4]; float v191 = v4 * v188; float v213 = v4 * v210; float v220 = v4 * v217; @@ -2573,849 +901,163 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu9(const armral_cmplx_f32_t *restrict x, int64_t v291 = v2 * 5; int64_t v298 = v2 * 6; int64_t v305 = v2 * 7; - int64_t v312 = v2 * 8; - const float32x2_t *v397 = &v5[0]; - svfloat32_t v401 = svdup_n_f32(v171); - svfloat32_t v403 = svdup_n_f32(v183); - svfloat32_t v405 = svdup_n_f32(v195); - svfloat32_t v406 = svdup_n_f32(v200); - svfloat32_t v407 = svdup_n_f32(v205); - float32x2_t *v418 = &v6[0]; - svfloat32_t v494 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v324)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v102; - asm("mov %0.d, %d1" : "=w"(v102) : "w"(v101)); - svfloat32_t v106; - asm("mov %0.d, %d1" : "=w"(v106) : "w"(v105)); - svfloat32_t v132; - asm("mov %0.d, %d1" : "=w"(v132) : "w"(v131)); - svfloat32_t v136; - asm("mov %0.d, %d1" : "=w"(v136) : "w"(v135)); - const float32x2_t 
*v333 = &v5[v30]; - const float32x2_t *v342 = &v5[v49]; - const float32x2_t *v351 = &v5[v60]; - const float32x2_t *v360 = &v5[v79]; - const float32x2_t *v369 = &v5[v90]; - const float32x2_t *v378 = &v5[v109]; - const float32x2_t *v387 = &v5[v120]; - svfloat32_t v404 = svdup_n_f32(v191); - svfloat32_t v408 = svdup_n_f32(v213); - svfloat32_t v409 = svdup_n_f32(v220); - svfloat32_t v410 = svdup_n_f32(v227); - float32x2_t *v436 = &v6[v270]; - float32x2_t *v445 = &v6[v277]; - float32x2_t *v454 = &v6[v284]; - float32x2_t *v463 = &v6[v291]; - float32x2_t *v472 = &v6[v298]; - float32x2_t *v481 = &v6[v305]; - float32x2_t *v490 = &v6[v312]; - svfloat32_t v510 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v397)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); - svfloat32_t v43 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v494, v42, 0), - v494, v42, 90); - svfloat32_t v496 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v333)[0])); - svfloat32_t v498 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v342)[0])); - svfloat32_t v500 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v351)[0])); - svfloat32_t v502 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v360)[0])); - svfloat32_t v504 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v369)[0])); - svfloat32_t v506 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v378)[0])); - svfloat32_t v508 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v387)[0])); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); - svfloat32_t v47 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v496, v46, 0), - v496, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); - svfloat32_t v73 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v498, v72, 0), - v498, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" 
: "=w"(zero77)); - svfloat32_t v77 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v500, v76, 0), - v500, v76, 90); - svfloat32_t zero103; - asm volatile("mov %0.s, #0" : "=w"(zero103)); - svfloat32_t v103 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero103, v502, v102, 0), - v502, v102, 90); - svfloat32_t zero107; - asm volatile("mov %0.s, #0" : "=w"(zero107)); - svfloat32_t v107 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero107, v504, v106, 0), - v504, v106, 90); - svfloat32_t zero133; - asm volatile("mov %0.s, #0" : "=w"(zero133)); - svfloat32_t v133 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero133, v506, v132, 0), - v506, v132, 90); - svfloat32_t zero137; - asm volatile("mov %0.s, #0" : "=w"(zero137)); - svfloat32_t v137 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero137, v508, v136, 0), - v508, v136, 90); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v43), "w"(v47)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v43), "w"(v47)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v73), "w"(v77)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v73), "w"(v77)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v103), "w"(v107)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v103), "w"(v107)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v133), "w"(v137)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v133), "w"(v137)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v138), "w"(v140)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v139), "w"(v141)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v138), "w"(v140)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v140), "w"(v144)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v144), 
"w"(v138)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v139), "w"(v141)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v141), "w"(v145)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v145), "w"(v139)); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); - svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v404, v143, 90); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v146), "w"(v144)); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v157), "w"(v145)); - svfloat32_t zero215; - asm volatile("mov %0.s, #0" : "=w"(zero215)); - svfloat32_t v215 = svcmla_f32_x(pred_full, zero215, v408, v162, 90); - svfloat32_t zero222; - asm volatile("mov %0.s, #0" : "=w"(zero222)); - svfloat32_t v222 = svcmla_f32_x(pred_full, zero222, v409, v163, 90); - svfloat32_t zero229; - asm volatile("mov %0.s, #0" : "=w"(zero229)); - svfloat32_t v229 = svcmla_f32_x(pred_full, zero229, v410, v164, 90); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v147), "w"(v142)); - svfloat32_t v174; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v147), "w"(v401)); - svfloat32_t zero181; - asm volatile("mov %0.s, #0" : "=w"(zero181)); - svfloat32_t v181 = svcmla_f32_x(pred_full, zero181, v404, v158, 90); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v193), "w"(v215)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v193), "w"(v222)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v193), "w"(v215)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v148), "w"(v510)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v174), "w"(v174)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v243), "w"(v222)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v245), "w"(v229)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v248) : "w"(v247), "w"(v229)); - svfloat32_t v231 = svmla_f32_x(pred_full, v230, v147, v401); - svfloat32_t v235 = svmla_f32_x(pred_full, v156, v142, v403); - svst1_f64(pred_full, (double *)(v418), svreinterpret_f64_f32(v156)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v156), "w"(v231)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v235), "w"(v230)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v232), "w"(v181)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v232), "w"(v181)); - svfloat32_t v237 = svmla_f32_x(pred_full, v236, v159, v405); - svfloat32_t v239 = svmls_f32_x(pred_full, v236, v160, v406); - svfloat32_t v241 = svmls_f32_x(pred_full, v236, v159, v405); - svfloat32_t v238 = svmla_f32_x(pred_full, v237, v160, v406); - svfloat32_t v240 = svmla_f32_x(pred_full, v239, v161, v407); - svfloat32_t v242 = svmls_f32_x(pred_full, v241, v161, v407); - svst1_f64(pred_full, (double *)(v445), svreinterpret_f64_f32(v234)); - svst1_f64(pred_full, (double *)(v472), svreinterpret_f64_f32(v233)); - svfloat32_t v249; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v238), "w"(v244)); - svfloat32_t v250; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v238), "w"(v244)); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v240), "w"(v246)); - svfloat32_t v252; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v240), "w"(v246)); - svfloat32_t v253; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v242), "w"(v248)); - svfloat32_t v254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v242), "w"(v248)); - svst1_f64(pred_full, (double *)(v427), svreinterpret_f64_f32(v250)); - svst1_f64(pred_full, (double *)(v436), svreinterpret_f64_f32(v251)); - svst1_f64(pred_full, (double *)(v454), svreinterpret_f64_f32(v254)); - svst1_f64(pred_full, (double *)(v463), svreinterpret_f64_f32(v253)); - svst1_f64(pred_full, (double *)(v481), svreinterpret_f64_f32(v252)); - 
svst1_f64(pred_full, (double *)(v490), svreinterpret_f64_f32(v249)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu10(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v499 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v387 = -1.2500000000000000e+00F; - float v392 = 5.5901699437494745e-01F; - float v396 = 1.5388417685876268e+00F; - float v397 = -1.5388417685876268e+00F; - float v404 = 5.8778525229247325e-01F; - float v405 = -5.8778525229247325e-01F; - float v412 = 3.6327126400268028e-01F; - float v413 = -3.6327126400268028e-01F; - float32x2_t v415 = (float32x2_t){v4, v4}; - const float32x2_t *v972 = &v5[istride]; - float32x2_t *v1042 = &v6[ostride]; - float32x2_t v388 = (float32x2_t){v387, v387}; - float32x2_t v393 = (float32x2_t){v392, v392}; - float32x2_t v398 = (float32x2_t){v396, v397}; - float32x2_t v406 = (float32x2_t){v404, v405}; - float32x2_t v414 = (float32x2_t){v412, v413}; - const float32x2_t *v1005 = &v5[0]; - float32x2_t *v1015 = &v6[0]; - float32x4_t v1112 = vld1q_f32((const float32_t *)v972); - float32x4_t v47 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); - float32x4_t v49 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[9])); - float32x4_t v97 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); - float32x4_t v99 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[3])); - float32x4_t v109 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[12])); - float32x4_t v111 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[13])); - float32x4_t v159 = - vreinterpretq_f32_u64(vld1q_dup_u64((const 
uint64_t *)&v7[6])); - float32x4_t v161 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[7])); - float32x4_t v171 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[16])); - float32x4_t v173 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[17])); - float32x4_t v221 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[10])); - float32x4_t v223 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[11])); - float32x4_t v228 = vtrn1q_f32(v1112, v1112); - float32x4_t v229 = vtrn2q_f32(v1112, v1112); - float32x4_t v233 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v235 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v283 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[14])); - float32x4_t v285 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[15])); - float32x4_t v295 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[4])); - float32x4_t v297 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[5])); - float32x4_t v389 = vcombine_f32(v388, v388); - float32x4_t v394 = vcombine_f32(v393, v393); - float32x2_t v400 = vmul_f32(v415, v398); - float32x2_t v408 = vmul_f32(v415, v406); - float32x2_t v416 = vmul_f32(v415, v414); - const float32x2_t *v907 = &v5[istride * 5]; - const float32x2_t *v918 = &v5[istride * 2]; - const float32x2_t *v928 = &v5[istride * 7]; - const float32x2_t *v940 = &v5[istride * 4]; - const float32x2_t *v950 = &v5[istride * 9]; - const float32x2_t *v962 = &v5[istride * 6]; - const float32x2_t *v982 = &v5[istride * 8]; - const float32x2_t *v992 = &v5[istride * 3]; - float32x2_t *v1024 = &v6[ostride * 5]; - float32x2_t *v1033 = &v6[ostride * 6]; - float32x2_t *v1051 = &v6[ostride * 2]; - float32x2_t *v1060 = &v6[ostride * 7]; - float32x2_t *v1069 = &v6[ostride * 8]; - float32x2_t *v1078 = &v6[ostride * 3]; - float32x2_t *v1087 = &v6[ostride * 4]; - float32x2_t *v1096 = &v6[ostride * 9]; - 
float32x4_t v1118 = vld1q_f32((const float32_t *)v1005); - float32x4_t v234 = vmulq_f32(v228, v233); - float32x4_t v402 = vcombine_f32(v400, v400); - float32x4_t v410 = vcombine_f32(v408, v408); - float32x4_t v418 = vcombine_f32(v416, v416); - float32x4_t v1100 = vld1q_f32((const float32_t *)v907); - float32x4_t v1102 = vld1q_f32((const float32_t *)v918); - float32x4_t v1104 = vld1q_f32((const float32_t *)v928); - float32x4_t v1106 = vld1q_f32((const float32_t *)v940); - float32x4_t v1108 = vld1q_f32((const float32_t *)v950); - float32x4_t v1110 = vld1q_f32((const float32_t *)v962); - float32x4_t v1114 = vld1q_f32((const float32_t *)v982); - float32x4_t v1116 = vld1q_f32((const float32_t *)v992); - float32x4_t v42 = vtrn1q_f32(v1100, v1100); - float32x4_t v43 = vtrn2q_f32(v1100, v1100); - float32x4_t v92 = vtrn1q_f32(v1102, v1102); - float32x4_t v93 = vtrn2q_f32(v1102, v1102); - float32x4_t v104 = vtrn1q_f32(v1104, v1104); - float32x4_t v105 = vtrn2q_f32(v1104, v1104); - float32x4_t v154 = vtrn1q_f32(v1106, v1106); - float32x4_t v155 = vtrn2q_f32(v1106, v1106); - float32x4_t v166 = vtrn1q_f32(v1108, v1108); - float32x4_t v167 = vtrn2q_f32(v1108, v1108); - float32x4_t v216 = vtrn1q_f32(v1110, v1110); - float32x4_t v217 = vtrn2q_f32(v1110, v1110); - float32x4_t v237 = vfmaq_f32(v234, v229, v235); - float32x4_t v278 = vtrn1q_f32(v1114, v1114); - float32x4_t v279 = vtrn2q_f32(v1114, v1114); - float32x4_t v290 = vtrn1q_f32(v1116, v1116); - float32x4_t v291 = vtrn2q_f32(v1116, v1116); - float32x4_t v48 = vmulq_f32(v42, v47); - float32x4_t v98 = vmulq_f32(v92, v97); - float32x4_t v110 = vmulq_f32(v104, v109); - float32x4_t v160 = vmulq_f32(v154, v159); - float32x4_t v172 = vmulq_f32(v166, v171); - float32x4_t v222 = vmulq_f32(v216, v221); - float32x4_t v284 = vmulq_f32(v278, v283); - float32x4_t v296 = vmulq_f32(v290, v295); - float32x4_t v51 = vfmaq_f32(v48, v43, v49); - float32x4_t v101 = vfmaq_f32(v98, v93, v99); - float32x4_t v113 = vfmaq_f32(v110, v105, v111); - 
float32x4_t v163 = vfmaq_f32(v160, v155, v161); - float32x4_t v175 = vfmaq_f32(v172, v167, v173); - float32x4_t v225 = vfmaq_f32(v222, v217, v223); - float32x4_t v287 = vfmaq_f32(v284, v279, v285); - float32x4_t v299 = vfmaq_f32(v296, v291, v297); - float32x4_t v307 = vaddq_f32(v1118, v51); - float32x4_t v308 = vsubq_f32(v1118, v51); - float32x4_t v309 = vaddq_f32(v101, v113); - float32x4_t v310 = vsubq_f32(v101, v113); - float32x4_t v311 = vaddq_f32(v163, v175); - float32x4_t v312 = vsubq_f32(v163, v175); - float32x4_t v313 = vaddq_f32(v225, v237); - float32x4_t v314 = vsubq_f32(v225, v237); - float32x4_t v315 = vaddq_f32(v287, v299); - float32x4_t v316 = vsubq_f32(v287, v299); - float32x4_t v317 = vaddq_f32(v309, v315); - float32x4_t v318 = vsubq_f32(v309, v315); - float32x4_t v319 = vaddq_f32(v313, v311); - float32x4_t v320 = vsubq_f32(v313, v311); - float32x4_t v373 = vaddq_f32(v310, v316); - float32x4_t v374 = vsubq_f32(v310, v316); - float32x4_t v375 = vaddq_f32(v314, v312); - float32x4_t v376 = vsubq_f32(v314, v312); - float32x4_t v321 = vaddq_f32(v317, v319); - float32x4_t v322 = vsubq_f32(v317, v319); - float32x4_t v323 = vaddq_f32(v318, v320); - float32x4_t v345 = vrev64q_f32(v318); - float32x4_t v361 = vrev64q_f32(v320); - float32x4_t v377 = vaddq_f32(v373, v375); - float32x4_t v378 = vsubq_f32(v373, v375); - float32x4_t v379 = vaddq_f32(v374, v376); - float32x4_t v401 = vrev64q_f32(v374); - float32x4_t v417 = vrev64q_f32(v376); - float32x4_t v324 = vaddq_f32(v321, v307); - float32x4_t v334 = vmulq_f32(v321, v389); - float32x4_t v339 = vmulq_f32(v322, v394); - float32x4_t v347 = vmulq_f32(v345, v402); - float32x4_t v353 = vrev64q_f32(v323); - float32x4_t v363 = vmulq_f32(v361, v418); - float32x4_t v380 = vaddq_f32(v377, v308); - float32x4_t v390 = vmulq_f32(v377, v389); - float32x4_t v395 = vmulq_f32(v378, v394); - float32x4_t v403 = vmulq_f32(v401, v402); - float32x4_t v409 = vrev64q_f32(v379); - float32x4_t v419 = vmulq_f32(v417, v418); - float32x4_t 
v355 = vmulq_f32(v353, v410); - float32x4_t v364 = vaddq_f32(v324, v334); - float32x4_t v411 = vmulq_f32(v409, v410); - float32x4_t v420 = vaddq_f32(v380, v390); - vst1q_f32((float32_t *)v1015, v324); - vst1q_f32((float32_t *)v1024, v380); - float32x4_t v365 = vaddq_f32(v364, v339); - float32x4_t v366 = vsubq_f32(v364, v339); - float32x4_t v367 = vsubq_f32(v347, v355); - float32x4_t v368 = vaddq_f32(v355, v363); - float32x4_t v421 = vaddq_f32(v420, v395); - float32x4_t v422 = vsubq_f32(v420, v395); - float32x4_t v423 = vsubq_f32(v403, v411); - float32x4_t v424 = vaddq_f32(v411, v419); - float32x4_t v369 = vaddq_f32(v365, v367); - float32x4_t v370 = vsubq_f32(v365, v367); - float32x4_t v371 = vaddq_f32(v366, v368); - float32x4_t v372 = vsubq_f32(v366, v368); - float32x4_t v425 = vaddq_f32(v421, v423); - float32x4_t v426 = vsubq_f32(v421, v423); - float32x4_t v427 = vaddq_f32(v422, v424); - float32x4_t v428 = vsubq_f32(v422, v424); - vst1q_f32((float32_t *)v1033, v370); - vst1q_f32((float32_t *)v1042, v426); - vst1q_f32((float32_t *)v1051, v372); - vst1q_f32((float32_t *)v1060, v428); - vst1q_f32((float32_t *)v1069, v371); - vst1q_f32((float32_t *)v1078, v427); - vst1q_f32((float32_t *)v1087, v369); - vst1q_f32((float32_t *)v1096, v425); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v499 * 2; j < howmany; j += 1) { - float32x2_t v651 = v5[istride]; - float v810 = -1.2500000000000000e+00F; - float v814 = 5.5901699437494745e-01F; - float v817 = 1.5388417685876268e+00F; - float v818 = -1.5388417685876268e+00F; - float v824 = 5.8778525229247325e-01F; - float v825 = -5.8778525229247325e-01F; - float v831 = 3.6327126400268028e-01F; - float v832 = -3.6327126400268028e-01F; - float32x2_t v834 = (float32x2_t){v4, v4}; - float32x2_t v523 = v7[8]; - float32x2_t v528 = v7[9]; - float32x2_t v563 = v7[2]; - float32x2_t v568 = v7[3]; - float32x2_t v573 = v7[12]; - float32x2_t v578 = v7[13]; - float32x2_t v613 = v7[6]; - float32x2_t v618 = v7[7]; - float32x2_t v623 = v7[16]; - 
float32x2_t v628 = v7[17]; - float32x2_t v663 = v7[10]; - float32x2_t v668 = v7[11]; - float32x2_t v673 = v7[0]; - float32x2_t v674 = vtrn1_f32(v651, v651); - float32x2_t v675 = vtrn2_f32(v651, v651); - float32x2_t v678 = v7[1]; - float32x2_t v713 = v7[14]; - float32x2_t v718 = v7[15]; - float32x2_t v723 = v7[4]; - float32x2_t v728 = v7[5]; - float32x2_t v736 = v5[0]; - float32x2_t v811 = (float32x2_t){v810, v810}; - float32x2_t v815 = (float32x2_t){v814, v814}; - float32x2_t v819 = (float32x2_t){v817, v818}; - float32x2_t v826 = (float32x2_t){v824, v825}; - float32x2_t v833 = (float32x2_t){v831, v832}; - float32x2_t v511 = v5[istride * 5]; - float32x2_t v536 = v5[istride * 2]; - float32x2_t v551 = v5[istride * 7]; - float32x2_t v586 = v5[istride * 4]; - float32x2_t v601 = v5[istride * 9]; - float32x2_t v636 = v5[istride * 6]; - float32x2_t v679 = vmul_f32(v674, v673); - float32x2_t v686 = v5[istride * 8]; - float32x2_t v701 = v5[istride * 3]; - float32x2_t v821 = vmul_f32(v834, v819); - float32x2_t v828 = vmul_f32(v834, v826); - float32x2_t v835 = vmul_f32(v834, v833); - float32x2_t v524 = vtrn1_f32(v511, v511); - float32x2_t v525 = vtrn2_f32(v511, v511); - float32x2_t v564 = vtrn1_f32(v536, v536); - float32x2_t v565 = vtrn2_f32(v536, v536); - float32x2_t v574 = vtrn1_f32(v551, v551); - float32x2_t v575 = vtrn2_f32(v551, v551); - float32x2_t v614 = vtrn1_f32(v586, v586); - float32x2_t v615 = vtrn2_f32(v586, v586); - float32x2_t v624 = vtrn1_f32(v601, v601); - float32x2_t v625 = vtrn2_f32(v601, v601); - float32x2_t v664 = vtrn1_f32(v636, v636); - float32x2_t v665 = vtrn2_f32(v636, v636); - float32x2_t v681 = vfma_f32(v679, v675, v678); - float32x2_t v714 = vtrn1_f32(v686, v686); - float32x2_t v715 = vtrn2_f32(v686, v686); - float32x2_t v724 = vtrn1_f32(v701, v701); - float32x2_t v725 = vtrn2_f32(v701, v701); - float32x2_t v529 = vmul_f32(v524, v523); - float32x2_t v569 = vmul_f32(v564, v563); - float32x2_t v579 = vmul_f32(v574, v573); - float32x2_t v619 = 
vmul_f32(v614, v613); - float32x2_t v629 = vmul_f32(v624, v623); - float32x2_t v669 = vmul_f32(v664, v663); - float32x2_t v719 = vmul_f32(v714, v713); - float32x2_t v729 = vmul_f32(v724, v723); - float32x2_t v531 = vfma_f32(v529, v525, v528); - float32x2_t v571 = vfma_f32(v569, v565, v568); - float32x2_t v581 = vfma_f32(v579, v575, v578); - float32x2_t v621 = vfma_f32(v619, v615, v618); - float32x2_t v631 = vfma_f32(v629, v625, v628); - float32x2_t v671 = vfma_f32(v669, v665, v668); - float32x2_t v721 = vfma_f32(v719, v715, v718); - float32x2_t v731 = vfma_f32(v729, v725, v728); - float32x2_t v737 = vadd_f32(v736, v531); - float32x2_t v738 = vsub_f32(v736, v531); - float32x2_t v739 = vadd_f32(v571, v581); - float32x2_t v740 = vsub_f32(v571, v581); - float32x2_t v741 = vadd_f32(v621, v631); - float32x2_t v742 = vsub_f32(v621, v631); - float32x2_t v743 = vadd_f32(v671, v681); - float32x2_t v744 = vsub_f32(v671, v681); - float32x2_t v745 = vadd_f32(v721, v731); - float32x2_t v746 = vsub_f32(v721, v731); - float32x2_t v747 = vadd_f32(v739, v745); - float32x2_t v748 = vsub_f32(v739, v745); - float32x2_t v749 = vadd_f32(v743, v741); - float32x2_t v750 = vsub_f32(v743, v741); - float32x2_t v797 = vadd_f32(v740, v746); - float32x2_t v798 = vsub_f32(v740, v746); - float32x2_t v799 = vadd_f32(v744, v742); - float32x2_t v800 = vsub_f32(v744, v742); - float32x2_t v751 = vadd_f32(v747, v749); - float32x2_t v752 = vsub_f32(v747, v749); - float32x2_t v753 = vadd_f32(v748, v750); - float32x2_t v772 = vrev64_f32(v748); - float32x2_t v786 = vrev64_f32(v750); - float32x2_t v801 = vadd_f32(v797, v799); - float32x2_t v802 = vsub_f32(v797, v799); - float32x2_t v803 = vadd_f32(v798, v800); - float32x2_t v822 = vrev64_f32(v798); - float32x2_t v836 = vrev64_f32(v800); - float32x2_t v754 = vadd_f32(v751, v737); - float32x2_t v762 = vmul_f32(v751, v811); - float32x2_t v766 = vmul_f32(v752, v815); - float32x2_t v773 = vmul_f32(v772, v821); - float32x2_t v779 = vrev64_f32(v753); - float32x2_t 
v787 = vmul_f32(v786, v835); - float32x2_t v804 = vadd_f32(v801, v738); - float32x2_t v812 = vmul_f32(v801, v811); - float32x2_t v816 = vmul_f32(v802, v815); - float32x2_t v823 = vmul_f32(v822, v821); - float32x2_t v829 = vrev64_f32(v803); - float32x2_t v837 = vmul_f32(v836, v835); - float32x2_t v780 = vmul_f32(v779, v828); - float32x2_t v788 = vadd_f32(v754, v762); - float32x2_t v830 = vmul_f32(v829, v828); - float32x2_t v838 = vadd_f32(v804, v812); - v6[0] = v754; - v6[ostride * 5] = v804; - float32x2_t v789 = vadd_f32(v788, v766); - float32x2_t v790 = vsub_f32(v788, v766); - float32x2_t v791 = vsub_f32(v773, v780); - float32x2_t v792 = vadd_f32(v780, v787); - float32x2_t v839 = vadd_f32(v838, v816); - float32x2_t v840 = vsub_f32(v838, v816); - float32x2_t v841 = vsub_f32(v823, v830); - float32x2_t v842 = vadd_f32(v830, v837); - float32x2_t v793 = vadd_f32(v789, v791); - float32x2_t v794 = vsub_f32(v789, v791); - float32x2_t v795 = vadd_f32(v790, v792); - float32x2_t v796 = vsub_f32(v790, v792); - float32x2_t v843 = vadd_f32(v839, v841); - float32x2_t v844 = vsub_f32(v839, v841); - float32x2_t v845 = vadd_f32(v840, v842); - float32x2_t v846 = vsub_f32(v840, v842); - v6[ostride * 6] = v794; - v6[ostride] = v844; - v6[ostride * 2] = v796; - v6[ostride * 7] = v846; - v6[ostride * 8] = v795; - v6[ostride * 3] = v845; - v6[ostride * 4] = v793; - v6[ostride * 9] = v843; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu10(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; 
j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v237 = -1.2500000000000000e+00F; - float v242 = 5.5901699437494745e-01F; - float v247 = -1.5388417685876268e+00F; - float v254 = -5.8778525229247325e-01F; - float v261 = -3.6327126400268028e-01F; - const float32x2_t *v406 = &v5[v0]; - float32x2_t *v483 = &v6[v2]; - int64_t v19 = v0 * 5; - float32x2_t v30 = v7[4]; - int64_t v34 = v0 * 2; - int64_t v45 = v0 * 7; - float32x2_t v56 = v7[1]; - float32x2_t v60 = v7[6]; - int64_t v64 = v0 * 4; - int64_t v75 = v0 * 9; - float32x2_t v86 = v7[3]; - float32x2_t v90 = v7[8]; - int64_t v94 = v0 * 6; - float32x2_t v116 = v7[5]; - float32x2_t v120 = v7[0]; - int64_t v124 = v0 * 8; - int64_t v135 = v0 * 3; - float32x2_t v146 = v7[7]; - float32x2_t v150 = v7[2]; - float v250 = v4 * v247; - float v257 = v4 * v254; - float v264 = v4 * v261; - int64_t v284 = v2 * 5; - int64_t v291 = v2 * 6; - int64_t v305 = v2 * 2; - int64_t v312 = v2 * 7; - int64_t v319 = v2 * 8; - int64_t v326 = v2 * 3; - int64_t v333 = v2 * 4; - int64_t v340 = v2 * 9; - const float32x2_t *v434 = &v5[0]; - svfloat32_t v444 = svdup_n_f32(v237); - svfloat32_t v445 = svdup_n_f32(v242); - float32x2_t *v456 = &v6[0]; - svfloat32_t v553 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v406)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v121; - asm("mov %0.d, %d1" : "=w"(v121) : "w"(v120)); - svfloat32_t v147; - asm("mov %0.d, %d1" : "=w"(v147) : "w"(v146)); - svfloat32_t v151; - asm("mov %0.d, %d1" : "=w"(v151) : "w"(v150)); - const float32x2_t *v352 = &v5[v19]; - const float32x2_t *v361 = &v5[v34]; - const 
float32x2_t *v370 = &v5[v45]; - const float32x2_t *v379 = &v5[v64]; - const float32x2_t *v388 = &v5[v75]; - const float32x2_t *v397 = &v5[v94]; - const float32x2_t *v415 = &v5[v124]; - const float32x2_t *v424 = &v5[v135]; - svfloat32_t v446 = svdup_n_f32(v250); - svfloat32_t v447 = svdup_n_f32(v257); - svfloat32_t v448 = svdup_n_f32(v264); - float32x2_t *v465 = &v6[v284]; - float32x2_t *v474 = &v6[v291]; - float32x2_t *v492 = &v6[v305]; - float32x2_t *v501 = &v6[v312]; - float32x2_t *v510 = &v6[v319]; - float32x2_t *v519 = &v6[v326]; - float32x2_t *v528 = &v6[v333]; - float32x2_t *v537 = &v6[v340]; - svfloat32_t v559 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v434)[0])); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); - svfloat32_t v122 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v553, v121, 0), - v553, v121, 90); - svfloat32_t v541 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v352)[0])); - svfloat32_t v543 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v361)[0])); - svfloat32_t v545 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v370)[0])); - svfloat32_t v547 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v379)[0])); - svfloat32_t v549 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v388)[0])); - svfloat32_t v551 = + int64_t v312 = v2 * 8; + const float32x2_t *v397 = &v5[0]; + svfloat32_t v401 = svdup_n_f32(v171); + svfloat32_t v403 = svdup_n_f32(v183); + svfloat32_t v405 = svdup_n_f32(v195); + svfloat32_t v406 = svdup_n_f32(v200); + svfloat32_t v407 = svdup_n_f32(v205); + float32x2_t *v418 = &v6[0]; + svfloat32_t v494 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v324)[0])); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v46 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v72 = + 
svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v76 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v102 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v106 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v132 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v136 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + const float32x2_t *v333 = &v5[v30]; + const float32x2_t *v342 = &v5[v49]; + const float32x2_t *v351 = &v5[v60]; + const float32x2_t *v360 = &v5[v79]; + const float32x2_t *v369 = &v5[v90]; + const float32x2_t *v378 = &v5[v109]; + const float32x2_t *v387 = &v5[v120]; + svfloat32_t v404 = svdup_n_f32(v191); + svfloat32_t v408 = svdup_n_f32(v213); + svfloat32_t v409 = svdup_n_f32(v220); + svfloat32_t v410 = svdup_n_f32(v227); + float32x2_t *v436 = &v6[v270]; + float32x2_t *v445 = &v6[v277]; + float32x2_t *v454 = &v6[v284]; + float32x2_t *v463 = &v6[v291]; + float32x2_t *v472 = &v6[v298]; + float32x2_t *v481 = &v6[v305]; + float32x2_t *v490 = &v6[v312]; + svfloat32_t v510 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v397)[0])); - svfloat32_t v555 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v415)[0])); - svfloat32_t v557 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v424)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); - svfloat32_t v32 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v541, v31, 0), - v541, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); - svfloat32_t v58 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v543, v57, 0), - v543, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); - svfloat32_t v62 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v545, v61, 0), - v545, v61, 90); - svfloat32_t zero88; 
- asm volatile("mov %0.s, #0" : "=w"(zero88)); - svfloat32_t v88 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v547, v87, 0), - v547, v87, 90); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); - svfloat32_t v92 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v549, v91, 0), - v549, v91, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); - svfloat32_t v118 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero118, v551, v117, 0), - v551, v117, 90); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); - svfloat32_t v148 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero148, v555, v147, 0), - v555, v147, 90); - svfloat32_t zero152; - asm volatile("mov %0.s, #0" : "=w"(zero152)); - svfloat32_t v152 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero152, v557, v151, 0), - v557, v151, 90); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v559), "w"(v32)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v559), "w"(v32)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v58), "w"(v62)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v58), "w"(v62)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v88), "w"(v92)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v88), "w"(v92)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v118), "w"(v122)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v118), "w"(v122)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v148), "w"(v152)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v148), "w"(v152)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v162), "w"(v168)); - svfloat32_t v171; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v162), "w"(v168)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v172) : "w"(v166), "w"(v164)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v166), "w"(v164)); - svfloat32_t v223; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v163), "w"(v169)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v163), "w"(v169)); - svfloat32_t v225; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v167), "w"(v165)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v167), "w"(v165)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v170), "w"(v172)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v170), "w"(v172)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v171), "w"(v173)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); - svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v446, v171, 90); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v223), "w"(v225)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v223), "w"(v225)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v224), "w"(v226)); - svfloat32_t zero252; - asm volatile("mov %0.s, #0" : "=w"(zero252)); - svfloat32_t v252 = svcmla_f32_x(pred_full, zero252, v446, v224, 90); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v174), "w"(v160)); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); - svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v447, v176, 90); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v227), "w"(v161)); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); - svfloat32_t v259 = svcmla_f32_x(pred_full, zero259, v447, v229, 90); - svfloat32_t v214 = svmla_f32_x(pred_full, v177, v174, v444); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v199), "w"(v206)); - svfloat32_t v218 = svcmla_f32_x(pred_full, v206, v448, v173, 90); - svfloat32_t v267 = 
svmla_f32_x(pred_full, v230, v227, v444); - svfloat32_t v270; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v252), "w"(v259)); - svfloat32_t v271 = svcmla_f32_x(pred_full, v259, v448, v226, 90); - svst1_f64(pred_full, (double *)(v456), svreinterpret_f64_f32(v177)); - svst1_f64(pred_full, (double *)(v465), svreinterpret_f64_f32(v230)); - svfloat32_t v215 = svmla_f32_x(pred_full, v214, v175, v445); - svfloat32_t v216 = svmls_f32_x(pred_full, v214, v175, v445); - svfloat32_t v268 = svmla_f32_x(pred_full, v267, v228, v445); - svfloat32_t v269 = svmls_f32_x(pred_full, v267, v228, v445); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v215), "w"(v217)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v215), "w"(v217)); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v216), "w"(v218)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v216), "w"(v218)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v268), "w"(v270)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v268), "w"(v270)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v269), "w"(v271)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v269), "w"(v271)); - svst1_f64(pred_full, (double *)(v474), svreinterpret_f64_f32(v220)); - svst1_f64(pred_full, (double *)(v483), svreinterpret_f64_f32(v273)); - svst1_f64(pred_full, (double *)(v492), svreinterpret_f64_f32(v222)); - svst1_f64(pred_full, (double *)(v501), svreinterpret_f64_f32(v275)); - svst1_f64(pred_full, (double *)(v510), svreinterpret_f64_f32(v221)); - svst1_f64(pred_full, (double *)(v519), svreinterpret_f64_f32(v274)); - svst1_f64(pred_full, (double *)(v528), svreinterpret_f64_f32(v219)); - svst1_f64(pred_full, (double *)(v537), svreinterpret_f64_f32(v272)); + svfloat32_t zero43 = svdup_n_f32(0); + svfloat32_t v43 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v494, v42, 0), + 
v494, v42, 90); + svfloat32_t v496 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v333)[0])); + svfloat32_t v498 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v342)[0])); + svfloat32_t v500 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v351)[0])); + svfloat32_t v502 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v360)[0])); + svfloat32_t v504 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v369)[0])); + svfloat32_t v506 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v378)[0])); + svfloat32_t v508 = + svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v387)[0])); + svfloat32_t zero47 = svdup_n_f32(0); + svfloat32_t v47 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v496, v46, 0), + v496, v46, 90); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v498, v72, 0), + v498, v72, 90); + svfloat32_t zero77 = svdup_n_f32(0); + svfloat32_t v77 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v500, v76, 0), + v500, v76, 90); + svfloat32_t zero103 = svdup_n_f32(0); + svfloat32_t v103 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero103, v502, v102, 0), + v502, v102, 90); + svfloat32_t zero107 = svdup_n_f32(0); + svfloat32_t v107 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero107, v504, v106, 0), + v504, v106, 90); + svfloat32_t zero133 = svdup_n_f32(0); + svfloat32_t v133 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero133, v506, v132, 0), + v506, v132, 90); + svfloat32_t zero137 = svdup_n_f32(0); + svfloat32_t v137 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero137, v508, v136, 0), + v508, v136, 90); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v73, v77); + 
svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v133, v137); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v133, v137); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v139, v141); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v140, v144); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v144, v138); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v139, v141); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v141, v145); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v145, v139); + svfloat32_t zero193 = svdup_n_f32(0); + svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v404, v143, 90); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v146, v144); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v157, v145); + svfloat32_t zero215 = svdup_n_f32(0); + svfloat32_t v215 = svcmla_f32_x(pred_full, zero215, v408, v162, 90); + svfloat32_t zero222 = svdup_n_f32(0); + svfloat32_t v222 = svcmla_f32_x(pred_full, zero222, v409, v163, 90); + svfloat32_t zero229 = svdup_n_f32(0); + svfloat32_t v229 = svcmla_f32_x(pred_full, zero229, v410, v164, 90); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v147, v142); + svfloat32_t v174 = svmul_f32_x(svptrue_b32(), v147, v401); + svfloat32_t zero181 = svdup_n_f32(0); + svfloat32_t v181 = svcmla_f32_x(pred_full, zero181, v404, v158, 90); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v193, v215); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v193, v222); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v193, v215); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v148, v510); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v174, v174); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v243, v222); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v245, v229); + svfloat32_t v248 = 
svsub_f32_x(svptrue_b32(), v247, v229); + svfloat32_t v231 = svmla_f32_x(pred_full, v230, v147, v401); + svfloat32_t v235 = svmla_f32_x(pred_full, v156, v142, v403); + svst1_f64(pred_full, (double *)(v418), svreinterpret_f64_f32(v156)); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v156, v231); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v235, v230); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v232, v181); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v232, v181); + svfloat32_t v237 = svmla_f32_x(pred_full, v236, v159, v405); + svfloat32_t v239 = svmls_f32_x(pred_full, v236, v160, v406); + svfloat32_t v241 = svmls_f32_x(pred_full, v236, v159, v405); + svfloat32_t v238 = svmla_f32_x(pred_full, v237, v160, v406); + svfloat32_t v240 = svmla_f32_x(pred_full, v239, v161, v407); + svfloat32_t v242 = svmls_f32_x(pred_full, v241, v161, v407); + svst1_f64(pred_full, (double *)(v445), svreinterpret_f64_f32(v234)); + svst1_f64(pred_full, (double *)(v472), svreinterpret_f64_f32(v233)); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v253 = svadd_f32_x(svptrue_b32(), v242, v248); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v242, v248); + svst1_f64(pred_full, (double *)(v427), svreinterpret_f64_f32(v250)); + svst1_f64(pred_full, (double *)(v436), svreinterpret_f64_f32(v251)); + svst1_f64(pred_full, (double *)(v454), svreinterpret_f64_f32(v254)); + svst1_f64(pred_full, (double *)(v463), svreinterpret_f64_f32(v253)); + svst1_f64(pred_full, (double *)(v481), svreinterpret_f64_f32(v252)); + svst1_f64(pred_full, (double *)(v490), svreinterpret_f64_f32(v249)); v5 += v11; v6 += v12; } @@ -4070,16 +1712,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, int64_t v96 = v0 * 7; int64_t v107 = v0 * 5; int64_t v118 = v0 * 6; - 
float32x2_t v129 = v7[0]; - float32x2_t v133 = v7[9]; - float32x2_t v137 = v7[1]; - float32x2_t v141 = v7[8]; - float32x2_t v145 = v7[2]; - float32x2_t v149 = v7[7]; - float32x2_t v153 = v7[3]; - float32x2_t v157 = v7[6]; - float32x2_t v161 = v7[4]; - float32x2_t v165 = v7[5]; float v226 = v4 * v223; float v278 = v4 * v275; float v285 = v4 * v282; @@ -4113,26 +1745,26 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float32x2_t *v589 = &v6[0]; svfloat32_t v683 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v467)[0])); - svfloat32_t v130; - asm("mov %0.d, %d1" : "=w"(v130) : "w"(v129)); - svfloat32_t v134; - asm("mov %0.d, %d1" : "=w"(v134) : "w"(v133)); - svfloat32_t v138; - asm("mov %0.d, %d1" : "=w"(v138) : "w"(v137)); - svfloat32_t v142; - asm("mov %0.d, %d1" : "=w"(v142) : "w"(v141)); - svfloat32_t v146; - asm("mov %0.d, %d1" : "=w"(v146) : "w"(v145)); - svfloat32_t v150; - asm("mov %0.d, %d1" : "=w"(v150) : "w"(v149)); - svfloat32_t v154; - asm("mov %0.d, %d1" : "=w"(v154) : "w"(v153)); - svfloat32_t v158; - asm("mov %0.d, %d1" : "=w"(v158) : "w"(v157)); - svfloat32_t v162; - asm("mov %0.d, %d1" : "=w"(v162) : "w"(v161)); - svfloat32_t v166; - asm("mov %0.d, %d1" : "=w"(v166) : "w"(v165)); + svfloat32_t v130 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v134 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v138 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v142 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v146 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v150 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v154 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v158 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v162 = + 
svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v166 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); const float32x2_t *v476 = &v5[v30]; const float32x2_t *v485 = &v5[v41]; const float32x2_t *v494 = &v5[v52]; @@ -4163,8 +1795,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, float32x2_t *v670 = &v6[v448]; svfloat32_t v703 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v558)[0])); - svfloat32_t zero131; - asm volatile("mov %0.s, #0" : "=w"(zero131)); + svfloat32_t zero131 = svdup_n_f32(0); svfloat32_t v131 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero131, v683, v130, 0), v683, v130, 90); @@ -4186,953 +1817,156 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu11(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v539)[0])); svfloat32_t v701 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v548)[0])); - svfloat32_t zero135; - asm volatile("mov %0.s, #0" : "=w"(zero135)); + svfloat32_t zero135 = svdup_n_f32(0); svfloat32_t v135 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero135, v685, v134, 0), v685, v134, 90); - svfloat32_t zero139; - asm volatile("mov %0.s, #0" : "=w"(zero139)); + svfloat32_t zero139 = svdup_n_f32(0); svfloat32_t v139 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero139, v687, v138, 0), v687, v138, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v689, v142, 0), v689, v142, 90); - svfloat32_t zero147; - asm volatile("mov %0.s, #0" : "=w"(zero147)); + svfloat32_t zero147 = svdup_n_f32(0); svfloat32_t v147 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero147, v691, v146, 0), v691, v146, 90); - svfloat32_t zero151; - asm volatile("mov %0.s, #0" : "=w"(zero151)); + svfloat32_t zero151 = svdup_n_f32(0); svfloat32_t v151 = - 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero151, v693, v150, 0), - v693, v150, 90); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); - svfloat32_t v155 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero155, v695, v154, 0), - v695, v154, 90); - svfloat32_t zero159; - asm volatile("mov %0.s, #0" : "=w"(zero159)); - svfloat32_t v159 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero159, v697, v158, 0), - v697, v158, 90); - svfloat32_t zero163; - asm volatile("mov %0.s, #0" : "=w"(zero163)); - svfloat32_t v163 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero163, v699, v162, 0), - v699, v162, 90); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); - svfloat32_t v167 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero167, v701, v166, 0), - v701, v166, 90); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v131), "w"(v135)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v139), "w"(v143)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v147), "w"(v151)); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v155), "w"(v159)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v163), "w"(v167)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v131), "w"(v135)); - svfloat32_t v174; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v139), "w"(v143)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v147), "w"(v151)); - svfloat32_t v176; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v155), "w"(v159)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v163), "w"(v167)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v168), "w"(v169)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v170), "w"(v172)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v174), "w"(v175)); - svfloat32_t v182; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v173), "w"(v177)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v169), "w"(v171)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v168), "w"(v171)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v169), "w"(v168)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v172), "w"(v171)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v170), "w"(v171)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v172), "w"(v170)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v169), "w"(v172)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v168), "w"(v170)); - svfloat32_t v203; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v174), "w"(v176)); - svfloat32_t v204; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v173), "w"(v176)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v173), "w"(v174)); - svfloat32_t v206; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v176), "w"(v177)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v175), "w"(v176)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v175), "w"(v177)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v174), "w"(v177)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v173), "w"(v175)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v171), "w"(v178)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v181), "w"(v182)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v179), "w"(v178)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v181), "w"(v182)); - svfloat32_t v238; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v195), "w"(v565)); - svfloat32_t v243; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v196), 
"w"(v566)); - svfloat32_t v253; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v198), "w"(v568)); - svfloat32_t v258; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v199), "w"(v569)); - svfloat32_t zero280; - asm volatile("mov %0.s, #0" : "=w"(zero280)); - svfloat32_t v280 = svcmla_f32_x(pred_full, zero280, v573, v203, 90); - svfloat32_t zero294; - asm volatile("mov %0.s, #0" : "=w"(zero294)); - svfloat32_t v294 = svcmla_f32_x(pred_full, zero294, v575, v205, 90); - svfloat32_t zero301; - asm volatile("mov %0.s, #0" : "=w"(zero301)); - svfloat32_t v301 = svcmla_f32_x(pred_full, zero301, v576, v206, 90); - svfloat32_t zero315; - asm volatile("mov %0.s, #0" : "=w"(zero315)); - svfloat32_t v315 = svcmla_f32_x(pred_full, zero315, v578, v208, 90); - svfloat32_t zero322; - asm volatile("mov %0.s, #0" : "=w"(zero322)); - svfloat32_t v322 = svcmla_f32_x(pred_full, zero322, v579, v209, 90); - svfloat32_t v183; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v180), "w"(v179)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v176)); - svfloat32_t v273; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v202), "w"(v572)); - svfloat32_t zero336; - asm volatile("mov %0.s, #0" : "=w"(zero336)); - svfloat32_t v336 = svcmla_f32_x(pred_full, zero336, v581, v211, 90); - svfloat32_t v338 = svmla_f32_x(pred_full, v238, v194, v564); - svfloat32_t v339 = svmla_f32_x(pred_full, v243, v195, v565); - svfloat32_t v340 = svnmls_f32_x(pred_full, v243, v194, v564); - svfloat32_t v341 = svmla_f32_x(pred_full, v253, v197, v567); - svfloat32_t v342 = svmla_f32_x(pred_full, v258, v198, v568); - svfloat32_t v343 = svnmls_f32_x(pred_full, v258, v197, v567); - svfloat32_t v346 = svcmla_f32_x(pred_full, v294, v574, v204, 90); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v280), "w"(v294)); - svfloat32_t v348 = svcmla_f32_x(pred_full, v315, v577, v207, 90); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v301), "w"(v315)); - 
svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v703), "w"(v183)); - svfloat32_t zero228; - asm volatile("mov %0.s, #0" : "=w"(zero228)); - svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v563, v193, 90); - svfloat32_t v344 = svmla_f32_x(pred_full, v273, v201, v571); - svfloat32_t v345 = svmla_f32_x(pred_full, v273, v200, v570); - svfloat32_t v350 = svcmla_f32_x(pred_full, v336, v580, v210, 90); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v322), "w"(v336)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v346), "w"(v347)); - svfloat32_t v337 = svmls_f32_x(pred_full, v191, v183, v562); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v342), "w"(v344)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v228), "w"(v348)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v350), "w"(v346)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v228), "w"(v351)); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v351), "w"(v347)); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v348)); - svst1_f64(pred_full, (double *)(v589), svreinterpret_f64_f32(v191)); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v352), "w"(v337)); - svfloat32_t v354; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v337), "w"(v339)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v337), "w"(v343)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v337), "w"(v340)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v337), "w"(v338)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v362), "w"(v350)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v364), "w"(v228)); - svfloat32_t v367; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v366), "w"(v349)); - svfloat32_t v369; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v369) : "w"(v368), "w"(v228)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v371), "w"(v349)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v354), "w"(v344)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v356), "w"(v345)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v358), "w"(v345)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v360), "w"(v341)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v372), "w"(v228)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v353), "w"(v363)); - svfloat32_t v382; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v353), "w"(v363)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v361), "w"(v373)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v355), "w"(v365)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v357), "w"(v367)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v359), "w"(v369)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v359), "w"(v369)); - svfloat32_t v380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v357), "w"(v367)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v355), "w"(v365)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v361), "w"(v373)); - svst1_f64(pred_full, (double *)(v607), svreinterpret_f64_f32(v375)); - svst1_f64(pred_full, (double *)(v670), svreinterpret_f64_f32(v382)); - svst1_f64(pred_full, (double *)(v598), svreinterpret_f64_f32(v374)); - svst1_f64(pred_full, (double *)(v616), svreinterpret_f64_f32(v376)); - svst1_f64(pred_full, (double *)(v625), svreinterpret_f64_f32(v377)); - svst1_f64(pred_full, (double *)(v634), svreinterpret_f64_f32(v378)); - svst1_f64(pred_full, (double *)(v643), svreinterpret_f64_f32(v379)); - svst1_f64(pred_full, (double 
*)(v652), svreinterpret_f64_f32(v380)); - svst1_f64(pred_full, (double *)(v661), svreinterpret_f64_f32(v381)); - svst1_f64(pred_full, (double *)(v679), svreinterpret_f64_f32(v383)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu12(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v12 = howmany - 1; - int64_t v540 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v366 = 1.0000000000000000e+00F; - float v367 = -1.0000000000000000e+00F; - float v397 = -1.4999999999999998e+00F; - float v398 = 1.4999999999999998e+00F; - float v429 = 8.6602540378443871e-01F; - float32x2_t v432 = (float32x2_t){v4, v4}; - float v438 = -8.6602540378443871e-01F; - const float32x2_t *v1065 = &v5[istride]; - float32x2_t *v1142 = &v6[ostride]; - float32x2_t v368 = (float32x2_t){v366, v367}; - float32x2_t v394 = (float32x2_t){v397, v397}; - float32x2_t v399 = (float32x2_t){v397, v398}; - float32x2_t v431 = (float32x2_t){v429, v438}; - float32x2_t v439 = (float32x2_t){v438, v438}; - const float32x2_t *v1096 = &v5[0]; - float32x2_t *v1106 = &v6[0]; - float32x4_t v1225 = vld1q_f32((const float32_t *)v1065); - float32x4_t v66 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[6])); - float32x4_t v68 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[7])); - float32x4_t v78 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[14])); - float32x4_t v80 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[15])); - float32x4_t v128 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[12])); - float32x4_t v130 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[13])); - float32x4_t v140 = - 
vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[20])); - float32x4_t v142 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[21])); - float32x4_t v159 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[4])); - float32x4_t v161 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[5])); - float32x4_t v209 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[18])); - float32x4_t v211 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[19])); - float32x4_t v221 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[2])); - float32x4_t v223 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[3])); - float32x4_t v240 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[10])); - float32x4_t v242 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[11])); - float32x4_t v285 = vtrn1q_f32(v1225, v1225); - float32x4_t v286 = vtrn2q_f32(v1225, v1225); - float32x4_t v290 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); - float32x4_t v292 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[1])); - float32x4_t v302 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); - float32x4_t v304 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[9])); - float32x4_t v321 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[16])); - float32x4_t v323 = - vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[17])); - float32x2_t v370 = vmul_f32(v432, v368); - float32x4_t v395 = vcombine_f32(v394, v394); - float32x2_t v401 = vmul_f32(v432, v399); - float32x2_t v433 = vmul_f32(v432, v431); - float32x4_t v440 = vcombine_f32(v439, v439); - const float32x2_t *v979 = &v5[istride * 4]; - const float32x2_t *v989 = &v5[istride * 8]; - const float32x2_t *v1001 = &v5[istride * 7]; - const float32x2_t *v1011 = &v5[istride * 11]; - const float32x2_t *v1023 = &v5[istride * 3]; - const float32x2_t *v1033 = &v5[istride * 10]; - const float32x2_t 
*v1043 = &v5[istride * 2]; - const float32x2_t *v1055 = &v5[istride * 6]; - const float32x2_t *v1074 = &v5[istride * 5]; - const float32x2_t *v1085 = &v5[istride * 9]; - float32x2_t *v1115 = &v6[ostride * 4]; - float32x2_t *v1124 = &v6[ostride * 8]; - float32x2_t *v1133 = &v6[ostride * 9]; - float32x2_t *v1151 = &v6[ostride * 5]; - float32x2_t *v1160 = &v6[ostride * 6]; - float32x2_t *v1169 = &v6[ostride * 10]; - float32x2_t *v1178 = &v6[ostride * 2]; - float32x2_t *v1187 = &v6[ostride * 3]; - float32x2_t *v1196 = &v6[ostride * 7]; - float32x2_t *v1205 = &v6[ostride * 11]; - float32x4_t v1231 = vld1q_f32((const float32_t *)v1096); - float32x4_t v291 = vmulq_f32(v285, v290); - float32x4_t v372 = vcombine_f32(v370, v370); - float32x4_t v403 = vcombine_f32(v401, v401); - float32x4_t v435 = vcombine_f32(v433, v433); - float32x4_t v1209 = vld1q_f32((const float32_t *)v979); - float32x4_t v1211 = vld1q_f32((const float32_t *)v989); - float32x4_t v1213 = vld1q_f32((const float32_t *)v1001); - float32x4_t v1215 = vld1q_f32((const float32_t *)v1011); - float32x4_t v1217 = vld1q_f32((const float32_t *)v1023); - float32x4_t v1219 = vld1q_f32((const float32_t *)v1033); - float32x4_t v1221 = vld1q_f32((const float32_t *)v1043); - float32x4_t v1223 = vld1q_f32((const float32_t *)v1055); - float32x4_t v1227 = vld1q_f32((const float32_t *)v1074); - float32x4_t v1229 = vld1q_f32((const float32_t *)v1085); - float32x4_t v61 = vtrn1q_f32(v1209, v1209); - float32x4_t v62 = vtrn2q_f32(v1209, v1209); - float32x4_t v73 = vtrn1q_f32(v1211, v1211); - float32x4_t v74 = vtrn2q_f32(v1211, v1211); - float32x4_t v123 = vtrn1q_f32(v1213, v1213); - float32x4_t v124 = vtrn2q_f32(v1213, v1213); - float32x4_t v135 = vtrn1q_f32(v1215, v1215); - float32x4_t v136 = vtrn2q_f32(v1215, v1215); - float32x4_t v154 = vtrn1q_f32(v1217, v1217); - float32x4_t v155 = vtrn2q_f32(v1217, v1217); - float32x4_t v204 = vtrn1q_f32(v1219, v1219); - float32x4_t v205 = vtrn2q_f32(v1219, v1219); - float32x4_t v216 = 
vtrn1q_f32(v1221, v1221); - float32x4_t v217 = vtrn2q_f32(v1221, v1221); - float32x4_t v235 = vtrn1q_f32(v1223, v1223); - float32x4_t v236 = vtrn2q_f32(v1223, v1223); - float32x4_t v294 = vfmaq_f32(v291, v286, v292); - float32x4_t v297 = vtrn1q_f32(v1227, v1227); - float32x4_t v298 = vtrn2q_f32(v1227, v1227); - float32x4_t v316 = vtrn1q_f32(v1229, v1229); - float32x4_t v317 = vtrn2q_f32(v1229, v1229); - float32x4_t v67 = vmulq_f32(v61, v66); - float32x4_t v79 = vmulq_f32(v73, v78); - float32x4_t v129 = vmulq_f32(v123, v128); - float32x4_t v141 = vmulq_f32(v135, v140); - float32x4_t v160 = vmulq_f32(v154, v159); - float32x4_t v210 = vmulq_f32(v204, v209); - float32x4_t v222 = vmulq_f32(v216, v221); - float32x4_t v241 = vmulq_f32(v235, v240); - float32x4_t v303 = vmulq_f32(v297, v302); - float32x4_t v322 = vmulq_f32(v316, v321); - float32x4_t v70 = vfmaq_f32(v67, v62, v68); - float32x4_t v82 = vfmaq_f32(v79, v74, v80); - float32x4_t v132 = vfmaq_f32(v129, v124, v130); - float32x4_t v144 = vfmaq_f32(v141, v136, v142); - float32x4_t v163 = vfmaq_f32(v160, v155, v161); - float32x4_t v213 = vfmaq_f32(v210, v205, v211); - float32x4_t v225 = vfmaq_f32(v222, v217, v223); - float32x4_t v244 = vfmaq_f32(v241, v236, v242); - float32x4_t v306 = vfmaq_f32(v303, v298, v304); - float32x4_t v325 = vfmaq_f32(v322, v317, v323); - float32x4_t v326 = vaddq_f32(v70, v82); - float32x4_t v327 = vsubq_f32(v70, v82); - float32x4_t v336 = vaddq_f32(v132, v144); - float32x4_t v337 = vsubq_f32(v132, v144); - float32x4_t v339 = vaddq_f32(v213, v225); - float32x4_t v340 = vsubq_f32(v213, v225); - float32x4_t v342 = vaddq_f32(v294, v306); - float32x4_t v343 = vsubq_f32(v294, v306); - float32x4_t v335 = vaddq_f32(v326, v1231); - float32x4_t v338 = vaddq_f32(v336, v163); - float32x4_t v341 = vaddq_f32(v339, v244); - float32x4_t v344 = vaddq_f32(v342, v325); - float32x4_t v376 = vaddq_f32(v326, v339); - float32x4_t v377 = vsubq_f32(v326, v339); - float32x4_t v378 = vaddq_f32(v336, v342); - 
float32x4_t v379 = vsubq_f32(v336, v342); - float32x4_t v407 = vaddq_f32(v327, v340); - float32x4_t v408 = vsubq_f32(v327, v340); - float32x4_t v409 = vaddq_f32(v337, v343); - float32x4_t v410 = vsubq_f32(v337, v343); - float32x4_t v345 = vaddq_f32(v335, v341); - float32x4_t v346 = vsubq_f32(v335, v341); - float32x4_t v347 = vaddq_f32(v338, v344); - float32x4_t v348 = vsubq_f32(v338, v344); - float32x4_t v380 = vaddq_f32(v376, v378); - float32x4_t v381 = vsubq_f32(v376, v378); - float32x4_t v396 = vmulq_f32(v377, v395); - float32x4_t v402 = vrev64q_f32(v379); - float32x4_t v411 = vaddq_f32(v407, v409); - float32x4_t v412 = vsubq_f32(v407, v409); - float32x4_t v434 = vrev64q_f32(v408); - float32x4_t v441 = vmulq_f32(v410, v440); - float32x4_t v349 = vaddq_f32(v345, v347); - float32x4_t v350 = vsubq_f32(v345, v347); - float32x4_t v371 = vrev64q_f32(v348); - float32x4_t v386 = vmulq_f32(v380, v395); - float32x4_t v391 = vmulq_f32(v381, v395); - float32x4_t v404 = vmulq_f32(v402, v403); - float32x4_t v418 = vrev64q_f32(v411); - float32x4_t v426 = vrev64q_f32(v412); - float32x4_t v436 = vmulq_f32(v434, v435); - float32x4_t v373 = vmulq_f32(v371, v372); - float32x4_t v405 = vaddq_f32(v396, v404); - float32x4_t v406 = vsubq_f32(v396, v404); - float32x4_t v420 = vmulq_f32(v418, v435); - float32x4_t v428 = vmulq_f32(v426, v435); - float32x4_t v442 = vaddq_f32(v436, v441); - float32x4_t v443 = vsubq_f32(v436, v441); - float32x4_t v444 = vaddq_f32(v349, v386); - float32x4_t v492 = vaddq_f32(v350, v391); - vst1q_f32((float32_t *)v1106, v349); - vst1q_f32((float32_t *)v1160, v350); - float32x4_t v374 = vaddq_f32(v346, v373); - float32x4_t v375 = vsubq_f32(v346, v373); - float32x4_t v445 = vaddq_f32(v444, v420); - float32x4_t v446 = vsubq_f32(v444, v420); - float32x4_t v493 = vaddq_f32(v492, v428); - float32x4_t v494 = vsubq_f32(v492, v428); - float32x4_t v468 = vaddq_f32(v375, v406); - float32x4_t v516 = vaddq_f32(v374, v405); - vst1q_f32((float32_t *)v1115, v446); - 
vst1q_f32((float32_t *)v1124, v445); - vst1q_f32((float32_t *)v1133, v375); - vst1q_f32((float32_t *)v1169, v494); - vst1q_f32((float32_t *)v1178, v493); - vst1q_f32((float32_t *)v1187, v374); - float32x4_t v469 = vaddq_f32(v468, v443); - float32x4_t v470 = vsubq_f32(v468, v443); - float32x4_t v517 = vaddq_f32(v516, v442); - float32x4_t v518 = vsubq_f32(v516, v442); - vst1q_f32((float32_t *)v1142, v470); - vst1q_f32((float32_t *)v1151, v469); - vst1q_f32((float32_t *)v1196, v518); - vst1q_f32((float32_t *)v1205, v517); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v540 * 2; j < howmany; j += 1) { - float32x2_t v732 = v5[istride]; - float v828 = 1.0000000000000000e+00F; - float v829 = -1.0000000000000000e+00F; - float v855 = -1.4999999999999998e+00F; - float v856 = 1.4999999999999998e+00F; - float v884 = 8.6602540378443871e-01F; - float32x2_t v887 = (float32x2_t){v4, v4}; - float v892 = -8.6602540378443871e-01F; - float32x2_t v579 = v7[6]; - float32x2_t v584 = v7[7]; - float32x2_t v589 = v7[14]; - float32x2_t v594 = v7[15]; - float32x2_t v629 = v7[12]; - float32x2_t v634 = v7[13]; - float32x2_t v639 = v7[20]; - float32x2_t v644 = v7[21]; - float32x2_t v654 = v7[4]; - float32x2_t v659 = v7[5]; - float32x2_t v694 = v7[18]; - float32x2_t v699 = v7[19]; - float32x2_t v704 = v7[2]; - float32x2_t v709 = v7[3]; - float32x2_t v719 = v7[10]; - float32x2_t v724 = v7[11]; - float32x2_t v759 = v7[0]; - float32x2_t v760 = vtrn1_f32(v732, v732); - float32x2_t v761 = vtrn2_f32(v732, v732); - float32x2_t v764 = v7[1]; - float32x2_t v769 = v7[8]; - float32x2_t v774 = v7[9]; - float32x2_t v784 = v7[16]; - float32x2_t v789 = v7[17]; - float32x2_t v799 = v5[0]; - float32x2_t v830 = (float32x2_t){v828, v829}; - float32x2_t v853 = (float32x2_t){v855, v855}; - float32x2_t v857 = (float32x2_t){v855, v856}; - float32x2_t v886 = (float32x2_t){v884, v892}; - float32x2_t v893 = (float32x2_t){v892, v892}; - float32x2_t v552 = v5[istride * 4]; - float32x2_t v567 = v5[istride * 8]; - 
float32x2_t v602 = v5[istride * 7]; - float32x2_t v617 = v5[istride * 11]; - float32x2_t v652 = v5[istride * 3]; - float32x2_t v667 = v5[istride * 10]; - float32x2_t v682 = v5[istride * 2]; - float32x2_t v717 = v5[istride * 6]; - float32x2_t v747 = v5[istride * 5]; - float32x2_t v765 = vmul_f32(v760, v759); - float32x2_t v782 = v5[istride * 9]; - float32x2_t v832 = vmul_f32(v887, v830); - float32x2_t v859 = vmul_f32(v887, v857); - float32x2_t v888 = vmul_f32(v887, v886); - float32x2_t v580 = vtrn1_f32(v552, v552); - float32x2_t v581 = vtrn2_f32(v552, v552); - float32x2_t v590 = vtrn1_f32(v567, v567); - float32x2_t v591 = vtrn2_f32(v567, v567); - float32x2_t v630 = vtrn1_f32(v602, v602); - float32x2_t v631 = vtrn2_f32(v602, v602); - float32x2_t v640 = vtrn1_f32(v617, v617); - float32x2_t v641 = vtrn2_f32(v617, v617); - float32x2_t v655 = vtrn1_f32(v652, v652); - float32x2_t v656 = vtrn2_f32(v652, v652); - float32x2_t v695 = vtrn1_f32(v667, v667); - float32x2_t v696 = vtrn2_f32(v667, v667); - float32x2_t v705 = vtrn1_f32(v682, v682); - float32x2_t v706 = vtrn2_f32(v682, v682); - float32x2_t v720 = vtrn1_f32(v717, v717); - float32x2_t v721 = vtrn2_f32(v717, v717); - float32x2_t v767 = vfma_f32(v765, v761, v764); - float32x2_t v770 = vtrn1_f32(v747, v747); - float32x2_t v771 = vtrn2_f32(v747, v747); - float32x2_t v785 = vtrn1_f32(v782, v782); - float32x2_t v786 = vtrn2_f32(v782, v782); - float32x2_t v585 = vmul_f32(v580, v579); - float32x2_t v595 = vmul_f32(v590, v589); - float32x2_t v635 = vmul_f32(v630, v629); - float32x2_t v645 = vmul_f32(v640, v639); - float32x2_t v660 = vmul_f32(v655, v654); - float32x2_t v700 = vmul_f32(v695, v694); - float32x2_t v710 = vmul_f32(v705, v704); - float32x2_t v725 = vmul_f32(v720, v719); - float32x2_t v775 = vmul_f32(v770, v769); - float32x2_t v790 = vmul_f32(v785, v784); - float32x2_t v587 = vfma_f32(v585, v581, v584); - float32x2_t v597 = vfma_f32(v595, v591, v594); - float32x2_t v637 = vfma_f32(v635, v631, v634); - float32x2_t 
v647 = vfma_f32(v645, v641, v644); - float32x2_t v662 = vfma_f32(v660, v656, v659); - float32x2_t v702 = vfma_f32(v700, v696, v699); - float32x2_t v712 = vfma_f32(v710, v706, v709); - float32x2_t v727 = vfma_f32(v725, v721, v724); - float32x2_t v777 = vfma_f32(v775, v771, v774); - float32x2_t v792 = vfma_f32(v790, v786, v789); - float32x2_t v793 = vadd_f32(v587, v597); - float32x2_t v794 = vsub_f32(v587, v597); - float32x2_t v801 = vadd_f32(v637, v647); - float32x2_t v802 = vsub_f32(v637, v647); - float32x2_t v804 = vadd_f32(v702, v712); - float32x2_t v805 = vsub_f32(v702, v712); - float32x2_t v807 = vadd_f32(v767, v777); - float32x2_t v808 = vsub_f32(v767, v777); - float32x2_t v800 = vadd_f32(v793, v799); - float32x2_t v803 = vadd_f32(v801, v662); - float32x2_t v806 = vadd_f32(v804, v727); - float32x2_t v809 = vadd_f32(v807, v792); - float32x2_t v837 = vadd_f32(v793, v804); - float32x2_t v838 = vsub_f32(v793, v804); - float32x2_t v839 = vadd_f32(v801, v807); - float32x2_t v840 = vsub_f32(v801, v807); - float32x2_t v864 = vadd_f32(v794, v805); - float32x2_t v865 = vsub_f32(v794, v805); - float32x2_t v866 = vadd_f32(v802, v808); - float32x2_t v867 = vsub_f32(v802, v808); - float32x2_t v810 = vadd_f32(v800, v806); - float32x2_t v811 = vsub_f32(v800, v806); - float32x2_t v812 = vadd_f32(v803, v809); - float32x2_t v813 = vsub_f32(v803, v809); - float32x2_t v841 = vadd_f32(v837, v839); - float32x2_t v842 = vsub_f32(v837, v839); - float32x2_t v854 = vmul_f32(v838, v853); - float32x2_t v860 = vrev64_f32(v840); - float32x2_t v868 = vadd_f32(v864, v866); - float32x2_t v869 = vsub_f32(v864, v866); - float32x2_t v889 = vrev64_f32(v865); - float32x2_t v894 = vmul_f32(v867, v893); - float32x2_t v814 = vadd_f32(v810, v812); - float32x2_t v815 = vsub_f32(v810, v812); - float32x2_t v833 = vrev64_f32(v813); - float32x2_t v846 = vmul_f32(v841, v853); - float32x2_t v850 = vmul_f32(v842, v853); - float32x2_t v861 = vmul_f32(v860, v859); - float32x2_t v875 = vrev64_f32(v868); - 
float32x2_t v882 = vrev64_f32(v869); - float32x2_t v890 = vmul_f32(v889, v888); - float32x2_t v834 = vmul_f32(v833, v832); - float32x2_t v862 = vadd_f32(v854, v861); - float32x2_t v863 = vsub_f32(v854, v861); - float32x2_t v876 = vmul_f32(v875, v888); - float32x2_t v883 = vmul_f32(v882, v888); - float32x2_t v895 = vadd_f32(v890, v894); - float32x2_t v896 = vsub_f32(v890, v894); - float32x2_t v897 = vadd_f32(v814, v846); - v6[0] = v814; - float32x2_t v933 = vadd_f32(v815, v850); - v6[ostride * 6] = v815; - float32x2_t v835 = vadd_f32(v811, v834); - float32x2_t v836 = vsub_f32(v811, v834); - float32x2_t v898 = vadd_f32(v897, v876); - float32x2_t v899 = vsub_f32(v897, v876); - float32x2_t v934 = vadd_f32(v933, v883); - float32x2_t v935 = vsub_f32(v933, v883); - v6[ostride * 4] = v899; - v6[ostride * 8] = v898; - float32x2_t v915 = vadd_f32(v836, v863); - v6[ostride * 9] = v836; - v6[ostride * 10] = v935; - v6[ostride * 2] = v934; - float32x2_t v951 = vadd_f32(v835, v862); - v6[ostride * 3] = v835; - float32x2_t v916 = vadd_f32(v915, v896); - float32x2_t v917 = vsub_f32(v915, v896); - float32x2_t v952 = vadd_f32(v951, v895); - float32x2_t v953 = vsub_f32(v951, v895); - v6[ostride] = v917; - v6[ostride * 5] = v916; - v6[ostride * 7] = v953; - v6[ostride * 11] = v952; - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cf32_cf32_cf32_ac_t_uu12(const armral_cmplx_f32_t *restrict x, - armral_cmplx_f32_t *restrict y, - int istride, int ostride, - const armral_cmplx_f32_t *restrict w, - int howmany, float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const float32x2_t *v5 = (const float32x2_t *)x; - float32x2_t *v6 = (float32x2_t *)y; - const float32x2_t *v7 = (const float32x2_t *)w; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v212 = 
-1.0000000000000000e+00F; - float v237 = -1.4999999999999998e+00F; - float v242 = 1.4999999999999998e+00F; - float v278 = -8.6602540378443871e-01F; - const float32x2_t *v458 = &v5[v0]; - float32x2_t *v544 = &v6[v2]; - int64_t v19 = v0 * 4; - int64_t v30 = v0 * 8; - float32x2_t v41 = v7[3]; - float32x2_t v45 = v7[7]; - int64_t v49 = v0 * 7; - int64_t v60 = v0 * 11; - float32x2_t v71 = v7[6]; - float32x2_t v75 = v7[10]; - int64_t v79 = v0 * 3; - float32x2_t v86 = v7[2]; - int64_t v90 = v0 * 10; - int64_t v101 = v0 * 2; - float32x2_t v112 = v7[9]; - float32x2_t v116 = v7[1]; - int64_t v120 = v0 * 6; - float32x2_t v127 = v7[5]; - int64_t v142 = v0 * 5; - float32x2_t v153 = v7[0]; - float32x2_t v157 = v7[4]; - int64_t v161 = v0 * 9; - float32x2_t v168 = v7[8]; - float v215 = v4 * v212; - float v245 = v4 * v242; - float v274 = v4 * v278; - int64_t v295 = v2 * 4; - int64_t v302 = v2 * 8; - int64_t v312 = v2 * 9; - int64_t v326 = v2 * 5; - int64_t v336 = v2 * 6; - int64_t v343 = v2 * 10; - int64_t v350 = v2 * 2; - int64_t v360 = v2 * 3; - int64_t v367 = v2 * 7; - int64_t v374 = v2 * 11; - const float32x2_t *v486 = &v5[0]; - svfloat32_t v495 = svdup_n_f32(v237); - svfloat32_t v500 = svdup_n_f32(v278); - float32x2_t *v508 = &v6[0]; - svfloat32_t v627 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v458)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v113; - asm("mov %0.d, %d1" : "=w"(v113) : "w"(v112)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v128; - asm("mov %0.d, %d1" : "=w"(v128) : "w"(v127)); - svfloat32_t v154; - asm("mov %0.d, %d1" : "=w"(v154) : "w"(v153)); - svfloat32_t v158; - asm("mov %0.d, %d1" : "=w"(v158) 
: "w"(v157)); - svfloat32_t v169; - asm("mov %0.d, %d1" : "=w"(v169) : "w"(v168)); - const float32x2_t *v386 = &v5[v19]; - const float32x2_t *v395 = &v5[v30]; - const float32x2_t *v404 = &v5[v49]; - const float32x2_t *v413 = &v5[v60]; - const float32x2_t *v422 = &v5[v79]; - const float32x2_t *v431 = &v5[v90]; - const float32x2_t *v440 = &v5[v101]; - const float32x2_t *v449 = &v5[v120]; - const float32x2_t *v467 = &v5[v142]; - const float32x2_t *v476 = &v5[v161]; - svfloat32_t v492 = svdup_n_f32(v215); - svfloat32_t v496 = svdup_n_f32(v245); - svfloat32_t v499 = svdup_n_f32(v274); - float32x2_t *v517 = &v6[v295]; - float32x2_t *v526 = &v6[v302]; - float32x2_t *v535 = &v6[v312]; - float32x2_t *v553 = &v6[v326]; - float32x2_t *v562 = &v6[v336]; - float32x2_t *v571 = &v6[v343]; - float32x2_t *v580 = &v6[v350]; - float32x2_t *v589 = &v6[v360]; - float32x2_t *v598 = &v6[v367]; - float32x2_t *v607 = &v6[v374]; - svfloat32_t v633 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v486)[0])); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero151, v693, v150, 0), + v693, v150, 90); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero155, v627, v154, 0), - v627, v154, 90); - svfloat32_t v611 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v386)[0])); - svfloat32_t v613 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v395)[0])); - svfloat32_t v615 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v404)[0])); - svfloat32_t v617 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v413)[0])); - svfloat32_t v619 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v422)[0])); - svfloat32_t v621 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v431)[0])); - svfloat32_t v623 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double 
*)v440)[0])); - svfloat32_t v625 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v449)[0])); - svfloat32_t v629 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v467)[0])); - svfloat32_t v631 = - svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v476)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); - svfloat32_t v43 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v611, v42, 0), - v611, v42, 90); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); - svfloat32_t v47 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v613, v46, 0), - v613, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); - svfloat32_t v73 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v615, v72, 0), - v615, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); - svfloat32_t v77 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v617, v76, 0), - v617, v76, 90); - svfloat32_t zero114; - asm volatile("mov %0.s, #0" : "=w"(zero114)); - svfloat32_t v114 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero114, v621, v113, 0), - v621, v113, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); - svfloat32_t v118 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero118, v623, v117, 0), - v623, v117, 90); - svfloat32_t zero159; - asm volatile("mov %0.s, #0" : "=w"(zero159)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero155, v695, v154, 0), + v695, v154, 90); + svfloat32_t zero159 = svdup_n_f32(0); svfloat32_t v159 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero159, v629, v158, 0), - v629, v158, 90); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v43), "w"(v47)); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v43), "w"(v47)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v73), "w"(v77)); - svfloat32_t v182; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v73), "w"(v77)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v114), "w"(v118)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v114), "w"(v118)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v155), "w"(v159)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v155), "w"(v159)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v171), "w"(v633)); - svfloat32_t v183 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, v181, v619, v87, 0), v619, v87, 90); - svfloat32_t v186 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v184, v625, v128, 0), - v625, v128, 90); - svfloat32_t v189 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v187, v631, v169, 0), - v631, v169, 90); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v171), "w"(v184)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v171), "w"(v184)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v181), "w"(v187)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v181), "w"(v187)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v172), "w"(v185)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v172), "w"(v185)); - svfloat32_t v252; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v182), "w"(v188)); - svfloat32_t v253; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v182), "w"(v188)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v180), "w"(v186)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v180), "w"(v186)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v183), "w"(v189)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v183), "w"(v189)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v220), "w"(v222)); - svfloat32_t 
v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v220), "w"(v222)); - svfloat32_t zero247; - asm volatile("mov %0.s, #0" : "=w"(zero247)); - svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v496, v223, 90); - svfloat32_t v254; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v250), "w"(v252)); - svfloat32_t v255; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v250), "w"(v252)); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); - svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v499, v251, 90); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v190), "w"(v192)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v190), "w"(v192)); - svfloat32_t zero217; - asm volatile("mov %0.s, #0" : "=w"(zero217)); - svfloat32_t v217 = svcmla_f32_x(pred_full, zero217, v492, v193, 90); - svfloat32_t v248 = svmla_f32_x(pred_full, v247, v221, v495); - svfloat32_t v249 = svnmls_f32_x(pred_full, v247, v221, v495); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); - svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v499, v254, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); - svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v499, v255, 90); - svfloat32_t v282 = svmla_f32_x(pred_full, v276, v253, v500); - svfloat32_t v283 = svmls_f32_x(pred_full, v276, v253, v500); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v191), "w"(v217)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v191), "w"(v217)); - svfloat32_t v284 = svmla_f32_x(pred_full, v194, v224, v495); - svfloat32_t v332 = svmla_f32_x(pred_full, v195, v225, v495); - svst1_f64(pred_full, (double *)(v508), svreinterpret_f64_f32(v194)); - svst1_f64(pred_full, (double *)(v562), svreinterpret_f64_f32(v195)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v262)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v284), 
"w"(v262)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v219), "w"(v249)); - svfloat32_t v333; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v332), "w"(v269)); - svfloat32_t v334; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v332), "w"(v269)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v218), "w"(v248)); - svst1_f64(pred_full, (double *)(v535), svreinterpret_f64_f32(v219)); - svst1_f64(pred_full, (double *)(v589), svreinterpret_f64_f32(v218)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v308), "w"(v283)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v308), "w"(v283)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v356), "w"(v282)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v356), "w"(v282)); - svst1_f64(pred_full, (double *)(v517), svreinterpret_f64_f32(v286)); - svst1_f64(pred_full, (double *)(v526), svreinterpret_f64_f32(v285)); - svst1_f64(pred_full, (double *)(v571), svreinterpret_f64_f32(v334)); - svst1_f64(pred_full, (double *)(v580), svreinterpret_f64_f32(v333)); - svst1_f64(pred_full, (double *)(v544), svreinterpret_f64_f32(v310)); - svst1_f64(pred_full, (double *)(v553), svreinterpret_f64_f32(v309)); - svst1_f64(pred_full, (double *)(v598), svreinterpret_f64_f32(v358)); - svst1_f64(pred_full, (double *)(v607), svreinterpret_f64_f32(v357)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero159, v697, v158, 0), + v697, v158, 90); + svfloat32_t zero163 = svdup_n_f32(0); + svfloat32_t v163 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero163, v699, v162, 0), + v699, v162, 90); + svfloat32_t zero167 = svdup_n_f32(0); + svfloat32_t v167 = + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero167, v701, v166, 0), + v701, v166, 90); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v131, v135); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v139, v143); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), 
v147, v151); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v163, v167); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v131, v135); + svfloat32_t v174 = svsub_f32_x(svptrue_b32(), v139, v143); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v147, v151); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v163, v167); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v170, v172); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v174, v175); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v173, v177); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v169, v171); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v168, v171); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v169, v168); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v172, v171); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v172, v170); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v169, v172); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v168, v170); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v174, v176); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v173, v176); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v173, v174); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v176, v177); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v175, v176); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v175, v177); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v174, v177); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v173, v175); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v171, v178); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v181, v182); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v179, v178); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v181, v182); + svfloat32_t v238 = svmul_f32_x(svptrue_b32(), v195, v565); + svfloat32_t v243 = 
svmul_f32_x(svptrue_b32(), v196, v566); + svfloat32_t v253 = svmul_f32_x(svptrue_b32(), v198, v568); + svfloat32_t v258 = svmul_f32_x(svptrue_b32(), v199, v569); + svfloat32_t zero280 = svdup_n_f32(0); + svfloat32_t v280 = svcmla_f32_x(pred_full, zero280, v573, v203, 90); + svfloat32_t zero294 = svdup_n_f32(0); + svfloat32_t v294 = svcmla_f32_x(pred_full, zero294, v575, v205, 90); + svfloat32_t zero301 = svdup_n_f32(0); + svfloat32_t v301 = svcmla_f32_x(pred_full, zero301, v576, v206, 90); + svfloat32_t zero315 = svdup_n_f32(0); + svfloat32_t v315 = svcmla_f32_x(pred_full, zero315, v578, v208, 90); + svfloat32_t zero322 = svdup_n_f32(0); + svfloat32_t v322 = svcmla_f32_x(pred_full, zero322, v579, v209, 90); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v180, v179); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v192, v176); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v202, v572); + svfloat32_t zero336 = svdup_n_f32(0); + svfloat32_t v336 = svcmla_f32_x(pred_full, zero336, v581, v211, 90); + svfloat32_t v338 = svmla_f32_x(pred_full, v238, v194, v564); + svfloat32_t v339 = svmla_f32_x(pred_full, v243, v195, v565); + svfloat32_t v340 = svnmls_f32_x(pred_full, v243, v194, v564); + svfloat32_t v341 = svmla_f32_x(pred_full, v253, v197, v567); + svfloat32_t v342 = svmla_f32_x(pred_full, v258, v198, v568); + svfloat32_t v343 = svnmls_f32_x(pred_full, v258, v197, v567); + svfloat32_t v346 = svcmla_f32_x(pred_full, v294, v574, v204, 90); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v280, v294); + svfloat32_t v348 = svcmla_f32_x(pred_full, v315, v577, v207, 90); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v301, v315); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v703, v183); + svfloat32_t zero228 = svdup_n_f32(0); + svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v563, v193, 90); + svfloat32_t v344 = svmla_f32_x(pred_full, v273, v201, v571); + svfloat32_t v345 = svmla_f32_x(pred_full, v273, v200, v570); + svfloat32_t v350 = svcmla_f32_x(pred_full, v336, 
v580, v210, 90); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v322, v336); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v346, v347); + svfloat32_t v337 = svmls_f32_x(pred_full, v191, v183, v562); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v342, v344); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v228, v348); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v350, v346); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v228, v351); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v351, v347); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v348); + svst1_f64(pred_full, (double *)(v589), svreinterpret_f64_f32(v191)); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v352, v337); + svfloat32_t v354 = svsub_f32_x(svptrue_b32(), v337, v339); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v337, v343); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v337, v340); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v337, v338); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v362, v350); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v364, v228); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v366, v349); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v368, v228); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v371, v349); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v354, v344); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v356, v345); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v358, v345); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v360, v341); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v372, v228); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v353, v363); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v353, v363); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v361, v373); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v355, v365); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v357, v367); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v359, v369); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v359, v369); + svfloat32_t v380 
= svadd_f32_x(svptrue_b32(), v357, v367); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v355, v365); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v361, v373); + svst1_f64(pred_full, (double *)(v607), svreinterpret_f64_f32(v375)); + svst1_f64(pred_full, (double *)(v670), svreinterpret_f64_f32(v382)); + svst1_f64(pred_full, (double *)(v598), svreinterpret_f64_f32(v374)); + svst1_f64(pred_full, (double *)(v616), svreinterpret_f64_f32(v376)); + svst1_f64(pred_full, (double *)(v625), svreinterpret_f64_f32(v377)); + svst1_f64(pred_full, (double *)(v634), svreinterpret_f64_f32(v378)); + svst1_f64(pred_full, (double *)(v643), svreinterpret_f64_f32(v379)); + svst1_f64(pred_full, (double *)(v652), svreinterpret_f64_f32(v380)); + svst1_f64(pred_full, (double *)(v661), svreinterpret_f64_f32(v381)); + svst1_f64(pred_full, (double *)(v679), svreinterpret_f64_f32(v383)); v5 += v11; v6 += v12; } @@ -5861,18 +2695,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, int64_t v118 = v0 * 8; int64_t v129 = v0 * 6; int64_t v140 = v0 * 7; - float32x2_t v151 = v7[0]; - float32x2_t v155 = v7[11]; - float32x2_t v159 = v7[1]; - float32x2_t v163 = v7[10]; - float32x2_t v167 = v7[2]; - float32x2_t v171 = v7[9]; - float32x2_t v175 = v7[3]; - float32x2_t v179 = v7[8]; - float32x2_t v183 = v7[4]; - float32x2_t v187 = v7[7]; - float32x2_t v191 = v7[5]; - float32x2_t v195 = v7[6]; float v267 = v4 * v264; float v274 = v4 * v271; float v281 = v4 * v278; @@ -5908,30 +2730,30 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, float32x2_t *v665 = &v6[0]; svfloat32_t v777 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v525)[0])); - svfloat32_t v152; - asm("mov %0.d, %d1" : "=w"(v152) : "w"(v151)); - svfloat32_t v156; - asm("mov %0.d, %d1" : "=w"(v156) : "w"(v155)); - svfloat32_t v160; - asm("mov %0.d, %d1" : "=w"(v160) : "w"(v159)); - svfloat32_t v164; - asm("mov %0.d, %d1" : "=w"(v164) : "w"(v163)); - svfloat32_t v168; - 
asm("mov %0.d, %d1" : "=w"(v168) : "w"(v167)); - svfloat32_t v172; - asm("mov %0.d, %d1" : "=w"(v172) : "w"(v171)); - svfloat32_t v176; - asm("mov %0.d, %d1" : "=w"(v176) : "w"(v175)); - svfloat32_t v180; - asm("mov %0.d, %d1" : "=w"(v180) : "w"(v179)); - svfloat32_t v184; - asm("mov %0.d, %d1" : "=w"(v184) : "w"(v183)); - svfloat32_t v188; - asm("mov %0.d, %d1" : "=w"(v188) : "w"(v187)); - svfloat32_t v192; - asm("mov %0.d, %d1" : "=w"(v192) : "w"(v191)); - svfloat32_t v196; - asm("mov %0.d, %d1" : "=w"(v196) : "w"(v195)); + svfloat32_t v152 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v156 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v160 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v164 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v168 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v172 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v176 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v180 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v184 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v188 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v192 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v196 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); const float32x2_t *v534 = &v5[v30]; const float32x2_t *v543 = &v5[v41]; const float32x2_t *v552 = &v5[v52]; @@ -5968,8 +2790,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, float32x2_t *v764 = &v6[v506]; svfloat32_t v801 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v634)[0])); - svfloat32_t zero153; - asm volatile("mov %0.s, #0" : "=w"(zero153)); + svfloat32_t zero153 = 
svdup_n_f32(0); svfloat32_t v153 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero153, v777, v152, 0), v777, v152, 90); @@ -5995,273 +2816,169 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu13(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v615)[0])); svfloat32_t v799 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v624)[0])); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v779, v156, 0), v779, v156, 90); - svfloat32_t zero161; - asm volatile("mov %0.s, #0" : "=w"(zero161)); + svfloat32_t zero161 = svdup_n_f32(0); svfloat32_t v161 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero161, v781, v160, 0), v781, v160, 90); - svfloat32_t zero165; - asm volatile("mov %0.s, #0" : "=w"(zero165)); + svfloat32_t zero165 = svdup_n_f32(0); svfloat32_t v165 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero165, v783, v164, 0), v783, v164, 90); - svfloat32_t zero169; - asm volatile("mov %0.s, #0" : "=w"(zero169)); + svfloat32_t zero169 = svdup_n_f32(0); svfloat32_t v169 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero169, v785, v168, 0), v785, v168, 90); - svfloat32_t zero173; - asm volatile("mov %0.s, #0" : "=w"(zero173)); + svfloat32_t zero173 = svdup_n_f32(0); svfloat32_t v173 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero173, v787, v172, 0), v787, v172, 90); - svfloat32_t zero177; - asm volatile("mov %0.s, #0" : "=w"(zero177)); + svfloat32_t zero177 = svdup_n_f32(0); svfloat32_t v177 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero177, v789, v176, 0), v789, v176, 90); - svfloat32_t zero181; - asm volatile("mov %0.s, #0" : "=w"(zero181)); + svfloat32_t zero181 = svdup_n_f32(0); svfloat32_t v181 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero181, v791, v180, 0), v791, v180, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : 
"=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v793, v184, 0), v793, v184, 90); - svfloat32_t zero189; - asm volatile("mov %0.s, #0" : "=w"(zero189)); + svfloat32_t zero189 = svdup_n_f32(0); svfloat32_t v189 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero189, v795, v188, 0), v795, v188, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); + svfloat32_t zero193 = svdup_n_f32(0); svfloat32_t v193 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero193, v797, v192, 0), v797, v192, 90); - svfloat32_t zero197; - asm volatile("mov %0.s, #0" : "=w"(zero197)); + svfloat32_t zero197 = svdup_n_f32(0); svfloat32_t v197 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero197, v799, v196, 0), v799, v196, 90); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v153), "w"(v157)); - svfloat32_t v199; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v161), "w"(v165)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v169), "w"(v173)); - svfloat32_t v201; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v177), "w"(v181)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v185), "w"(v189)); - svfloat32_t v203; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v193), "w"(v197)); - svfloat32_t v204; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v153), "w"(v157)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v161), "w"(v165)); - svfloat32_t v206; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v169), "w"(v173)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v177), "w"(v181)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v185), "w"(v189)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v193), "w"(v197)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v199), "w"(v202)); - svfloat32_t v212; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v198), "w"(v200)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v205), "w"(v208)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v204), "w"(v206)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v199), "w"(v203)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v200), "w"(v201)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v198), "w"(v201)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v202), "w"(v203)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v205), "w"(v209)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v204), "w"(v206)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v205), "w"(v208)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v204), "w"(v207)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v208), "w"(v209)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v206), "w"(v207)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v210), "w"(v203)); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v212), "w"(v201)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v215), "w"(v209)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v217), "w"(v207)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v219), "w"(v220)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v221), "w"(v222)); - svfloat32_t v225; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v219), "w"(v220)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v221), "w"(v222)); - svfloat32_t v245; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v227), "w"(v228)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v229), 
"w"(v230)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v231), "w"(v232)); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v153, v157); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v161, v165); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v169, v173); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v177, v181); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v185, v189); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v193, v197); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v153, v157); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v161, v165); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v169, v173); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v177, v181); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v185, v189); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v193, v197); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v199, v202); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v205, v208); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v204, v206); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v199, v203); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v200, v201); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v198, v201); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v202, v203); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v205, v209); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v204, v206); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v205, v208); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v204, v207); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v208, v209); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v206, v207); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v210, v203); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v212, v201); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v215, v209); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v217, 
v207); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v219, v220); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v221, v222); + svfloat32_t v225 = svadd_f32_x(svptrue_b32(), v219, v220); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v221, v222); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v227, v228); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v229, v230); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v231, v232); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v649, v227, 90); - svfloat32_t zero327; - asm volatile("mov %0.s, #0" : "=w"(zero327)); + svfloat32_t zero327 = svdup_n_f32(0); svfloat32_t v327 = svcmla_f32_x(pred_full, zero327, v650, v228, 90); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v652, v229, 90); - svfloat32_t zero348; - asm volatile("mov %0.s, #0" : "=w"(zero348)); + svfloat32_t zero348 = svdup_n_f32(0); svfloat32_t v348 = svcmla_f32_x(pred_full, zero348, v653, v230, 90); - svfloat32_t zero362; - asm volatile("mov %0.s, #0" : "=w"(zero362)); + svfloat32_t zero362 = svdup_n_f32(0); svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v655, v231, 90); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v211), "w"(v213)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v213), "w"(v211)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v216), "w"(v218)); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v223), "w"(v224)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v225), "w"(v226)); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v211, v213); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v213, v211); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v216, v218); + svfloat32_t v243 = 
svadd_f32_x(svptrue_b32(), v223, v224); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v225, v226); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v640, v216, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); + svfloat32_t zero276 = svdup_n_f32(0); svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v641, v218, 90); - svfloat32_t v288; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v223), "w"(v643)); - svfloat32_t zero334; - asm volatile("mov %0.s, #0" : "=w"(zero334)); + svfloat32_t v288 = svmul_f32_x(svptrue_b32(), v223, v643); + svfloat32_t zero334 = svdup_n_f32(0); svfloat32_t v334 = svcmla_f32_x(pred_full, zero334, v651, v245, 90); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v654, v246, 90); - svfloat32_t zero376; - asm volatile("mov %0.s, #0" : "=w"(zero376)); + svfloat32_t zero376 = svdup_n_f32(0); svfloat32_t v376 = svcmla_f32_x(pred_full, zero376, v657, v247, 90); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v801), "w"(v214)); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v801, v214); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, zero283, v642, v242, 90); - svfloat32_t v298; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v243), "w"(v645)); + svfloat32_t v298 = svmul_f32_x(svptrue_b32(), v243, v645); svfloat32_t v378 = svmla_f32_x(pred_full, v288, v224, v644); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v320), "w"(v334)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v327), "w"(v334)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v341), "w"(v355)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v348), "w"(v355)); - svfloat32_t v394; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v362), "w"(v376)); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v320, v334); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v327, v334); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v348, v355); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v362, v376); svfloat32_t v395 = svcmla_f32_x(pred_full, v376, v656, v232, 90); svfloat32_t v377 = svmls_f32_x(pred_full, v240, v214, v638); svfloat32_t v379 = svmls_f32_x(pred_full, v378, v241, v639); svfloat32_t v380 = svmla_f32_x(pred_full, v298, v224, v644); svfloat32_t v382 = svnmls_f32_x(pred_full, v288, v243, v645); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v269), "w"(v283)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v276), "w"(v283)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v390), "w"(v394)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v392), "w"(v394)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v391), "w"(v395)); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v269, v283); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v276, v283); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v390, v394); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v392, v394); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v391, v395); svst1_f64(pred_full, (double *)(v665), svreinterpret_f64_f32(v240)); svfloat32_t v381 = svmla_f32_x(pred_full, v380, v241, v639); svfloat32_t v383 = svmls_f32_x(pred_full, v382, v241, v639); svfloat32_t v384 = svmla_f32_x(pred_full, v377, v225, v646); svfloat32_t v386 = svmls_f32_x(pred_full, v377, v226, v647); svfloat32_t v388 = svmls_f32_x(pred_full, v377, v225, v646); - svfloat32_t v404; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v397), "w"(v390)); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v395), "w"(v396)); - svfloat32_t v409; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v397)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v410), "w"(v397)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v396)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v396), "w"(v391)); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v397, v390); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v395, v396); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v397); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v410, v397); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v412, v396); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v396, v391); svfloat32_t v385 = svmla_f32_x(pred_full, v384, v226, v647); svfloat32_t v387 = svmls_f32_x(pred_full, v386, v244, v648); svfloat32_t v389 = svmla_f32_x(pred_full, v388, v244, v648); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v404), "w"(v392)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v406), "w"(v393)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v414), "w"(v393)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v379), "w"(v385)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v381), "w"(v387)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v387), "w"(v381)); - svfloat32_t v401; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v383), "w"(v389)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v385), "w"(v379)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v389), "w"(v383)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v398), "w"(v405)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v399), "w"(v407)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v400), "w"(v409)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : 
"w"(v401), "w"(v411)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v402), "w"(v413)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v403), "w"(v415)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v403), "w"(v415)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v402), "w"(v413)); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v401), "w"(v411)); - svfloat32_t v425; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v400), "w"(v409)); - svfloat32_t v426; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v399), "w"(v407)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v398), "w"(v405)); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v404, v392); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v406, v393); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v414, v393); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v379, v385); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v381, v387); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v387, v381); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v385, v379); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v389, v383); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v398, v405); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v399, v407); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v400, v409); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v401, v411); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v402, v413); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v403, v415); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v403, v415); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v402, v413); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v401, v411); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v400, v409); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v399, v407); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), 
v398, v405); svst1_f64(pred_full, (double *)(v674), svreinterpret_f64_f32(v416)); svst1_f64(pred_full, (double *)(v683), svreinterpret_f64_f32(v417)); svst1_f64(pred_full, (double *)(v692), svreinterpret_f64_f32(v418)); @@ -6876,30 +3593,17 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, const float32x2_t *v588 = &v5[v0]; float32x2_t *v689 = &v6[v2]; int64_t v19 = v0 * 7; - float32x2_t v30 = v7[6]; int64_t v34 = v0 * 2; int64_t v45 = v0 * 9; - float32x2_t v56 = v7[1]; - float32x2_t v60 = v7[8]; int64_t v64 = v0 * 4; int64_t v75 = v0 * 11; - float32x2_t v86 = v7[3]; - float32x2_t v90 = v7[10]; int64_t v94 = v0 * 6; int64_t v105 = v0 * 13; - float32x2_t v116 = v7[5]; - float32x2_t v120 = v7[12]; int64_t v124 = v0 * 8; - float32x2_t v146 = v7[7]; - float32x2_t v150 = v7[0]; int64_t v154 = v0 * 10; int64_t v165 = v0 * 3; - float32x2_t v176 = v7[9]; - float32x2_t v180 = v7[2]; int64_t v184 = v0 * 12; int64_t v195 = v0 * 5; - float32x2_t v206 = v7[11]; - float32x2_t v210 = v7[4]; float v369 = v4 * v366; float v376 = v4 * v373; float v383 = v4 * v380; @@ -6924,32 +3628,32 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, float32x2_t *v662 = &v6[0]; svfloat32_t v799 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v588)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v121; - asm("mov %0.d, %d1" : "=w"(v121) : "w"(v120)); - svfloat32_t v147; - asm("mov %0.d, %d1" : "=w"(v147) : "w"(v146)); - svfloat32_t v151; - asm("mov %0.d, %d1" : "=w"(v151) : "w"(v150)); - svfloat32_t v177; - asm("mov %0.d, %d1" : "=w"(v177) : "w"(v176)); - 
svfloat32_t v181; - asm("mov %0.d, %d1" : "=w"(v181) : "w"(v180)); - svfloat32_t v207; - asm("mov %0.d, %d1" : "=w"(v207) : "w"(v206)); - svfloat32_t v211; - asm("mov %0.d, %d1" : "=w"(v211) : "w"(v210)); + svfloat32_t v31 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v57 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v61 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v91 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v121 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v147 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v151 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v177 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v181 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v207 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v211 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); const float32x2_t *v516 = &v5[v19]; const float32x2_t *v525 = &v5[v34]; const float32x2_t *v534 = &v5[v45]; @@ -6980,8 +3684,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, float32x2_t *v779 = &v6[v504]; svfloat32_t v809 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v634)[0])); - svfloat32_t zero152; - asm volatile("mov %0.s, #0" : "=w"(zero152)); + svfloat32_t zero152 = svdup_n_f32(0); svfloat32_t v152 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero152, v799, v151, 0), v799, v151, 90); @@ -7009,250 +3712,158 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu14(const armral_cmplx_f32_t *restrict x, 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v615)[0])); svfloat32_t v807 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v624)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); + svfloat32_t zero32 = svdup_n_f32(0); svfloat32_t v32 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v783, v31, 0), v783, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); + svfloat32_t zero58 = svdup_n_f32(0); svfloat32_t v58 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v785, v57, 0), v785, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); + svfloat32_t zero62 = svdup_n_f32(0); svfloat32_t v62 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v787, v61, 0), v787, v61, 90); - svfloat32_t zero88; - asm volatile("mov %0.s, #0" : "=w"(zero88)); + svfloat32_t zero88 = svdup_n_f32(0); svfloat32_t v88 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v789, v87, 0), v789, v87, 90); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); + svfloat32_t zero92 = svdup_n_f32(0); svfloat32_t v92 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v791, v91, 0), v791, v91, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero118, v793, v117, 0), v793, v117, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v795, v121, 0), v795, v121, 90); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero148, v797, v147, 0), v797, v147, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = 
svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v801, v177, 0), v801, v177, 90); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); + svfloat32_t zero182 = svdup_n_f32(0); svfloat32_t v182 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero182, v803, v181, 0), v803, v181, 90); - svfloat32_t zero208; - asm volatile("mov %0.s, #0" : "=w"(zero208)); + svfloat32_t zero208 = svdup_n_f32(0); svfloat32_t v208 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero208, v805, v207, 0), v805, v207, 90); - svfloat32_t zero212; - asm volatile("mov %0.s, #0" : "=w"(zero212)); + svfloat32_t zero212 = svdup_n_f32(0); svfloat32_t v212 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero212, v807, v211, 0), v807, v211, 90); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v809), "w"(v32)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v809), "w"(v32)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v58), "w"(v62)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v58), "w"(v62)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v88), "w"(v92)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v88), "w"(v92)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v118), "w"(v122)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v118), "w"(v122)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v148), "w"(v152)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v148), "w"(v152)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v178), "w"(v182)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v178), "w"(v182)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v208), "w"(v212)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v208), 
"w"(v212)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v222), "w"(v232)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v222), "w"(v232)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v228), "w"(v226)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v228), "w"(v226)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v224), "w"(v230)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v224), "w"(v230)); - svfloat32_t v323; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v223), "w"(v233)); - svfloat32_t v324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v223), "w"(v233)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v229), "w"(v227)); - svfloat32_t v326; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v229), "w"(v227)); - svfloat32_t v327; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v225), "w"(v231)); - svfloat32_t v328; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v225), "w"(v231)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v234), "w"(v236)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v234), "w"(v236)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v236), "w"(v238)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v238), "w"(v234)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v235), "w"(v237)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v235), "w"(v237)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v237), "w"(v239)); - svfloat32_t v250; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v239), "w"(v235)); - svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v323), "w"(v325)); - svfloat32_t v332; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v323), "w"(v325)); - svfloat32_t v333; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v333) : "w"(v325), "w"(v327)); - svfloat32_t v334; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v327), "w"(v323)); - svfloat32_t v335; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v324), "w"(v326)); - svfloat32_t v337; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v324), "w"(v326)); - svfloat32_t v338; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v326), "w"(v328)); - svfloat32_t v339; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v328), "w"(v324)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v240), "w"(v238)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v246), "w"(v239)); - svfloat32_t zero289; - asm volatile("mov %0.s, #0" : "=w"(zero289)); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v809, v32); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v809, v32); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v222, v232); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v222, v232); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v228, v226); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v228, v226); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v224, v230); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v224, v230); + svfloat32_t v323 = svadd_f32_x(svptrue_b32(), v223, v233); + 
svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v223, v233); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v229, v227); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v229, v227); + svfloat32_t v327 = svadd_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v238, v234); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v239, v235); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v333 = svsub_f32_x(svptrue_b32(), v325, v327); + svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v327, v323); + svfloat32_t v335 = svadd_f32_x(svptrue_b32(), v324, v326); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v324, v326); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v326, v328); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v328, v324); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v240, v238); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v246, v239); + svfloat32_t zero289 = svdup_n_f32(0); svfloat32_t v289 = svcmla_f32_x(pred_full, zero289, v652, v248, 90); - svfloat32_t zero296; - asm volatile("mov %0.s, #0" : "=w"(zero296)); + svfloat32_t zero296 = svdup_n_f32(0); svfloat32_t v296 = svcmla_f32_x(pred_full, zero296, v653, v249, 90); - svfloat32_t zero303; - asm volatile("mov %0.s, #0" : "=w"(zero303)); + svfloat32_t zero303 = svdup_n_f32(0); svfloat32_t v303 = svcmla_f32_x(pred_full, zero303, v654, v250, 90); - svfloat32_t v330; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v329), "w"(v327)); - svfloat32_t v336; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v336) : "w"(v335), "w"(v328)); - svfloat32_t zero378; - asm volatile("mov %0.s, #0" : "=w"(zero378)); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v329, v327); + svfloat32_t v336 = svadd_f32_x(svptrue_b32(), v335, v328); + svfloat32_t zero378 = svdup_n_f32(0); svfloat32_t v378 = svcmla_f32_x(pred_full, zero378, v652, v337, 90); - svfloat32_t zero385; - asm volatile("mov %0.s, #0" : "=w"(zero385)); + svfloat32_t zero385 = svdup_n_f32(0); svfloat32_t v385 = svcmla_f32_x(pred_full, zero385, v653, v338, 90); - svfloat32_t zero392; - asm volatile("mov %0.s, #0" : "=w"(zero392)); + svfloat32_t zero392 = svdup_n_f32(0); svfloat32_t v392 = svcmla_f32_x(pred_full, zero392, v654, v339, 90); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v241), "w"(v220)); - svfloat32_t zero282; - asm volatile("mov %0.s, #0" : "=w"(zero282)); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v241, v220); + svfloat32_t zero282 = svdup_n_f32(0); svfloat32_t v282 = svcmla_f32_x(pred_full, zero282, v651, v247, 90); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v330), "w"(v221)); - svfloat32_t zero371; - asm volatile("mov %0.s, #0" : "=w"(zero371)); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v330, v221); + svfloat32_t zero371 = svdup_n_f32(0); svfloat32_t v371 = svcmla_f32_x(pred_full, zero371, v651, v336, 90); svfloat32_t v304 = svmla_f32_x(pred_full, v242, v241, v647); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v282), "w"(v289)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v282), "w"(v289)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v282), "w"(v296)); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v282, v289); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v282, v289); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v282, v296); svfloat32_t v393 = svmla_f32_x(pred_full, v331, v330, v647); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v400) : "w"(v371), "w"(v378)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v371), "w"(v378)); - svfloat32_t v404; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v371), "w"(v385)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v371, v378); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v371, v378); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v371, v385); svst1_f64(pred_full, (double *)(v662), svreinterpret_f64_f32(v242)); svst1_f64(pred_full, (double *)(v671), svreinterpret_f64_f32(v331)); svfloat32_t v305 = svmla_f32_x(pred_full, v304, v243, v648); svfloat32_t v307 = svmls_f32_x(pred_full, v304, v243, v648); svfloat32_t v309 = svmls_f32_x(pred_full, v304, v244, v649); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v311), "w"(v296)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v313), "w"(v303)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v315), "w"(v303)); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v311, v296); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v313, v303); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v315, v303); svfloat32_t v394 = svmla_f32_x(pred_full, v393, v332, v648); svfloat32_t v396 = svmls_f32_x(pred_full, v393, v332, v648); svfloat32_t v398 = svmls_f32_x(pred_full, v393, v333, v649); - svfloat32_t v401; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v400), "w"(v385)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v402), "w"(v392)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v404), "w"(v392)); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v400, v385); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v402, v392); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v404, v392); svfloat32_t v306 = svmla_f32_x(pred_full, v305, v244, v649); svfloat32_t v308 = svmls_f32_x(pred_full, v307, v245, v650); svfloat32_t v310 = svmla_f32_x(pred_full, v309, v245, v650); svfloat32_t v395 = 
svmla_f32_x(pred_full, v394, v333, v649); svfloat32_t v397 = svmls_f32_x(pred_full, v396, v334, v650); svfloat32_t v399 = svmla_f32_x(pred_full, v398, v334, v650); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v306), "w"(v312)); - svfloat32_t v318; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v306), "w"(v312)); - svfloat32_t v319; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v308), "w"(v314)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v308), "w"(v314)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v310), "w"(v316)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v310), "w"(v316)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v395), "w"(v401)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v395), "w"(v401)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v397), "w"(v403)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v397), "w"(v403)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v399), "w"(v405)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v399), "w"(v405)); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v306, v312); + svfloat32_t v318 = svsub_f32_x(svptrue_b32(), v306, v312); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v308, v314); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v308, v314); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v310, v316); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v310, v316); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v395, v401); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v395, v401); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v399, v405); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v399, v405); svst1_f64(pred_full, (double *)(v680), 
svreinterpret_f64_f32(v318)); svst1_f64(pred_full, (double *)(v689), svreinterpret_f64_f32(v407)); svst1_f64(pred_full, (double *)(v698), svreinterpret_f64_f32(v320)); @@ -7956,31 +4567,17 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x2_t *v710 = &v6[v2]; int64_t v19 = v0 * 5; int64_t v30 = v0 * 10; - float32x2_t v41 = v7[4]; - float32x2_t v45 = v7[9]; int64_t v49 = v0 * 8; int64_t v60 = v0 * 13; - float32x2_t v71 = v7[7]; - float32x2_t v75 = v7[12]; int64_t v79 = v0 * 3; - float32x2_t v86 = v7[2]; int64_t v90 = v0 * 11; - float32x2_t v112 = v7[10]; - float32x2_t v116 = v7[0]; int64_t v120 = v0 * 6; - float32x2_t v127 = v7[5]; int64_t v131 = v0 * 14; int64_t v142 = v0 * 4; - float32x2_t v153 = v7[13]; - float32x2_t v157 = v7[3]; int64_t v161 = v0 * 9; - float32x2_t v168 = v7[8]; int64_t v172 = v0 * 2; int64_t v183 = v0 * 7; - float32x2_t v194 = v7[1]; - float32x2_t v198 = v7[6]; int64_t v202 = v0 * 12; - float32x2_t v209 = v7[11]; float v261 = v4 * v258; float v268 = v4 * v265; float v275 = v4 * v272; @@ -8015,34 +4612,34 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x2_t *v674 = &v6[0]; svfloat32_t v816 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v573)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v113; - asm("mov %0.d, %d1" : "=w"(v113) : "w"(v112)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v128; - asm("mov %0.d, %d1" : "=w"(v128) : "w"(v127)); - svfloat32_t v154; - asm("mov %0.d, %d1" : "=w"(v154) : "w"(v153)); - svfloat32_t v158; - asm("mov %0.d, %d1" : "=w"(v158) : "w"(v157)); - svfloat32_t v169; - asm("mov %0.d, 
%d1" : "=w"(v169) : "w"(v168)); - svfloat32_t v195; - asm("mov %0.d, %d1" : "=w"(v195) : "w"(v194)); - svfloat32_t v199; - asm("mov %0.d, %d1" : "=w"(v199) : "w"(v198)); - svfloat32_t v210; - asm("mov %0.d, %d1" : "=w"(v210) : "w"(v209)); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v46 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v72 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v76 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v113 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v128 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v154 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v158 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v169 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v195 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v199 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v210 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); const float32x2_t *v519 = &v5[v19]; const float32x2_t *v528 = &v5[v30]; const float32x2_t *v537 = &v5[v49]; @@ -8080,8 +4677,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, float32x2_t *v800 = &v6[v507]; svfloat32_t v832 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v646)[0])); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero118, v816, v117, 0), v816, v117, 90); @@ 
-8111,73 +4707,53 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v627)[0])); svfloat32_t v830 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v636)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); + svfloat32_t zero43 = svdup_n_f32(0); svfloat32_t v43 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v804, v42, 0), v804, v42, 90); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); + svfloat32_t zero47 = svdup_n_f32(0); svfloat32_t v47 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v806, v46, 0), v806, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v808, v72, 0), v808, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); + svfloat32_t zero77 = svdup_n_f32(0); svfloat32_t v77 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v810, v76, 0), v810, v76, 90); - svfloat32_t zero114; - asm volatile("mov %0.s, #0" : "=w"(zero114)); + svfloat32_t zero114 = svdup_n_f32(0); svfloat32_t v114 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero114, v814, v113, 0), v814, v113, 90); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero155, v820, v154, 0), v820, v154, 90); - svfloat32_t zero159; - asm volatile("mov %0.s, #0" : "=w"(zero159)); + svfloat32_t zero159 = svdup_n_f32(0); svfloat32_t v159 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero159, v822, v158, 0), v822, v158, 90); - svfloat32_t zero196; - asm volatile("mov %0.s, #0" : "=w"(zero196)); + svfloat32_t zero196 = svdup_n_f32(0); svfloat32_t v196 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero196, v826, v195, 0), v826, v195, 90); - 
svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero200, v828, v199, 0), v828, v199, 90); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v43), "w"(v47)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v43), "w"(v47)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v73), "w"(v77)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v73), "w"(v77)); - svfloat32_t v225; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v114), "w"(v118)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v114), "w"(v118)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v155), "w"(v159)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v155), "w"(v159)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v196), "w"(v200)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v196), "w"(v200)); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v212), "w"(v832)); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v225 = svadd_f32_x(svptrue_b32(), v114, v118); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v114, v118); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v196, v200); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v196, v200); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v212, v832); svfloat32_t v224 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, v222, v812, v87, 0), v812, v87, 90); svfloat32_t v227 = @@ -8189,86 +4765,51 @@ void 
armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v233 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v231, v830, v210, 0), v830, v210, 90); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v222), "w"(v231)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v222), "w"(v231)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v228), "w"(v225)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v228), "w"(v225)); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v223), "w"(v232)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v223), "w"(v232)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v229), "w"(v226)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v229), "w"(v226)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v224), "w"(v233)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v224), "w"(v233)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v230), "w"(v227)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v230), "w"(v227)); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v287), "w"(v289)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v287), "w"(v289)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v288), "w"(v290)); - svfloat32_t zero316; - asm volatile("mov %0.s, #0" : "=w"(zero316)); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v222, v231); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v222, v231); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v228, v225); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v228, v225); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v223, v232); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v223, v232); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v229, v226); + 
svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v229, v226); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v224, v233); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v224, v233); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v230, v227); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v230, v227); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v288, v290); + svfloat32_t zero316 = svdup_n_f32(0); svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v658, v288, 90); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v340), "w"(v342)); - svfloat32_t v345; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v340), "w"(v342)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v341), "w"(v343)); - svfloat32_t v383; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v343), "w"(v666)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v234), "w"(v236)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v234), "w"(v236)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v235), "w"(v237)); - svfloat32_t zero263; - asm volatile("mov %0.s, #0" : "=w"(zero263)); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v341, v343); + svfloat32_t v383 = svmul_f32_x(svptrue_b32(), v343, v666); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v235, v237); + svfloat32_t zero263 = svdup_n_f32(0); svfloat32_t v263 = svcmla_f32_x(pred_full, zero263, v652, v235, 90); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v291), "w"(v212)); - svfloat32_t v304; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v291), "w"(v656)); - 
svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v291, v212); + svfloat32_t v304 = svmul_f32_x(svptrue_b32(), v291, v656); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v659, v293, 90); - svfloat32_t v347; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v344), "w"(v213)); - svfloat32_t zero368; - asm volatile("mov %0.s, #0" : "=w"(zero368)); + svfloat32_t v347 = svadd_f32_x(svptrue_b32(), v344, v213); + svfloat32_t zero368 = svdup_n_f32(0); svfloat32_t v368 = svcmla_f32_x(pred_full, zero368, v663, v345, 90); - svfloat32_t v378; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v346), "w"(v665)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v238), "w"(v221)); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t v378 = svmul_f32_x(svptrue_b32(), v346, v665); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v238, v221); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v653, v240, 90); - svfloat32_t v334; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v316), "w"(v323)); + svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v316, v323); svfloat32_t v335 = svcmla_f32_x(pred_full, v323, v660, v290, 90); - svfloat32_t zero354; - asm volatile("mov %0.s, #0" : "=w"(zero354)); + svfloat32_t zero354 = svdup_n_f32(0); svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v661, v347, 90); svfloat32_t v387 = svnmls_f32_x(pred_full, v378, v341, v664); svfloat32_t v388 = svmla_f32_x(pred_full, v383, v346, v665); svfloat32_t v278 = svmla_f32_x(pred_full, v241, v238, v650); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v263), "w"(v270)); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v263, v270); svfloat32_t v282 = svcmla_f32_x(pred_full, v270, v654, v237, 90); svfloat32_t v331 = svmla_f32_x(pred_full, v304, v294, v655); svfloat32_t v384 = 
svcmla_f32_x(pred_full, v354, v662, v344, 90); @@ -8278,68 +4819,40 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v280 = svmls_f32_x(pred_full, v278, v239, v651); svfloat32_t v332 = svmla_f32_x(pred_full, v331, v292, v657); svfloat32_t v333 = svmls_f32_x(pred_full, v331, v292, v657); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v384), "w"(v368)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v384), "w"(v368)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v354)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v393), "w"(v354)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v279), "w"(v281)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v279), "w"(v281)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v280), "w"(v282)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v280), "w"(v282)); - svfloat32_t v336; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v332), "w"(v334)); - svfloat32_t v337; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v332), "w"(v334)); - svfloat32_t v338; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v333), "w"(v335)); - svfloat32_t v339; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v333), "w"(v335)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v385), "w"(v387)); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v385), "w"(v387)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v386), "w"(v388)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v386), "w"(v388)); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v384, v368); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v384, v368); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v354); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v393, v354); + svfloat32_t 
v283 = svadd_f32_x(svptrue_b32(), v279, v281); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v279, v281); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v280, v282); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v280, v282); + svfloat32_t v336 = svadd_f32_x(svptrue_b32(), v332, v334); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v332, v334); + svfloat32_t v338 = svadd_f32_x(svptrue_b32(), v333, v335); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v333, v335); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v386, v388); svst1_f64(pred_full, (double *)(v683), svreinterpret_f64_f32(v395)); svst1_f64(pred_full, (double *)(v692), svreinterpret_f64_f32(v394)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v284), "w"(v337)); - svfloat32_t v441; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v286), "w"(v339)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v285), "w"(v338)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v283), "w"(v336)); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v284, v337); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v286, v339); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v285, v338); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v283, v336); svst1_f64(pred_full, (double *)(v701), svreinterpret_f64_f32(v284)); svst1_f64(pred_full, (double *)(v728), svreinterpret_f64_f32(v286)); svst1_f64(pred_full, (double *)(v755), svreinterpret_f64_f32(v285)); svst1_f64(pred_full, (double *)(v782), svreinterpret_f64_f32(v283)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v417), "w"(v390)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v417), "w"(v390)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v441), "w"(v392)); - 
svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v441), "w"(v392)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v465), "w"(v391)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v465), "w"(v391)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v389)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v489), "w"(v389)); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v417, v390); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v417, v390); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v441, v392); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v441, v392); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v465, v391); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v465, v391); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v389); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v489, v389); svst1_f64(pred_full, (double *)(v710), svreinterpret_f64_f32(v419)); svst1_f64(pred_full, (double *)(v719), svreinterpret_f64_f32(v418)); svst1_f64(pred_full, (double *)(v737), svreinterpret_f64_f32(v443)); @@ -8980,34 +5493,19 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, const float32x2_t *v621 = &v5[v0]; float32x2_t *v731 = &v6[v2]; int64_t v19 = v0 * 8; - float32x2_t v30 = v7[7]; int64_t v34 = v0 * 4; int64_t v45 = v0 * 12; - float32x2_t v56 = v7[3]; - float32x2_t v60 = v7[11]; int64_t v64 = v0 * 2; int64_t v75 = v0 * 10; - float32x2_t v86 = v7[1]; - float32x2_t v90 = v7[9]; int64_t v94 = v0 * 6; int64_t v105 = v0 * 14; - float32x2_t v116 = v7[5]; - float32x2_t v120 = v7[13]; int64_t v135 = v0 * 9; - float32x2_t v146 = v7[0]; - float32x2_t v150 = v7[8]; int64_t v154 = v0 * 5; int64_t v165 = v0 * 13; - float32x2_t v176 = v7[4]; - float32x2_t v180 = v7[12]; int64_t v184 = v0 * 3; int64_t v195 = v0 * 11; - float32x2_t v206 = v7[2]; - float32x2_t v210 = v7[10]; int64_t v214 = v0 * 7; int64_t v225 = v0 * 15; - float32x2_t 
v236 = v7[6]; - float32x2_t v240 = v7[14]; float v345 = v4 * v342; float v352 = v4 * v349; float v364 = v4 * v361; @@ -9035,36 +5533,36 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, float32x2_t *v722 = &v6[0]; svfloat32_t v875 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v621)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v121; - asm("mov %0.d, %d1" : "=w"(v121) : "w"(v120)); - svfloat32_t v147; - asm("mov %0.d, %d1" : "=w"(v147) : "w"(v146)); - svfloat32_t v151; - asm("mov %0.d, %d1" : "=w"(v151) : "w"(v150)); - svfloat32_t v177; - asm("mov %0.d, %d1" : "=w"(v177) : "w"(v176)); - svfloat32_t v181; - asm("mov %0.d, %d1" : "=w"(v181) : "w"(v180)); - svfloat32_t v207; - asm("mov %0.d, %d1" : "=w"(v207) : "w"(v206)); - svfloat32_t v211; - asm("mov %0.d, %d1" : "=w"(v211) : "w"(v210)); - svfloat32_t v237; - asm("mov %0.d, %d1" : "=w"(v237) : "w"(v236)); - svfloat32_t v241; - asm("mov %0.d, %d1" : "=w"(v241) : "w"(v240)); + svfloat32_t v31 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v57 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v61 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v91 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v121 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v147 = + 
svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v151 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v177 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v181 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v207 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v211 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v237 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v241 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); const float32x2_t *v558 = &v5[v19]; const float32x2_t *v567 = &v5[v34]; const float32x2_t *v576 = &v5[v45]; @@ -9100,8 +5598,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, float32x2_t *v857 = &v6[v546]; svfloat32_t v891 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v694)[0])); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero148, v875, v147, 0), v875, v147, 90); @@ -9133,262 +5630,165 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu16(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v675)[0])); svfloat32_t v889 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v684)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); + svfloat32_t zero32 = svdup_n_f32(0); svfloat32_t v32 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v861, v31, 0), v861, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); + svfloat32_t zero58 = svdup_n_f32(0); svfloat32_t v58 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v863, v57, 0), v863, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); + svfloat32_t 
zero62 = svdup_n_f32(0); svfloat32_t v62 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v865, v61, 0), v865, v61, 90); - svfloat32_t zero88; - asm volatile("mov %0.s, #0" : "=w"(zero88)); + svfloat32_t zero88 = svdup_n_f32(0); svfloat32_t v88 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v867, v87, 0), v867, v87, 90); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); + svfloat32_t zero92 = svdup_n_f32(0); svfloat32_t v92 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v869, v91, 0), v869, v91, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero118, v871, v117, 0), v871, v117, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v873, v121, 0), v873, v121, 90); - svfloat32_t zero152; - asm volatile("mov %0.s, #0" : "=w"(zero152)); + svfloat32_t zero152 = svdup_n_f32(0); svfloat32_t v152 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero152, v877, v151, 0), v877, v151, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v879, v177, 0), v879, v177, 90); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); + svfloat32_t zero182 = svdup_n_f32(0); svfloat32_t v182 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero182, v881, v181, 0), v881, v181, 90); - svfloat32_t zero208; - asm volatile("mov %0.s, #0" : "=w"(zero208)); + svfloat32_t zero208 = svdup_n_f32(0); svfloat32_t v208 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero208, v883, v207, 0), v883, v207, 90); - svfloat32_t zero212; - asm volatile("mov %0.s, #0" : "=w"(zero212)); + svfloat32_t zero212 = svdup_n_f32(0); svfloat32_t 
v212 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero212, v885, v211, 0), v885, v211, 90); - svfloat32_t zero238; - asm volatile("mov %0.s, #0" : "=w"(zero238)); + svfloat32_t zero238 = svdup_n_f32(0); svfloat32_t v238 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero238, v887, v237, 0), v887, v237, 90); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero242, v889, v241, 0), v889, v241, 90); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v891), "w"(v32)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v891), "w"(v32)); - svfloat32_t v252; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v58), "w"(v62)); - svfloat32_t v253; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v58), "w"(v62)); - svfloat32_t v254; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v88), "w"(v92)); - svfloat32_t v255; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v88), "w"(v92)); - svfloat32_t v256; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v118), "w"(v122)); - svfloat32_t v257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v118), "w"(v122)); - svfloat32_t v258; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v148), "w"(v152)); - svfloat32_t v259; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v148), "w"(v152)); - svfloat32_t v260; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v178), "w"(v182)); - svfloat32_t v261; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v178), "w"(v182)); - svfloat32_t v262; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v208), "w"(v212)); - svfloat32_t v263; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v208), "w"(v212)); - svfloat32_t v264; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v238), "w"(v242)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v238), "w"(v242)); - svfloat32_t v266; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v266) : 
"w"(v250), "w"(v252)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v250), "w"(v252)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v254), "w"(v256)); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v254), "w"(v256)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v258), "w"(v260)); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v258), "w"(v260)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v262), "w"(v264)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v262), "w"(v264)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v255), "w"(v257)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v255), "w"(v257)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v259), "w"(v265)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v259), "w"(v265)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v261), "w"(v263)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v261), "w"(v263)); - svfloat32_t zero347; - asm volatile("mov %0.s, #0" : "=w"(zero347)); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v891, v32); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v891, v32); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v254 = svadd_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v256 = svadd_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v258 = svadd_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v262 = 
svadd_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v263 = svsub_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v250, v252); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v250, v252); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v258, v260); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v258, v260); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v262, v264); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v262, v264); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v255, v257); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v255, v257); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v261, v263); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v261, v263); + svfloat32_t zero347 = svdup_n_f32(0); svfloat32_t v347 = svcmla_f32_x(pred_full, zero347, v706, v253, 90); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v266), "w"(v268)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v266), "w"(v268)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v270), "w"(v272)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v270), "w"(v272)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v271), "w"(v273)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v271), "w"(v273)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v284), "w"(v286)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v285), "w"(v287)); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v274 = 
svadd_f32_x(svptrue_b32(), v266, v268); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v266, v268); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v271, v273); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v284, v286); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v285, v287); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v706, v269, 90); - svfloat32_t zero354; - asm volatile("mov %0.s, #0" : "=w"(zero354)); + svfloat32_t zero354 = svdup_n_f32(0); svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v707, v282, 90); - svfloat32_t zero380; - asm volatile("mov %0.s, #0" : "=w"(zero380)); + svfloat32_t zero380 = svdup_n_f32(0); svfloat32_t v380 = svcmla_f32_x(pred_full, zero380, v711, v286, 90); - svfloat32_t v390; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v285), "w"(v713)); - svfloat32_t v395; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v287), "w"(v714)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v274), "w"(v276)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v274), "w"(v276)); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t v390 = svmul_f32_x(svptrue_b32(), v285, v713); + svfloat32_t v395 = svmul_f32_x(svptrue_b32(), v287, v714); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v274, v276); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x(pred_full, zero311, v706, v277, 90); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v707, v280, 90); - svfloat32_t zero366; - asm volatile("mov %0.s, #0" : "=w"(zero366)); + svfloat32_t zero366 = 
svdup_n_f32(0); svfloat32_t v366 = svcmla_f32_x(pred_full, zero366, v709, v288, 90); - svfloat32_t v385; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v289), "w"(v712)); + svfloat32_t v385 = svmul_f32_x(svptrue_b32(), v289, v712); svfloat32_t v406 = svmla_f32_x(pred_full, v251, v283, v708); svfloat32_t v407 = svmls_f32_x(pred_full, v251, v283, v708); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v347), "w"(v354)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v347), "w"(v354)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v275), "w"(v311)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v275), "w"(v311)); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v347, v354); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v347, v354); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v275, v311); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v275, v311); svfloat32_t v398 = svmla_f32_x(pred_full, v267, v281, v708); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v323), "w"(v330)); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v323, v330); svfloat32_t v400 = svmls_f32_x(pred_full, v267, v281, v708); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v330), "w"(v323)); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v330, v323); svfloat32_t v410 = svcmla_f32_x(pred_full, v366, v710, v284, 90); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v366), "w"(v380)); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v366, v380); svfloat32_t v412 = svnmls_f32_x(pred_full, v385, v285, v713); svfloat32_t v413 = svnmls_f32_x(pred_full, v385, v287, v714); svfloat32_t v414 = svnmls_f32_x(pred_full, v390, v289, v712); svfloat32_t v415 = svnmls_f32_x(pred_full, v395, v289, v712); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v407), "w"(v409)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v407), 
"w"(v409)); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v407, v409); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v407, v409); svst1_f64(pred_full, (double *)(v722), svreinterpret_f64_f32(v278)); svst1_f64(pred_full, (double *)(v794), svreinterpret_f64_f32(v279)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v398), "w"(v399)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v400), "w"(v401)); - svfloat32_t v404; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v400), "w"(v401)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v398), "w"(v399)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v406), "w"(v412)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v406), "w"(v412)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v406), "w"(v414)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v406), "w"(v414)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v407), "w"(v415)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v407), "w"(v415)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v410), "w"(v408)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v410), "w"(v408)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v411), "w"(v413)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v411), "w"(v413)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v411), "w"(v409)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v411), "w"(v409)); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v398, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v400, v401); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v400, v401); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v398, v399); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v406, v412); + svfloat32_t 
v417 = svsub_f32_x(svptrue_b32(), v406, v412); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v406, v414); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v406, v414); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v407, v415); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v407, v415); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v410, v408); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v410, v408); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v411, v413); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v411, v413); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v411, v409); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v411, v409); svst1_f64(pred_full, (double *)(v758), svreinterpret_f64_f32(v397)); svst1_f64(pred_full, (double *)(v830), svreinterpret_f64_f32(v396)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v416), "w"(v426)); - svfloat32_t v433; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v417), "w"(v427)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v418), "w"(v427)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v419), "w"(v426)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v420), "w"(v428)); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v421), "w"(v429)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v422), "w"(v431)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v423), "w"(v430)); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v416, v426); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v418, v427); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v419, v426); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v420, v428); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v421, v429); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v422, v431); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v423, v430); 
svst1_f64(pred_full, (double *)(v740), svreinterpret_f64_f32(v405)); svst1_f64(pred_full, (double *)(v776), svreinterpret_f64_f32(v404)); svst1_f64(pred_full, (double *)(v812), svreinterpret_f64_f32(v403)); @@ -10499,36 +6899,20 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, const float32x2_t *v769 = &v5[v0]; float32x2_t *v969 = &v6[v2]; int64_t v30 = v0 * 16; - float32x2_t v41 = v7[0]; - float32x2_t v45 = v7[15]; int64_t v49 = v0 * 3; int64_t v60 = v0 * 14; - float32x2_t v71 = v7[2]; - float32x2_t v75 = v7[13]; int64_t v79 = v0 * 9; int64_t v90 = v0 * 8; - float32x2_t v101 = v7[8]; - float32x2_t v105 = v7[7]; int64_t v109 = v0 * 10; int64_t v120 = v0 * 7; - float32x2_t v131 = v7[9]; - float32x2_t v135 = v7[6]; int64_t v139 = v0 * 13; int64_t v150 = v0 * 4; - float32x2_t v161 = v7[12]; - float32x2_t v165 = v7[3]; int64_t v169 = v0 * 5; int64_t v180 = v0 * 12; - float32x2_t v191 = v7[4]; - float32x2_t v195 = v7[11]; int64_t v199 = v0 * 15; int64_t v210 = v0 * 2; - float32x2_t v221 = v7[14]; - float32x2_t v225 = v7[1]; int64_t v229 = v0 * 11; int64_t v240 = v0 * 6; - float32x2_t v251 = v7[10]; - float32x2_t v255 = v7[5]; float v411 = v4 * v408; float v418 = v4 * v415; float v425 = v4 * v422; @@ -10583,38 +6967,38 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x2_t *v960 = &v6[0]; svfloat32_t v1108 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v769)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v102; - asm("mov %0.d, %d1" : "=w"(v102) : "w"(v101)); - svfloat32_t v106; - asm("mov %0.d, %d1" : "=w"(v106) : "w"(v105)); - svfloat32_t v132; - asm("mov %0.d, %d1" : "=w"(v132) : "w"(v131)); - svfloat32_t v136; - asm("mov %0.d, %d1" : "=w"(v136) : 
"w"(v135)); - svfloat32_t v162; - asm("mov %0.d, %d1" : "=w"(v162) : "w"(v161)); - svfloat32_t v166; - asm("mov %0.d, %d1" : "=w"(v166) : "w"(v165)); - svfloat32_t v192; - asm("mov %0.d, %d1" : "=w"(v192) : "w"(v191)); - svfloat32_t v196; - asm("mov %0.d, %d1" : "=w"(v196) : "w"(v195)); - svfloat32_t v222; - asm("mov %0.d, %d1" : "=w"(v222) : "w"(v221)); - svfloat32_t v226; - asm("mov %0.d, %d1" : "=w"(v226) : "w"(v225)); - svfloat32_t v252; - asm("mov %0.d, %d1" : "=w"(v252) : "w"(v251)); - svfloat32_t v256; - asm("mov %0.d, %d1" : "=w"(v256) : "w"(v255)); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v46 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v72 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v76 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v102 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v106 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v132 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v136 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v162 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v166 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v192 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v196 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v222 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v226 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v252 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v256 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); const float32x2_t *v778 = &v5[v30]; const float32x2_t 
*v787 = &v5[v49]; const float32x2_t *v796 = &v5[v60]; @@ -10668,8 +7052,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, float32x2_t *v1104 = &v6[v757]; svfloat32_t v1140 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v914)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); + svfloat32_t zero43 = svdup_n_f32(0); svfloat32_t v43 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v1108, v42, 0), v1108, v42, 90); @@ -10703,242 +7086,151 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v895)[0])); svfloat32_t v1138 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v904)[0])); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); + svfloat32_t zero47 = svdup_n_f32(0); svfloat32_t v47 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v1110, v46, 0), v1110, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1112, v72, 0), v1112, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); + svfloat32_t zero77 = svdup_n_f32(0); svfloat32_t v77 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v1114, v76, 0), v1114, v76, 90); - svfloat32_t zero103; - asm volatile("mov %0.s, #0" : "=w"(zero103)); + svfloat32_t zero103 = svdup_n_f32(0); svfloat32_t v103 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero103, v1116, v102, 0), v1116, v102, 90); - svfloat32_t zero107; - asm volatile("mov %0.s, #0" : "=w"(zero107)); + svfloat32_t zero107 = svdup_n_f32(0); svfloat32_t v107 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero107, v1118, v106, 0), v1118, v106, 90); - svfloat32_t zero133; - asm volatile("mov %0.s, #0" : "=w"(zero133)); + svfloat32_t zero133 = svdup_n_f32(0); svfloat32_t v133 = 
svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero133, v1120, v132, 0), v1120, v132, 90); - svfloat32_t zero137; - asm volatile("mov %0.s, #0" : "=w"(zero137)); + svfloat32_t zero137 = svdup_n_f32(0); svfloat32_t v137 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero137, v1122, v136, 0), v1122, v136, 90); - svfloat32_t zero163; - asm volatile("mov %0.s, #0" : "=w"(zero163)); + svfloat32_t zero163 = svdup_n_f32(0); svfloat32_t v163 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero163, v1124, v162, 0), v1124, v162, 90); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); + svfloat32_t zero167 = svdup_n_f32(0); svfloat32_t v167 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero167, v1126, v166, 0), v1126, v166, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); + svfloat32_t zero193 = svdup_n_f32(0); svfloat32_t v193 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero193, v1128, v192, 0), v1128, v192, 90); - svfloat32_t zero197; - asm volatile("mov %0.s, #0" : "=w"(zero197)); + svfloat32_t zero197 = svdup_n_f32(0); svfloat32_t v197 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero197, v1130, v196, 0), v1130, v196, 90); - svfloat32_t zero223; - asm volatile("mov %0.s, #0" : "=w"(zero223)); + svfloat32_t zero223 = svdup_n_f32(0); svfloat32_t v223 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero223, v1132, v222, 0), v1132, v222, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1134, v226, 0), v1134, v226, 90); - svfloat32_t zero253; - asm volatile("mov %0.s, #0" : "=w"(zero253)); + svfloat32_t zero253 = svdup_n_f32(0); svfloat32_t v253 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero253, v1136, v252, 0), v1136, v252, 90); - svfloat32_t zero257; - asm volatile("mov %0.s, #0" : "=w"(zero257)); + svfloat32_t zero257 = svdup_n_f32(0); svfloat32_t 
v257 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero257, v1138, v256, 0), v1138, v256, 90); - svfloat32_t v258; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v43), "w"(v47)); - svfloat32_t v259; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v43), "w"(v47)); - svfloat32_t v260; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v73), "w"(v77)); - svfloat32_t v261; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v73), "w"(v77)); - svfloat32_t v262; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v103), "w"(v107)); - svfloat32_t v263; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v103), "w"(v107)); - svfloat32_t v264; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v133), "w"(v137)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v133), "w"(v137)); - svfloat32_t v266; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v163), "w"(v167)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v163), "w"(v167)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v193), "w"(v197)); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v193), "w"(v197)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v223), "w"(v227)); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v223), "w"(v227)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v253), "w"(v257)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v253), "w"(v257)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v258), "w"(v266)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v260), "w"(v268)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v262), "w"(v270)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v264), "w"(v272)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v258), "w"(v266)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v281) : "w"(v260), "w"(v268)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v262), "w"(v270)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v264), "w"(v272)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v259), "w"(v263)); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v261), "w"(v265)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v259), "w"(v263)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v273), "w"(v269)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v267), "w"(v271)); - svfloat32_t v299; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v269), "w"(v273)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v267), "w"(v271)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v261), "w"(v265)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v259), "w"(v267)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v265), "w"(v273)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v274), "w"(v276)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v275), "w"(v277)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v274), "w"(v276)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v275), "w"(v277)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v281), "w"(v283)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v280), "w"(v282)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v282), "w"(v283)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v280), "w"(v281)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v294), "w"(v295)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v298), "w"(v299)); - svfloat32_t v305; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v294), "w"(v295)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v298), "w"(v299)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v296), "w"(v297)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v300), "w"(v301)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v296), "w"(v297)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v300), "w"(v301)); - svfloat32_t v351; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v282), "w"(v920)); - svfloat32_t zero518; - asm volatile("mov %0.s, #0" : "=w"(zero518)); + svfloat32_t v258 = svadd_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v262 = svadd_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v263 = svsub_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v133, v137); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v133, v137); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v163, v167); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v163, v167); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v193, v197); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v193, v197); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v223, v227); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v223, v227); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v253, v257); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v253, v257); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v258, v266); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v260, v268); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v262, v270); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v264, v272); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v258, v266); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v260, v268); 
+ svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v262, v270); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v264, v272); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v259, v263); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v261, v265); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v259, v263); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v273, v269); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v267, v271); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v269, v273); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v267, v271); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v261, v265); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v259, v267); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v265, v273); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v281, v283); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v280, v282); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v282, v283); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v298, v299); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v298, v299); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v300, v301); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v300, v301); + svfloat32_t v351 = svmul_f32_x(svptrue_b32(), v282, v920); + svfloat32_t zero518 = svdup_n_f32(0); svfloat32_t v518 = svcmla_f32_x(pred_full, zero518, v947, v315, 90); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v278), "w"(v279)); - svfloat32_t v287; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v287) : "w"(v278), "w"(v279)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v289), "w"(v288)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v284), "w"(v285)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v302), "w"(v303)); - svfloat32_t v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v305), "w"(v306)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v308), "w"(v309)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v311), "w"(v312)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v309), "w"(v303)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v302), "w"(v308)); - svfloat32_t v361; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v284), "w"(v922)); - svfloat32_t v366; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v285), "w"(v923)); - svfloat32_t v396; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v291), "w"(v929)); - svfloat32_t v401; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v292), "w"(v930)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v316), "w"(v259)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v319), "w"(v265)); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v1140), "w"(v286)); - svfloat32_t v391; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v290), "w"(v928)); - svfloat32_t zero427; - asm volatile("mov %0.s, #0" : "=w"(zero427)); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v288); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v284, v285); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v302, v303); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v305, v306); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v308, v309); + svfloat32_t v313 = 
svadd_f32_x(svptrue_b32(), v311, v312); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v309, v303); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v361 = svmul_f32_x(svptrue_b32(), v284, v922); + svfloat32_t v366 = svmul_f32_x(svptrue_b32(), v285, v923); + svfloat32_t v396 = svmul_f32_x(svptrue_b32(), v291, v929); + svfloat32_t v401 = svmul_f32_x(svptrue_b32(), v292, v930); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v316, v259); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v319, v265); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v1140, v286); + svfloat32_t v391 = svmul_f32_x(svptrue_b32(), v290, v928); + svfloat32_t zero427 = svdup_n_f32(0); svfloat32_t v427 = svcmla_f32_x(pred_full, zero427, v934, v304, 90); - svfloat32_t zero448; - asm volatile("mov %0.s, #0" : "=w"(zero448)); + svfloat32_t zero448 = svdup_n_f32(0); svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v937, v307, 90); - svfloat32_t zero469; - asm volatile("mov %0.s, #0" : "=w"(zero469)); + svfloat32_t zero469 = svdup_n_f32(0); svfloat32_t v469 = svcmla_f32_x(pred_full, zero469, v940, v310, 90); - svfloat32_t zero490; - asm volatile("mov %0.s, #0" : "=w"(zero490)); + svfloat32_t zero490 = svdup_n_f32(0); svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v943, v313, 90); svfloat32_t v556 = svmla_f32_x(pred_full, v396, v283, v921); svfloat32_t v557 = svnmls_f32_x(pred_full, v351, v291, v929); svfloat32_t v558 = svmla_f32_x(pred_full, v401, v281, v919); svfloat32_t v559 = svnmls_f32_x(pred_full, v401, v280, v918); - svfloat32_t v318; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v317), "w"(v315)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v320), "w"(v267)); + svfloat32_t v318 = svsub_f32_x(svptrue_b32(), v317, v315); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v320, v267); svfloat32_t v554 = svmla_f32_x(pred_full, v391, v288, v926); svfloat32_t v555 = svnmls_f32_x(pred_full, v391, v289, v927); svfloat32_t v560 = 
svnmls_f32_x(pred_full, v366, v293, v931); @@ -10953,160 +7245,91 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu17(const armral_cmplx_f32_t *restrict x, svfloat32_t v587 = svcmla_f32_x(pred_full, v490, v941, v311, 90); svfloat32_t v588 = svcmla_f32_x(pred_full, v490, v942, v312, 90); svst1_f64(pred_full, (double *)(v960), svreinterpret_f64_f32(v331)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v321), "w"(v273)); - svfloat32_t zero539; - asm volatile("mov %0.s, #0" : "=w"(zero539)); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v321, v273); + svfloat32_t zero539 = svdup_n_f32(0); svfloat32_t v539 = svcmla_f32_x(pred_full, zero539, v950, v318, 90); svfloat32_t v563 = svmla_f32_x(pred_full, v562, v287, v925); svfloat32_t v564 = svmls_f32_x(pred_full, v562, v287, v925); - svfloat32_t v565; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v554), "w"(v556)); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v555), "w"(v557)); - svfloat32_t v569; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v554), "w"(v558)); - svfloat32_t v571; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v555), "w"(v559)); - svfloat32_t v592; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v581), "w"(v583)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v581), "w"(v583)); - svfloat32_t v594; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v582), "w"(v584)); - svfloat32_t v595; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v582), "w"(v584)); - svfloat32_t v596; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v585), "w"(v587)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v587), "w"(v585)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v586), "w"(v588)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v588), "w"(v586)); - svfloat32_t v323; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v318), "w"(v322)); - svfloat32_t zero546; - asm volatile("mov %0.s, #0" : 
"=w"(zero546)); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v554, v556); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v554, v558); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v555, v559); + svfloat32_t v592 = svadd_f32_x(svptrue_b32(), v581, v583); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v581, v583); + svfloat32_t v594 = svadd_f32_x(svptrue_b32(), v582, v584); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v582, v584); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v585, v587); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v587, v585); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v586, v588); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v588, v586); + svfloat32_t v323 = svadd_f32_x(svptrue_b32(), v318, v322); + svfloat32_t zero546 = svdup_n_f32(0); svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v951, v322, 90); - svfloat32_t v566; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v560), "w"(v563)); - svfloat32_t v568; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v561), "w"(v564)); - svfloat32_t v570; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v563), "w"(v560)); - svfloat32_t v572; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v572) : "w"(v564), "w"(v561)); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v594), "w"(v598)); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v593), "w"(v599)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v592), "w"(v596)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v599), "w"(v593)); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v592), "w"(v596)); - svfloat32_t v620; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v597), "w"(v595)); - svfloat32_t v623; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v598), "w"(v594)); - svfloat32_t v626; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v595), "w"(v597)); - svfloat32_t v573; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v573) : "w"(v565), "w"(v566)); - svfloat32_t v574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v567), "w"(v568)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v569), "w"(v570)); - svfloat32_t v576; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v571), "w"(v572)); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v566), "w"(v565)); - svfloat32_t v578; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v568), "w"(v567)); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v570), "w"(v569)); - svfloat32_t v580; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v572), "w"(v571)); - svfloat32_t v600; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v539), "w"(v546)); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v560, v563); + svfloat32_t v568 = svadd_f32_x(svptrue_b32(), v561, v564); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v563, v560); + svfloat32_t v572 = svsub_f32_x(svptrue_b32(), v564, v561); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v594, v598); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v593, v599); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v592, v596); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v599, v593); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v592, v596); + svfloat32_t v620 = svsub_f32_x(svptrue_b32(), v597, v595); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v598, v594); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v595, v597); + svfloat32_t v573 = svadd_f32_x(svptrue_b32(), v565, v566); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v567, v568); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v569, v570); + svfloat32_t v576 = svadd_f32_x(svptrue_b32(), v571, v572); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v566, v565); + svfloat32_t v578 = svsub_f32_x(svptrue_b32(), v568, v567); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v570, v569); + svfloat32_t v580 = svsub_f32_x(svptrue_b32(), v572, v571); + 
svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v539, v546); svfloat32_t v589 = svcmla_f32_x(pred_full, v546, v952, v323, 90); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v600), "w"(v600)); - svfloat32_t v627; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v626), "w"(v600)); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v600, v600); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v626, v600); svfloat32_t v590 = svcmla_f32_x(pred_full, v589, v944, v314, 90); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v518), "w"(v602)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v589), "w"(v589)); - svfloat32_t v624; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v623), "w"(v602)); - svfloat32_t v667; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v580), "w"(v627)); - svfloat32_t v675; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v580), "w"(v627)); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v518, v602); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v589, v589); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v623, v602); + svfloat32_t v667 = svadd_f32_x(svptrue_b32(), v580, v627); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v580, v627); svfloat32_t v591 = svcmla_f32_x(pred_full, v590, v945, v259, 90); svfloat32_t v601 = svcmla_f32_x(pred_full, v590, v946, v267, 90); svfloat32_t v604 = svcmla_f32_x(pred_full, v603, v948, v265, 90); svfloat32_t v605 = svcmla_f32_x(pred_full, v603, v949, v273, 90); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v606), "w"(v606)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v600), "w"(v606)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v613), "w"(v606)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v624), "w"(v606)); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v606, v606); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v600, v606); + svfloat32_t v614 = 
svadd_f32_x(svptrue_b32(), v613, v606); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v624, v606); svst1_f64(pred_full, (double *)(v1005), svreinterpret_f64_f32(v667)); svst1_f64(pred_full, (double *)(v1014), svreinterpret_f64_f32(v675)); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v609), "w"(v601)); - svfloat32_t v612; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v611), "w"(v604)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v615), "w"(v608)); - svfloat32_t v618; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v617), "w"(v591)); - svfloat32_t v621; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v620), "w"(v605)); - svfloat32_t v651; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v651) : "w"(v575), "w"(v614)); - svfloat32_t v659; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v575), "w"(v614)); - svfloat32_t v747; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v579), "w"(v625)); - svfloat32_t v755; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v579), "w"(v625)); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v618), "w"(v600)); - svfloat32_t v622; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v621), "w"(v607)); - svfloat32_t v635; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v573), "w"(v610)); - svfloat32_t v643; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v643) : "w"(v573), "w"(v610)); - svfloat32_t v699; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v576), "w"(v616)); - svfloat32_t v707; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v576), "w"(v616)); - svfloat32_t v715; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v715) : "w"(v574), "w"(v612)); - svfloat32_t v723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v574), "w"(v612)); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v609, v601); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v611, v604); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v615, v608); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v617, v591); + svfloat32_t v621 
= svsub_f32_x(svptrue_b32(), v620, v605); + svfloat32_t v651 = svadd_f32_x(svptrue_b32(), v575, v614); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v575, v614); + svfloat32_t v747 = svadd_f32_x(svptrue_b32(), v579, v625); + svfloat32_t v755 = svsub_f32_x(svptrue_b32(), v579, v625); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v618, v600); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v621, v607); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v573, v610); + svfloat32_t v643 = svsub_f32_x(svptrue_b32(), v573, v610); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v576, v616); + svfloat32_t v707 = svsub_f32_x(svptrue_b32(), v576, v616); + svfloat32_t v715 = svadd_f32_x(svptrue_b32(), v574, v612); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v574, v612); svst1_f64(pred_full, (double *)(v987), svreinterpret_f64_f32(v651)); svst1_f64(pred_full, (double *)(v996), svreinterpret_f64_f32(v659)); svst1_f64(pred_full, (double *)(v1095), svreinterpret_f64_f32(v747)); svst1_f64(pred_full, (double *)(v1104), svreinterpret_f64_f32(v755)); - svfloat32_t v683; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v577), "w"(v619)); - svfloat32_t v691; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v577), "w"(v619)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v578), "w"(v622)); - svfloat32_t v739; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v578), "w"(v622)); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v577, v619); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v577, v619); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v578, v622); + svfloat32_t v739 = svsub_f32_x(svptrue_b32(), v578, v622); svst1_f64(pred_full, (double *)(v969), svreinterpret_f64_f32(v635)); svst1_f64(pred_full, (double *)(v978), svreinterpret_f64_f32(v643)); svst1_f64(pred_full, (double *)(v1041), svreinterpret_f64_f32(v699)); @@ -11861,38 +8084,21 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, const float32x2_t *v740 = &v5[v0]; 
float32x2_t *v863 = &v6[v2]; int64_t v19 = v0 * 9; - float32x2_t v30 = v7[8]; int64_t v34 = v0 * 2; int64_t v45 = v0 * 11; - float32x2_t v56 = v7[1]; - float32x2_t v60 = v7[10]; int64_t v64 = v0 * 4; int64_t v75 = v0 * 13; - float32x2_t v86 = v7[3]; - float32x2_t v90 = v7[12]; int64_t v94 = v0 * 6; int64_t v105 = v0 * 15; - float32x2_t v116 = v7[5]; - float32x2_t v120 = v7[14]; int64_t v124 = v0 * 8; int64_t v135 = v0 * 17; - float32x2_t v146 = v7[7]; - float32x2_t v150 = v7[16]; int64_t v154 = v0 * 10; - float32x2_t v176 = v7[9]; - float32x2_t v180 = v7[0]; int64_t v184 = v0 * 12; int64_t v195 = v0 * 3; - float32x2_t v206 = v7[11]; - float32x2_t v210 = v7[2]; int64_t v214 = v0 * 14; int64_t v225 = v0 * 5; - float32x2_t v236 = v7[13]; - float32x2_t v240 = v7[4]; int64_t v244 = v0 * 16; int64_t v255 = v0 * 7; - float32x2_t v266 = v7[15]; - float32x2_t v270 = v7[6]; float v454 = v4 * v451; float v476 = v4 * v473; float v483 = v4 * v480; @@ -11922,40 +8128,40 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, float32x2_t *v836 = &v6[0]; svfloat32_t v1013 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v740)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v121; - asm("mov %0.d, %d1" : "=w"(v121) : "w"(v120)); - svfloat32_t v147; - asm("mov %0.d, %d1" : "=w"(v147) : "w"(v146)); - svfloat32_t v151; - asm("mov %0.d, %d1" : "=w"(v151) : "w"(v150)); - svfloat32_t v177; - asm("mov %0.d, %d1" : "=w"(v177) : "w"(v176)); - svfloat32_t v181; - asm("mov %0.d, %d1" : "=w"(v181) : "w"(v180)); - svfloat32_t v207; - asm("mov %0.d, %d1" : "=w"(v207) : "w"(v206)); - 
svfloat32_t v211; - asm("mov %0.d, %d1" : "=w"(v211) : "w"(v210)); - svfloat32_t v237; - asm("mov %0.d, %d1" : "=w"(v237) : "w"(v236)); - svfloat32_t v241; - asm("mov %0.d, %d1" : "=w"(v241) : "w"(v240)); - svfloat32_t v267; - asm("mov %0.d, %d1" : "=w"(v267) : "w"(v266)); - svfloat32_t v271; - asm("mov %0.d, %d1" : "=w"(v271) : "w"(v270)); + svfloat32_t v31 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v57 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v61 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v91 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v121 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v147 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v151 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v177 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v181 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v207 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v211 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v237 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v241 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v267 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v271 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); const float32x2_t *v650 = &v5[v19]; const float32x2_t *v659 = &v5[v34]; const float32x2_t *v668 = &v5[v45]; @@ -11994,8 +8200,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const 
armral_cmplx_f32_t *restrict x, float32x2_t *v989 = &v6[v638]; svfloat32_t v1027 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v804)[0])); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); + svfloat32_t zero182 = svdup_n_f32(0); svfloat32_t v182 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero182, v1013, v181, 0), v1013, v181, 90); @@ -12031,289 +8236,181 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v785)[0])); svfloat32_t v1025 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v794)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); + svfloat32_t zero32 = svdup_n_f32(0); svfloat32_t v32 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v993, v31, 0), v993, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); + svfloat32_t zero58 = svdup_n_f32(0); svfloat32_t v58 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v995, v57, 0), v995, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); + svfloat32_t zero62 = svdup_n_f32(0); svfloat32_t v62 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v997, v61, 0), v997, v61, 90); - svfloat32_t zero88; - asm volatile("mov %0.s, #0" : "=w"(zero88)); + svfloat32_t zero88 = svdup_n_f32(0); svfloat32_t v88 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v999, v87, 0), v999, v87, 90); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); + svfloat32_t zero92 = svdup_n_f32(0); svfloat32_t v92 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v1001, v91, 0), v1001, v91, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero118, v1003, v117, 0), v1003, v117, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : 
"=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero122, v1005, v121, 0), v1005, v121, 90); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero148, v1007, v147, 0), v1007, v147, 90); - svfloat32_t zero152; - asm volatile("mov %0.s, #0" : "=w"(zero152)); + svfloat32_t zero152 = svdup_n_f32(0); svfloat32_t v152 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero152, v1009, v151, 0), v1009, v151, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero178, v1011, v177, 0), v1011, v177, 90); - svfloat32_t zero208; - asm volatile("mov %0.s, #0" : "=w"(zero208)); + svfloat32_t zero208 = svdup_n_f32(0); svfloat32_t v208 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero208, v1015, v207, 0), v1015, v207, 90); - svfloat32_t zero212; - asm volatile("mov %0.s, #0" : "=w"(zero212)); + svfloat32_t zero212 = svdup_n_f32(0); svfloat32_t v212 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero212, v1017, v211, 0), v1017, v211, 90); - svfloat32_t zero238; - asm volatile("mov %0.s, #0" : "=w"(zero238)); + svfloat32_t zero238 = svdup_n_f32(0); svfloat32_t v238 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero238, v1019, v237, 0), v1019, v237, 90); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero242, v1021, v241, 0), v1021, v241, 90); - svfloat32_t zero268; - asm volatile("mov %0.s, #0" : "=w"(zero268)); + svfloat32_t zero268 = svdup_n_f32(0); svfloat32_t v268 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero268, v1023, v267, 0), v1023, v267, 90); - svfloat32_t zero272; - asm volatile("mov 
%0.s, #0" : "=w"(zero272)); + svfloat32_t zero272 = svdup_n_f32(0); svfloat32_t v272 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero272, v1025, v271, 0), v1025, v271, 90); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v1027), "w"(v32)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v1027), "w"(v32)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v58), "w"(v62)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v58), "w"(v62)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v88), "w"(v92)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v88), "w"(v92)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v118), "w"(v122)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v118), "w"(v122)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v148), "w"(v152)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v148), "w"(v152)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v178), "w"(v182)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v178), "w"(v182)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v208), "w"(v212)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v208), "w"(v212)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v238), "w"(v242)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v238), "w"(v242)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v268), "w"(v272)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v268), "w"(v272)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v282), "w"(v296)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v282), "w"(v296)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : 
"w"(v294), "w"(v284)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v294), "w"(v284)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v286), "w"(v292)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v286), "w"(v292)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v288), "w"(v290)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v288), "w"(v290)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v283), "w"(v297)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v283), "w"(v297)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v295), "w"(v285)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v295), "w"(v285)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v287), "w"(v293)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v287), "w"(v293)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v289), "w"(v291)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v289), "w"(v291)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v298), "w"(v300)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v299), "w"(v301)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v298), "w"(v300)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v300), "w"(v304)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v304), "w"(v298)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v299), "w"(v301)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v301), "w"(v305)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v305), "w"(v299)); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v1027, 
v32); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v1027, v32); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v268, v272); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v268, v272); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v282, v296); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v282, v296); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v294, v284); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v294, v284); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v288, v290); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v288, v290); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v283, v297); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v283, v297); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v295, v285); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v295, v285); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v287, v293); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v287, v293); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v289, v291); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), 
v289, v291); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v300, v304); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v304, v298); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v301, v305); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v305, v299); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x(pred_full, zero346, v822, v303, 90); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v408), "w"(v410)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v409), "w"(v411)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v408), "w"(v410)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v410), "w"(v414)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v414), "w"(v408)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v409), "w"(v411)); - svfloat32_t v426; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v411), "w"(v415)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v415), "w"(v409)); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v408, v410); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v408, v410); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v410, v414); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v414, v408); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v411, v415); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v415, v409); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v822, v413, 90); - svfloat32_t 
v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v306), "w"(v304)); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v310), "w"(v305)); - svfloat32_t zero368; - asm volatile("mov %0.s, #0" : "=w"(zero368)); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v310, v305); + svfloat32_t zero368 = svdup_n_f32(0); svfloat32_t v368 = svcmla_f32_x(pred_full, zero368, v826, v315, 90); - svfloat32_t zero375; - asm volatile("mov %0.s, #0" : "=w"(zero375)); + svfloat32_t zero375 = svdup_n_f32(0); svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v827, v316, 90); - svfloat32_t zero382; - asm volatile("mov %0.s, #0" : "=w"(zero382)); + svfloat32_t zero382 = svdup_n_f32(0); svfloat32_t v382 = svcmla_f32_x(pred_full, zero382, v828, v317, 90); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v416), "w"(v414)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v415)); - svfloat32_t zero478; - asm volatile("mov %0.s, #0" : "=w"(zero478)); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v414); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v420, v415); + svfloat32_t zero478 = svdup_n_f32(0); svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v826, v425, 90); - svfloat32_t zero485; - asm volatile("mov %0.s, #0" : "=w"(zero485)); + svfloat32_t zero485 = svdup_n_f32(0); svfloat32_t v485 = svcmla_f32_x(pred_full, zero485, v827, v426, 90); - svfloat32_t zero492; - asm volatile("mov %0.s, #0" : "=w"(zero492)); + svfloat32_t zero492 = svdup_n_f32(0); svfloat32_t v492 = svcmla_f32_x(pred_full, zero492, v828, v427, 90); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v307), "w"(v302)); - svfloat32_t v327; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v307), "w"(v819)); - svfloat32_t zero334; - asm volatile("mov %0.s, #0" : "=w"(zero334)); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v307, v302); + svfloat32_t v327 = 
svmul_f32_x(svptrue_b32(), v307, v819); + svfloat32_t zero334 = svdup_n_f32(0); svfloat32_t v334 = svcmla_f32_x(pred_full, zero334, v822, v311, 90); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v346), "w"(v368)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v346), "w"(v375)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v346), "w"(v368)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v417), "w"(v412)); - svfloat32_t v437; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v417), "w"(v819)); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v346, v368); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v346, v375); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v346, v368); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v417, v412); + svfloat32_t v437 = svmul_f32_x(svptrue_b32(), v417, v819); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v822, v421, 90); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v456), "w"(v478)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v456), "w"(v485)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v456), "w"(v478)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v308), "w"(v280)); - svfloat32_t v383; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v327), "w"(v327)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v396), "w"(v375)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v398), "w"(v382)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v400), "w"(v382)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v418), "w"(v281)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v437), "w"(v437)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v507) : "w"(v506), "w"(v485)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v508), "w"(v492)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v510), "w"(v492)); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v456, v478); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v456, v485); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v456, v478); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v280); + svfloat32_t v383 = svadd_f32_x(svptrue_b32(), v327, v327); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v396, v375); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v398, v382); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v400, v382); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v418, v281); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v437, v437); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v485); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v492); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v510, v492); svfloat32_t v384 = svmla_f32_x(pred_full, v383, v307, v819); svfloat32_t v388 = svmla_f32_x(pred_full, v309, v302, v821); svfloat32_t v494 = svmla_f32_x(pred_full, v493, v417, v819); svfloat32_t v498 = svmla_f32_x(pred_full, v419, v412, v821); svst1_f64(pred_full, (double *)(v836), svreinterpret_f64_f32(v309)); svst1_f64(pred_full, (double *)(v845), svreinterpret_f64_f32(v419)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v309), "w"(v384)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v388), "w"(v383)); - svfloat32_t v495; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v419), "w"(v494)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v493)); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v385), "w"(v334)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v385), "w"(v334)); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v309, v384); + svfloat32_t 
v389 = svadd_f32_x(svptrue_b32(), v388, v383); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v419, v494); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v493); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v385, v334); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v385, v334); svfloat32_t v390 = svmla_f32_x(pred_full, v389, v312, v823); svfloat32_t v392 = svmls_f32_x(pred_full, v389, v313, v824); svfloat32_t v394 = svmls_f32_x(pred_full, v389, v312, v823); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v444)); - svfloat32_t v497; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v495), "w"(v444)); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v495, v444); + svfloat32_t v497 = svsub_f32_x(svptrue_b32(), v495, v444); svfloat32_t v500 = svmla_f32_x(pred_full, v499, v422, v823); svfloat32_t v502 = svmls_f32_x(pred_full, v499, v423, v824); svfloat32_t v504 = svmls_f32_x(pred_full, v499, v422, v823); @@ -12327,30 +8424,18 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu18(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v899), svreinterpret_f64_f32(v497)); svst1_f64(pred_full, (double *)(v944), svreinterpret_f64_f32(v386)); svst1_f64(pred_full, (double *)(v953), svreinterpret_f64_f32(v496)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v391), "w"(v397)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v391), "w"(v397)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v393), "w"(v399)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v393), "w"(v399)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v395), "w"(v401)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v395), "w"(v401)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v501), "w"(v507)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v501), "w"(v507)); - svfloat32_t v514; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v503), "w"(v509)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v503), "w"(v509)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v505), "w"(v511)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v505), "w"(v511)); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v391, v397); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v391, v397); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v393, v399); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v393, v399); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v395, v401); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v395, v401); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v501, v507); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v501, v507); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v503, v509); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v503, v509); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v505, v511); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v505, v511); svst1_f64(pred_full, (double *)(v854), svreinterpret_f64_f32(v403)); svst1_f64(pred_full, (double *)(v863), svreinterpret_f64_f32(v513)); svst1_f64(pred_full, (double *)(v872), svreinterpret_f64_f32(v404)); @@ -13567,40 +9652,22 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, const float32x2_t *v853 = &v5[v0]; float32x2_t *v1074 = &v6[v2]; int64_t v30 = v0 * 18; - float32x2_t v41 = v7[0]; - float32x2_t v45 = v7[17]; int64_t v49 = v0 * 2; int64_t v60 = v0 * 17; - float32x2_t v71 = v7[16]; - float32x2_t v75 = v7[1]; int64_t v79 = v0 * 4; int64_t v90 = v0 * 15; - float32x2_t v101 = v7[3]; - float32x2_t v105 = v7[14]; int64_t v109 = v0 * 8; int64_t v120 = v0 * 11; - float32x2_t v131 = v7[10]; - float32x2_t v135 = v7[7]; int64_t v139 = v0 * 16; int64_t v150 = v0 * 3; - float32x2_t v161 = v7[15]; - float32x2_t v165 = v7[2]; int64_t v169 = v0 * 13; int64_t v180 = v0 * 6; - float32x2_t v191 = 
v7[5]; - float32x2_t v195 = v7[12]; int64_t v199 = v0 * 7; int64_t v210 = v0 * 12; - float32x2_t v221 = v7[6]; - float32x2_t v225 = v7[11]; int64_t v229 = v0 * 14; int64_t v240 = v0 * 5; - float32x2_t v251 = v7[4]; - float32x2_t v255 = v7[13]; int64_t v259 = v0 * 9; int64_t v270 = v0 * 10; - float32x2_t v281 = v7[8]; - float32x2_t v285 = v7[9]; float v482 = v4 * v479; float v489 = v4 * v486; float v496 = v4 * v493; @@ -13660,42 +9727,42 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, float32x2_t *v1065 = &v6[0]; svfloat32_t v1231 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v853)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v102; - asm("mov %0.d, %d1" : "=w"(v102) : "w"(v101)); - svfloat32_t v106; - asm("mov %0.d, %d1" : "=w"(v106) : "w"(v105)); - svfloat32_t v132; - asm("mov %0.d, %d1" : "=w"(v132) : "w"(v131)); - svfloat32_t v136; - asm("mov %0.d, %d1" : "=w"(v136) : "w"(v135)); - svfloat32_t v162; - asm("mov %0.d, %d1" : "=w"(v162) : "w"(v161)); - svfloat32_t v166; - asm("mov %0.d, %d1" : "=w"(v166) : "w"(v165)); - svfloat32_t v192; - asm("mov %0.d, %d1" : "=w"(v192) : "w"(v191)); - svfloat32_t v196; - asm("mov %0.d, %d1" : "=w"(v196) : "w"(v195)); - svfloat32_t v222; - asm("mov %0.d, %d1" : "=w"(v222) : "w"(v221)); - svfloat32_t v226; - asm("mov %0.d, %d1" : "=w"(v226) : "w"(v225)); - svfloat32_t v252; - asm("mov %0.d, %d1" : "=w"(v252) : "w"(v251)); - svfloat32_t v256; - asm("mov %0.d, %d1" : "=w"(v256) : "w"(v255)); - svfloat32_t v282; - asm("mov %0.d, %d1" : "=w"(v282) : "w"(v281)); - svfloat32_t v286; - asm("mov %0.d, %d1" : "=w"(v286) : "w"(v285)); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v46 = + 
svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[17])); + svfloat32_t v72 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v76 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v102 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v106 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v132 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v136 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v162 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v166 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v192 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v196 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v222 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v226 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v252 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v256 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v282 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v286 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); const float32x2_t *v862 = &v5[v30]; const float32x2_t *v871 = &v5[v49]; const float32x2_t *v880 = &v5[v60]; @@ -13751,8 +9818,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, float32x2_t *v1227 = &v6[v841]; svfloat32_t v1267 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1016)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); + svfloat32_t zero43 = svdup_n_f32(0); svfloat32_t v43 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v1231, v42, 0), v1231, v42, 90); @@ 
-13790,488 +9856,295 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu19(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v997)[0])); svfloat32_t v1265 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1006)[0])); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); + svfloat32_t zero47 = svdup_n_f32(0); svfloat32_t v47 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v1233, v46, 0), v1233, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1237, v72, 0), v1237, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); + svfloat32_t zero77 = svdup_n_f32(0); svfloat32_t v77 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v1235, v76, 0), v1235, v76, 90); - svfloat32_t zero103; - asm volatile("mov %0.s, #0" : "=w"(zero103)); + svfloat32_t zero103 = svdup_n_f32(0); svfloat32_t v103 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero103, v1239, v102, 0), v1239, v102, 90); - svfloat32_t zero107; - asm volatile("mov %0.s, #0" : "=w"(zero107)); + svfloat32_t zero107 = svdup_n_f32(0); svfloat32_t v107 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero107, v1241, v106, 0), v1241, v106, 90); - svfloat32_t zero133; - asm volatile("mov %0.s, #0" : "=w"(zero133)); + svfloat32_t zero133 = svdup_n_f32(0); svfloat32_t v133 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero133, v1245, v132, 0), v1245, v132, 90); - svfloat32_t zero137; - asm volatile("mov %0.s, #0" : "=w"(zero137)); + svfloat32_t zero137 = svdup_n_f32(0); svfloat32_t v137 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero137, v1243, v136, 0), v1243, v136, 90); - svfloat32_t zero163; - asm volatile("mov %0.s, #0" : "=w"(zero163)); + svfloat32_t zero163 = svdup_n_f32(0); svfloat32_t v163 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero163, 
v1247, v162, 0), v1247, v162, 90); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); + svfloat32_t zero167 = svdup_n_f32(0); svfloat32_t v167 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero167, v1249, v166, 0), v1249, v166, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); + svfloat32_t zero193 = svdup_n_f32(0); svfloat32_t v193 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero193, v1253, v192, 0), v1253, v192, 90); - svfloat32_t zero197; - asm volatile("mov %0.s, #0" : "=w"(zero197)); + svfloat32_t zero197 = svdup_n_f32(0); svfloat32_t v197 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero197, v1251, v196, 0), v1251, v196, 90); - svfloat32_t zero223; - asm volatile("mov %0.s, #0" : "=w"(zero223)); + svfloat32_t zero223 = svdup_n_f32(0); svfloat32_t v223 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero223, v1255, v222, 0), v1255, v222, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1257, v226, 0), v1257, v226, 90); - svfloat32_t zero253; - asm volatile("mov %0.s, #0" : "=w"(zero253)); + svfloat32_t zero253 = svdup_n_f32(0); svfloat32_t v253 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero253, v1261, v252, 0), v1261, v252, 90); - svfloat32_t zero257; - asm volatile("mov %0.s, #0" : "=w"(zero257)); + svfloat32_t zero257 = svdup_n_f32(0); svfloat32_t v257 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero257, v1259, v256, 0), v1259, v256, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero283, v1263, v282, 0), v1263, v282, 90); - svfloat32_t zero287; - asm volatile("mov %0.s, #0" : "=w"(zero287)); + svfloat32_t zero287 = svdup_n_f32(0); svfloat32_t v287 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, 
zero287, v1265, v286, 0), v1265, v286, 90); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v43), "w"(v47)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v43), "w"(v47)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v77), "w"(v73)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v73), "w"(v77)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v103), "w"(v107)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v103), "w"(v107)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v137), "w"(v133)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v133), "w"(v137)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v163), "w"(v167)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v163), "w"(v167)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v197), "w"(v193)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v193), "w"(v197)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v223), "w"(v227)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v223), "w"(v227)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v257), "w"(v253)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v253), "w"(v257)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v283), "w"(v287)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v283), "w"(v287)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v288), "w"(v300)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v290), "w"(v302)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v292), "w"(v304)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v294), "w"(v300)); - svfloat32_t v310; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v296), "w"(v302)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v298), "w"(v304)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v288), "w"(v294)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v290), "w"(v296)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v292), "w"(v298)); - svfloat32_t v346; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v289), "w"(v301)); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v291), "w"(v303)); - svfloat32_t v348; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v293), "w"(v305)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v295), "w"(v301)); - svfloat32_t v350; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v297), "w"(v303)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v299), "w"(v305)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v289), "w"(v295)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v291), "w"(v297)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v293), "w"(v299)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v300)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v314), "w"(v302)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v316), "w"(v304)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v306), "w"(v308)); - svfloat32_t v319; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v309), "w"(v311)); - svfloat32_t v336; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v306), "w"(v309)); - svfloat32_t v337; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v308), "w"(v311)); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v352), "w"(v301)); - svfloat32_t v355; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v354), 
"w"(v303)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v356), "w"(v305)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v346), "w"(v348)); - svfloat32_t v359; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v349), "w"(v351)); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v346), "w"(v349)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v348), "w"(v351)); - svfloat32_t zero533; - asm volatile("mov %0.s, #0" : "=w"(zero533)); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v77, v73); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v137, v133); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v133, v137); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v163, v167); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v163, v167); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v197, v193); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v193, v197); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v223, v227); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v223, v227); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v257, v253); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v253, v257); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v283, v287); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v283, v287); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v288, v300); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v290, v302); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v292, v304); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v294, v300); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v296, v302); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v298, v304); + svfloat32_t v312 = 
svadd_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v290, v296); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v292, v298); + svfloat32_t v346 = svsub_f32_x(svptrue_b32(), v289, v301); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v291, v303); + svfloat32_t v348 = svsub_f32_x(svptrue_b32(), v293, v305); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v295, v301); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v297, v303); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v299, v305); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v289, v295); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v293, v299); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v312, v300); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v314, v302); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v316, v304); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v309, v311); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v306, v309); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v308, v311); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v352, v301); + svfloat32_t v355 = svadd_f32_x(svptrue_b32(), v354, v303); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v356, v305); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v349, v351); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v346, v349); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v348, v351); + svfloat32_t zero533 = svdup_n_f32(0); svfloat32_t v533 = svcmla_f32_x(pred_full, zero533, v1046, v349, 90); - svfloat32_t zero554; - asm volatile("mov %0.s, #0" : "=w"(zero554)); + svfloat32_t zero554 = svdup_n_f32(0); svfloat32_t v554 = svcmla_f32_x(pred_full, zero554, v1049, v351, 90); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v313), "w"(v315)); - svfloat32_t v330; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v330) : "w"(v319), "w"(v310)); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v318), "w"(v307)); - svfloat32_t v333; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v319), "w"(v310)); - svfloat32_t v334; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v318), "w"(v307)); - svfloat32_t v338; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v306), "w"(v337)); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v336), "w"(v311)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v313), "w"(v317)); - svfloat32_t v344; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v315), "w"(v317)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v353), "w"(v355)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v359), "w"(v350)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v358), "w"(v347)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v359), "w"(v350)); - svfloat32_t v366; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v358), "w"(v347)); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v346), "w"(v369)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v368), "w"(v351)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v353), "w"(v357)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v355), "w"(v357)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v320), "w"(v317)); - svfloat32_t v332; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v331), "w"(v330)); - svfloat32_t v335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v334), "w"(v333)); - svfloat32_t v339; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v338), "w"(v310)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v340), "w"(v307)); - svfloat32_t v345; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v343), "w"(v344)); - svfloat32_t v361; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v360), "w"(v357)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v363), "w"(v362)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v366), "w"(v365)); - svfloat32_t v371; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v350)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v372), "w"(v347)); - svfloat32_t v377; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v375), "w"(v376)); - svfloat32_t v397; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v331), "w"(v1022)); - svfloat32_t v412; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v334), "w"(v1025)); - svfloat32_t zero491; - asm volatile("mov %0.s, #0" : "=w"(zero491)); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v319, v310); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v318, v307); + svfloat32_t v333 = svsub_f32_x(svptrue_b32(), v319, v310); + svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v318, v307); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v306, v337); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v336, v311); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v313, v317); + svfloat32_t v344 = svsub_f32_x(svptrue_b32(), v315, v317); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v353, v355); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v359, v350); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v358, v347); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v359, v350); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v358, v347); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v346, v369); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v368, v351); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v353, v357); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v355, v357); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v320, v317); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v331, v330); + svfloat32_t v335 = 
svsub_f32_x(svptrue_b32(), v334, v333); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v338, v310); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v340, v307); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v343, v344); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v360, v357); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v363, v362); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v366, v365); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v370, v350); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v372, v347); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v375, v376); + svfloat32_t v397 = svmul_f32_x(svptrue_b32(), v331, v1022); + svfloat32_t v412 = svmul_f32_x(svptrue_b32(), v334, v1025); + svfloat32_t zero491 = svdup_n_f32(0); svfloat32_t v491 = svcmla_f32_x(pred_full, zero491, v1040, v362, 90); - svfloat32_t zero512; - asm volatile("mov %0.s, #0" : "=w"(zero512)); + svfloat32_t zero512 = svdup_n_f32(0); svfloat32_t v512 = svcmla_f32_x(pred_full, zero512, v1043, v365, 90); - svfloat32_t zero596; - asm volatile("mov %0.s, #0" : "=w"(zero596)); + svfloat32_t zero596 = svdup_n_f32(0); svfloat32_t v596 = svcmla_f32_x(pred_full, zero596, v1055, v375, 90); - svfloat32_t zero603; - asm volatile("mov %0.s, #0" : "=w"(zero603)); + svfloat32_t zero603 = svdup_n_f32(0); svfloat32_t v603 = svcmla_f32_x(pred_full, zero603, v1056, v376, 90); - svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v1267), "w"(v321)); - svfloat32_t v342; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v339), "w"(v341)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v371), "w"(v373)); - svfloat32_t v402; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v332), "w"(v1023)); - svfloat32_t v417; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v335), "w"(v1026)); - svfloat32_t v477; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v345), "w"(v1038)); - svfloat32_t zero484; - asm volatile("mov %0.s, #0" : "=w"(zero484)); + svfloat32_t v329 = 
svadd_f32_x(svptrue_b32(), v1267, v321); + svfloat32_t v342 = svsub_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v402 = svmul_f32_x(svptrue_b32(), v332, v1023); + svfloat32_t v417 = svmul_f32_x(svptrue_b32(), v335, v1026); + svfloat32_t v477 = svmul_f32_x(svptrue_b32(), v345, v1038); + svfloat32_t zero484 = svdup_n_f32(0); svfloat32_t v484 = svcmla_f32_x(pred_full, zero484, v1039, v361, 90); - svfloat32_t zero610; - asm volatile("mov %0.s, #0" : "=w"(zero610)); + svfloat32_t zero610 = svdup_n_f32(0); svfloat32_t v610 = svcmla_f32_x(pred_full, zero610, v1057, v377, 90); svfloat32_t v611 = svmla_f32_x(pred_full, v397, v330, v1021); svfloat32_t v612 = svmla_f32_x(pred_full, v412, v333, v1024); svfloat32_t v642 = svcmla_f32_x(pred_full, v491, v1041, v363, 90); svfloat32_t v643 = svcmla_f32_x(pred_full, v512, v1044, v366, 90); - svfloat32_t v462; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v342), "w"(v1035)); - svfloat32_t zero589; - asm volatile("mov %0.s, #0" : "=w"(zero589)); + svfloat32_t v462 = svmul_f32_x(svptrue_b32(), v342, v1035); + svfloat32_t zero589 = svdup_n_f32(0); svfloat32_t v589 = svcmla_f32_x(pred_full, zero589, v1054, v374, 90); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v611), "w"(v612)); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v611, v612); svfloat32_t v615 = svmla_f32_x(pred_full, v402, v330, v1021); svfloat32_t v616 = svmla_f32_x(pred_full, v417, v333, v1024); - svfloat32_t v633; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v633) : "w"(v611), "w"(v612)); + svfloat32_t v633 = svsub_f32_x(svptrue_b32(), v611, v612); svfloat32_t v635 = svnmls_f32_x(pred_full, v477, v343, v1036); svfloat32_t v636 = svnmls_f32_x(pred_full, v477, v344, v1037); svfloat32_t v637 = svmla_f32_x(pred_full, v329, v321, v1020); - svfloat32_t v645; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v645) : "w"(v642), "w"(v643)); + svfloat32_t v645 = svadd_f32_x(svptrue_b32(), v642, v643); 
svfloat32_t v646 = svcmla_f32_x(pred_full, v491, v1042, v364, 90); svfloat32_t v647 = svcmla_f32_x(pred_full, v512, v1045, v367, 90); - svfloat32_t v664; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v642), "w"(v643)); - svfloat32_t v666; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v596), "w"(v610)); - svfloat32_t v667; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v603), "w"(v610)); + svfloat32_t v664 = svsub_f32_x(svptrue_b32(), v642, v643); + svfloat32_t v666 = svsub_f32_x(svptrue_b32(), v596, v610); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v603, v610); svst1_f64(pred_full, (double *)(v1065), svreinterpret_f64_f32(v329)); svfloat32_t v613 = svmla_f32_x(pred_full, v462, v341, v1034); svfloat32_t v617 = svmla_f32_x(pred_full, v462, v339, v1033); svfloat32_t v618 = svnmls_f32_x(pred_full, v614, v309, v1027); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v615), "w"(v616)); - svfloat32_t v625; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v615), "w"(v616)); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v615, v616); + svfloat32_t v625 = svsub_f32_x(svptrue_b32(), v615, v616); svfloat32_t v630 = svmla_f32_x(pred_full, v614, v308, v1032); - svfloat32_t v638; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v638) : "w"(v637), "w"(v635)); - svfloat32_t v639; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v637), "w"(v635)); - svfloat32_t v641; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v641) : "w"(v637), "w"(v636)); + svfloat32_t v638 = svadd_f32_x(svptrue_b32(), v637, v635); + svfloat32_t v639 = svsub_f32_x(svptrue_b32(), v637, v635); + svfloat32_t v641 = svadd_f32_x(svptrue_b32(), v637, v636); svfloat32_t v644 = svcmla_f32_x(pred_full, v589, v1053, v373, 90); svfloat32_t v648 = svcmla_f32_x(pred_full, v589, v1052, v371, 90); - svfloat32_t v649; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v649) : "w"(v533), "w"(v645)); - svfloat32_t v650; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v650) : "w"(v646), "w"(v647)); - svfloat32_t v656; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v656) : "w"(v646), "w"(v647)); + svfloat32_t v649 = svsub_f32_x(svptrue_b32(), v533, v645); + svfloat32_t v650 = svadd_f32_x(svptrue_b32(), v646, v647); + svfloat32_t v656 = svsub_f32_x(svptrue_b32(), v646, v647); svfloat32_t v661 = svcmla_f32_x(pred_full, v645, v1051, v348, 90); - svfloat32_t v668; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v484), "w"(v666)); - svfloat32_t v669; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v484), "w"(v666)); - svfloat32_t v671; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v484), "w"(v667)); + svfloat32_t v668 = svadd_f32_x(svptrue_b32(), v484, v666); + svfloat32_t v669 = svsub_f32_x(svptrue_b32(), v484, v666); + svfloat32_t v671 = svadd_f32_x(svptrue_b32(), v484, v667); svfloat32_t v620 = svnmls_f32_x(pred_full, v617, v311, v1030); svfloat32_t v621 = svmla_f32_x(pred_full, v613, v336, v1028); svfloat32_t v623 = svmla_f32_x(pred_full, v619, v337, v1031); - svfloat32_t v626; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v625), "w"(v613)); - svfloat32_t v627; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v618), "w"(v619)); - svfloat32_t v634; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v633), "w"(v617)); - svfloat32_t v640; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v640) : "w"(v639), "w"(v636)); - svfloat32_t v651; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v651) : "w"(v554), "w"(v648)); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v625, v613); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v618, v619); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v633, v617); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v639, v636); + svfloat32_t v651 = svsub_f32_x(svptrue_b32(), v554, v648); svfloat32_t v652 = svcmla_f32_x(pred_full, v644, v1047, v368, 90); svfloat32_t v654 = svcmla_f32_x(pred_full, v650, v1050, v369, 90); - svfloat32_t v657; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v657) : "w"(v656), "w"(v644)); - svfloat32_t v658; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v649), "w"(v650)); - svfloat32_t 
v665; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v664), "w"(v648)); - svfloat32_t v670; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v670) : "w"(v669), "w"(v667)); - svfloat32_t v622; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v621), "w"(v618)); - svfloat32_t v624; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v623), "w"(v620)); + svfloat32_t v657 = svadd_f32_x(svptrue_b32(), v656, v644); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v649, v650); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v664, v648); + svfloat32_t v670 = svsub_f32_x(svptrue_b32(), v669, v667); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v621, v618); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v623, v620); svfloat32_t v628 = svmla_f32_x(pred_full, v627, v306, v1029); - svfloat32_t v631; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v630), "w"(v620)); - svfloat32_t v653; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v653) : "w"(v652), "w"(v649)); - svfloat32_t v655; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v655) : "w"(v654), "w"(v651)); + svfloat32_t v631 = svadd_f32_x(svptrue_b32(), v630, v620); + svfloat32_t v653 = svadd_f32_x(svptrue_b32(), v652, v649); + svfloat32_t v655 = svadd_f32_x(svptrue_b32(), v654, v651); svfloat32_t v659 = svcmla_f32_x(pred_full, v658, v1048, v346, 90); - svfloat32_t v662; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v661), "w"(v651)); - svfloat32_t v676; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v634), "w"(v626)); - svfloat32_t v680; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v641), "w"(v634)); - svfloat32_t v683; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v626), "w"(v641)); - svfloat32_t v688; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v665), "w"(v657)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v665), "w"(v671)); - svfloat32_t v695; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v657), "w"(v671)); - svfloat32_t v629; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v629) : "w"(v628), "w"(v617)); - svfloat32_t 
v632; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v632) : "w"(v631), "w"(v613)); - svfloat32_t v660; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v659), "w"(v648)); - svfloat32_t v663; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v662), "w"(v644)); - svfloat32_t v677; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v676), "w"(v641)); - svfloat32_t v681; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v622), "w"(v638)); - svfloat32_t v682; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v624), "w"(v640)); - svfloat32_t v689; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v688), "w"(v671)); - svfloat32_t v693; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v653), "w"(v668)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v655), "w"(v670)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v683), "w"(v695)); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v683), "w"(v695)); - svfloat32_t v735; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v735) : "w"(v680), "w"(v692)); - svfloat32_t v743; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v743) : "w"(v680), "w"(v692)); - svfloat32_t v672; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v629), "w"(v622)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v632), "w"(v624)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v638), "w"(v629)); - svfloat32_t v679; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v640), "w"(v632)); - svfloat32_t v684; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v684) : "w"(v660), "w"(v653)); - svfloat32_t v686; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v686) : "w"(v663), "w"(v655)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v668), "w"(v660)); - svfloat32_t v691; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v670), "w"(v663)); - svfloat32_t v751; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v751) : "w"(v682), "w"(v694)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v682), 
"w"(v694)); - svfloat32_t v767; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v677), "w"(v689)); - svfloat32_t v775; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v677), "w"(v689)); - svfloat32_t v815; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v815) : "w"(v681), "w"(v693)); - svfloat32_t v823; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v823) : "w"(v681), "w"(v693)); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v661, v651); + svfloat32_t v676 = svsub_f32_x(svptrue_b32(), v634, v626); + svfloat32_t v680 = svsub_f32_x(svptrue_b32(), v641, v634); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v626, v641); + svfloat32_t v688 = svsub_f32_x(svptrue_b32(), v665, v657); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v665, v671); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v657, v671); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v628, v617); + svfloat32_t v632 = svadd_f32_x(svptrue_b32(), v631, v613); + svfloat32_t v660 = svadd_f32_x(svptrue_b32(), v659, v648); + svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v662, v644); + svfloat32_t v677 = svadd_f32_x(svptrue_b32(), v676, v641); + svfloat32_t v681 = svadd_f32_x(svptrue_b32(), v622, v638); + svfloat32_t v682 = svadd_f32_x(svptrue_b32(), v624, v640); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v688, v671); + svfloat32_t v693 = svadd_f32_x(svptrue_b32(), v653, v668); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v655, v670); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v683, v695); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v683, v695); + svfloat32_t v735 = svadd_f32_x(svptrue_b32(), v680, v692); + svfloat32_t v743 = svsub_f32_x(svptrue_b32(), v680, v692); + svfloat32_t v672 = svsub_f32_x(svptrue_b32(), v629, v622); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v632, v624); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v638, v629); + svfloat32_t v679 = svsub_f32_x(svptrue_b32(), v640, v632); + svfloat32_t v684 = svsub_f32_x(svptrue_b32(), v660, v653); + svfloat32_t v686 = svsub_f32_x(svptrue_b32(), 
v663, v655); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v668, v660); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v670, v663); + svfloat32_t v751 = svadd_f32_x(svptrue_b32(), v682, v694); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v682, v694); + svfloat32_t v767 = svadd_f32_x(svptrue_b32(), v677, v689); + svfloat32_t v775 = svsub_f32_x(svptrue_b32(), v677, v689); + svfloat32_t v815 = svsub_f32_x(svptrue_b32(), v681, v693); + svfloat32_t v823 = svadd_f32_x(svptrue_b32(), v681, v693); svst1_f64(pred_full, (double *)(v1092), svreinterpret_f64_f32(v719)); svst1_f64(pred_full, (double *)(v1101), svreinterpret_f64_f32(v727)); svst1_f64(pred_full, (double *)(v1110), svreinterpret_f64_f32(v735)); svst1_f64(pred_full, (double *)(v1119), svreinterpret_f64_f32(v743)); - svfloat32_t v673; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v672), "w"(v638)); - svfloat32_t v675; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v674), "w"(v640)); - svfloat32_t v685; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v685) : "w"(v684), "w"(v668)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v686), "w"(v670)); - svfloat32_t v783; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v783) : "w"(v679), "w"(v691)); - svfloat32_t v791; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v791) : "w"(v679), "w"(v691)); - svfloat32_t v799; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v678), "w"(v690)); - svfloat32_t v807; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v807) : "w"(v678), "w"(v690)); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v672, v638); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v674, v640); + svfloat32_t v685 = svadd_f32_x(svptrue_b32(), v684, v668); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v686, v670); + svfloat32_t v783 = svadd_f32_x(svptrue_b32(), v679, v691); + svfloat32_t v791 = svsub_f32_x(svptrue_b32(), v679, v691); + svfloat32_t v799 = svadd_f32_x(svptrue_b32(), v678, v690); + svfloat32_t v807 = svsub_f32_x(svptrue_b32(), v678, v690); svst1_f64(pred_full, 
(double *)(v1128), svreinterpret_f64_f32(v751)); svst1_f64(pred_full, (double *)(v1137), svreinterpret_f64_f32(v759)); svst1_f64(pred_full, (double *)(v1146), svreinterpret_f64_f32(v767)); svst1_f64(pred_full, (double *)(v1155), svreinterpret_f64_f32(v775)); svst1_f64(pred_full, (double *)(v1200), svreinterpret_f64_f32(v815)); svst1_f64(pred_full, (double *)(v1209), svreinterpret_f64_f32(v823)); - svfloat32_t v703; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v673), "w"(v685)); - svfloat32_t v711; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v673), "w"(v685)); - svfloat32_t v831; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v831) : "w"(v675), "w"(v687)); - svfloat32_t v839; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v675), "w"(v687)); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v673, v685); + svfloat32_t v711 = svsub_f32_x(svptrue_b32(), v673, v685); + svfloat32_t v831 = svadd_f32_x(svptrue_b32(), v675, v687); + svfloat32_t v839 = svsub_f32_x(svptrue_b32(), v675, v687); svst1_f64(pred_full, (double *)(v1164), svreinterpret_f64_f32(v783)); svst1_f64(pred_full, (double *)(v1173), svreinterpret_f64_f32(v791)); svst1_f64(pred_full, (double *)(v1182), svreinterpret_f64_f32(v799)); @@ -15077,42 +10950,23 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, const float32x2_t *v861 = &v5[v0]; float32x2_t *v959 = &v6[v2]; int64_t v19 = v0 * 10; - float32x2_t v30 = v7[9]; int64_t v34 = v0 * 5; int64_t v45 = v0 * 15; - float32x2_t v56 = v7[4]; - float32x2_t v60 = v7[14]; int64_t v64 = v0 * 4; int64_t v75 = v0 * 14; - float32x2_t v86 = v7[3]; - float32x2_t v90 = v7[13]; int64_t v94 = v0 * 9; int64_t v105 = v0 * 19; - float32x2_t v116 = v7[8]; - float32x2_t v120 = v7[18]; int64_t v124 = v0 * 8; int64_t v135 = v0 * 18; - float32x2_t v146 = v7[7]; - float32x2_t v150 = v7[17]; int64_t v154 = v0 * 13; int64_t v165 = v0 * 3; - float32x2_t v176 = v7[12]; - float32x2_t v180 = v7[2]; int64_t v184 = v0 * 12; int64_t v195 = v0 * 2; - float32x2_t 
v206 = v7[11]; - float32x2_t v210 = v7[1]; int64_t v214 = v0 * 17; int64_t v225 = v0 * 7; - float32x2_t v236 = v7[16]; - float32x2_t v240 = v7[6]; int64_t v244 = v0 * 16; int64_t v255 = v0 * 6; - float32x2_t v266 = v7[15]; - float32x2_t v270 = v7[5]; int64_t v285 = v0 * 11; - float32x2_t v296 = v7[0]; - float32x2_t v300 = v7[10]; float v473 = v4 * v529; float v480 = v4 * v534; float v487 = v4 * v539; @@ -15146,44 +11000,44 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x2_t *v914 = &v6[0]; svfloat32_t v1123 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v861)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v121; - asm("mov %0.d, %d1" : "=w"(v121) : "w"(v120)); - svfloat32_t v147; - asm("mov %0.d, %d1" : "=w"(v147) : "w"(v146)); - svfloat32_t v151; - asm("mov %0.d, %d1" : "=w"(v151) : "w"(v150)); - svfloat32_t v177; - asm("mov %0.d, %d1" : "=w"(v177) : "w"(v176)); - svfloat32_t v181; - asm("mov %0.d, %d1" : "=w"(v181) : "w"(v180)); - svfloat32_t v207; - asm("mov %0.d, %d1" : "=w"(v207) : "w"(v206)); - svfloat32_t v211; - asm("mov %0.d, %d1" : "=w"(v211) : "w"(v210)); - svfloat32_t v237; - asm("mov %0.d, %d1" : "=w"(v237) : "w"(v236)); - svfloat32_t v241; - asm("mov %0.d, %d1" : "=w"(v241) : "w"(v240)); - svfloat32_t v267; - asm("mov %0.d, %d1" : "=w"(v267) : "w"(v266)); - svfloat32_t v271; - asm("mov %0.d, %d1" : "=w"(v271) : "w"(v270)); - svfloat32_t v297; - asm("mov %0.d, %d1" : "=w"(v297) : "w"(v296)); - svfloat32_t v301; - asm("mov %0.d, %d1" : "=w"(v301) : "w"(v300)); + svfloat32_t v31 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t 
*)v7)[9])); + svfloat32_t v57 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v61 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v91 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v121 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[18])); + svfloat32_t v147 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v151 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[17])); + svfloat32_t v177 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v181 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v207 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v211 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v237 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v241 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v267 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v271 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v297 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v301 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); const float32x2_t *v708 = &v5[v19]; const float32x2_t *v717 = &v5[v34]; const float32x2_t *v726 = &v5[v45]; @@ -15228,8 +11082,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, float32x2_t *v1085 = &v6[v696]; svfloat32_t v1127 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v880)[0])); - svfloat32_t zero298; - asm volatile("mov %0.s, #0" : "=w"(zero298)); + svfloat32_t zero298 = 
svdup_n_f32(0); svfloat32_t v298 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero298, v1123, v297, 0), v1123, v297, 90); @@ -15269,329 +11122,208 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu20(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v852)[0])); svfloat32_t v1125 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v870)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); + svfloat32_t zero32 = svdup_n_f32(0); svfloat32_t v32 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v1089, v31, 0), v1089, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); + svfloat32_t zero58 = svdup_n_f32(0); svfloat32_t v58 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v1091, v57, 0), v1091, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); + svfloat32_t zero62 = svdup_n_f32(0); svfloat32_t v62 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v1093, v61, 0), v1093, v61, 90); - svfloat32_t zero88; - asm volatile("mov %0.s, #0" : "=w"(zero88)); + svfloat32_t zero88 = svdup_n_f32(0); svfloat32_t v88 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v1095, v87, 0), v1095, v87, 90); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); + svfloat32_t zero92 = svdup_n_f32(0); svfloat32_t v92 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v1097, v91, 0), v1097, v91, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero118, v1099, v117, 0), v1099, v117, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero122, v1101, v121, 0), v1101, v121, 90); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + 
svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero148, v1103, v147, 0), v1103, v147, 90); - svfloat32_t zero152; - asm volatile("mov %0.s, #0" : "=w"(zero152)); + svfloat32_t zero152 = svdup_n_f32(0); svfloat32_t v152 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero152, v1105, v151, 0), v1105, v151, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero178, v1107, v177, 0), v1107, v177, 90); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); + svfloat32_t zero182 = svdup_n_f32(0); svfloat32_t v182 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero182, v1109, v181, 0), v1109, v181, 90); - svfloat32_t zero208; - asm volatile("mov %0.s, #0" : "=w"(zero208)); + svfloat32_t zero208 = svdup_n_f32(0); svfloat32_t v208 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero208, v1111, v207, 0), v1111, v207, 90); - svfloat32_t zero212; - asm volatile("mov %0.s, #0" : "=w"(zero212)); + svfloat32_t zero212 = svdup_n_f32(0); svfloat32_t v212 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero212, v1113, v211, 0), v1113, v211, 90); - svfloat32_t zero238; - asm volatile("mov %0.s, #0" : "=w"(zero238)); + svfloat32_t zero238 = svdup_n_f32(0); svfloat32_t v238 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero238, v1115, v237, 0), v1115, v237, 90); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero242, v1117, v241, 0), v1117, v241, 90); - svfloat32_t zero268; - asm volatile("mov %0.s, #0" : "=w"(zero268)); + svfloat32_t zero268 = svdup_n_f32(0); svfloat32_t v268 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero268, v1119, v267, 0), v1119, v267, 90); - svfloat32_t zero272; - asm volatile("mov %0.s, #0" : 
"=w"(zero272)); + svfloat32_t zero272 = svdup_n_f32(0); svfloat32_t v272 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero272, v1121, v271, 0), v1121, v271, 90); - svfloat32_t zero302; - asm volatile("mov %0.s, #0" : "=w"(zero302)); + svfloat32_t zero302 = svdup_n_f32(0); svfloat32_t v302 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero302, v1125, v301, 0), v1125, v301, 90); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v1127), "w"(v32)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v1127), "w"(v32)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v58), "w"(v62)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v58), "w"(v62)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v88), "w"(v92)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v88), "w"(v92)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v118), "w"(v122)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v118), "w"(v122)); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v148), "w"(v152)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v148), "w"(v152)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v178), "w"(v182)); - svfloat32_t v325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v178), "w"(v182)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v208), "w"(v212)); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v208), "w"(v212)); - svfloat32_t v330; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v238), "w"(v242)); - svfloat32_t v331; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v238), "w"(v242)); - svfloat32_t v334; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v268), "w"(v272)); - svfloat32_t v335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v268), "w"(v272)); - svfloat32_t v336; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v336) : "w"(v298), "w"(v302)); - svfloat32_t v337; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v298), "w"(v302)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v310), "w"(v312)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v310), "w"(v312)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v316), "w"(v318)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v316), "w"(v318)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v322), "w"(v324)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v322), "w"(v324)); - svfloat32_t v332; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v328), "w"(v330)); - svfloat32_t v333; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v328), "w"(v330)); - svfloat32_t v338; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v334), "w"(v336)); - svfloat32_t v339; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v334), "w"(v336)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v317), "w"(v335)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v317), "w"(v335)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v329), "w"(v323)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v329), "w"(v323)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v319), "w"(v337)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v319), "w"(v337)); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v331), "w"(v325)); - svfloat32_t v502; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v331), "w"(v325)); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v320), "w"(v338)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v320), "w"(v338)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v332), "w"(v326)); - 
svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v332), "w"(v326)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v321), "w"(v339)); - svfloat32_t v394; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v321), "w"(v339)); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v333), "w"(v327)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v333), "w"(v327)); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v446), "w"(v448)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v446), "w"(v448)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v447), "w"(v449)); - svfloat32_t zero475; - asm volatile("mov %0.s, #0" : "=w"(zero475)); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v1127, v32); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v1127, v32); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v268, v272); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v268, v272); + svfloat32_t v336 = svadd_f32_x(svptrue_b32(), v298, v302); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v298, v302); + svfloat32_t 
v314 = svadd_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v316, v318); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v316, v318); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v332 = svadd_f32_x(svptrue_b32(), v328, v330); + svfloat32_t v333 = svsub_f32_x(svptrue_b32(), v328, v330); + svfloat32_t v338 = svadd_f32_x(svptrue_b32(), v334, v336); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v334, v336); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v317, v335); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v317, v335); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v329, v323); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v329, v323); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v319, v337); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v319, v337); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v331, v325); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v331, v325); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v320, v338); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v320, v338); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v332, v326); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v332, v326); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v321, v339); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v321, v339); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v333, v327); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v333, v327); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v447, v449); + svfloat32_t zero475 = svdup_n_f32(0); svfloat32_t v475 = svcmla_f32_x(pred_full, zero475, v898, v447, 90); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v499), "w"(v501)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v504) : "w"(v499), "w"(v501)); - svfloat32_t v505; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v500), "w"(v502)); - svfloat32_t v542; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v542) : "w"(v502), "w"(v906)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v340), "w"(v342)); - svfloat32_t v345; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v340), "w"(v342)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v341), "w"(v343)); - svfloat32_t zero369; - asm volatile("mov %0.s, #0" : "=w"(zero369)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v542 = svmul_f32_x(svptrue_b32(), v502, v906); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v341, v343); + svfloat32_t zero369 = svdup_n_f32(0); svfloat32_t v369 = svcmla_f32_x(pred_full, zero369, v898, v341, 90); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v393), "w"(v395)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v393), "w"(v395)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v394), "w"(v396)); - svfloat32_t zero422; - asm volatile("mov %0.s, #0" : "=w"(zero422)); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v393, v395); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v393, v395); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v394, v396); + svfloat32_t zero422 = svdup_n_f32(0); svfloat32_t v422 = svcmla_f32_x(pred_full, zero422, v898, v394, 90); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v450), "w"(v311)); - svfloat32_t zero482; - asm volatile("mov %0.s, #0" : "=w"(zero482)); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v450, v311); + svfloat32_t zero482 = svdup_n_f32(0); svfloat32_t v482 = 
svcmla_f32_x(pred_full, zero482, v899, v452, 90); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v503), "w"(v313)); - svfloat32_t zero527; - asm volatile("mov %0.s, #0" : "=w"(zero527)); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v503, v313); + svfloat32_t zero527 = svdup_n_f32(0); svfloat32_t v527 = svcmla_f32_x(pred_full, zero527, v903, v504, 90); - svfloat32_t v537; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v505), "w"(v905)); - svfloat32_t v347; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v344), "w"(v314)); - svfloat32_t zero376; - asm volatile("mov %0.s, #0" : "=w"(zero376)); + svfloat32_t v537 = svmul_f32_x(svptrue_b32(), v505, v905); + svfloat32_t v347 = svadd_f32_x(svptrue_b32(), v344, v314); + svfloat32_t zero376 = svdup_n_f32(0); svfloat32_t v376 = svcmla_f32_x(pred_full, zero376, v899, v346, 90); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v397), "w"(v315)); - svfloat32_t zero429; - asm volatile("mov %0.s, #0" : "=w"(zero429)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v397, v315); + svfloat32_t zero429 = svdup_n_f32(0); svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v899, v399, 90); svfloat32_t v490 = svmla_f32_x(pred_full, v453, v450, v896); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v475), "w"(v482)); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v475, v482); svfloat32_t v494 = svcmla_f32_x(pred_full, v482, v900, v449, 90); - svfloat32_t zero513; - asm volatile("mov %0.s, #0" : "=w"(zero513)); + svfloat32_t zero513 = svdup_n_f32(0); svfloat32_t v513 = svcmla_f32_x(pred_full, zero513, v901, v506, 90); svfloat32_t v546 = svnmls_f32_x(pred_full, v537, v500, v904); svfloat32_t v547 = svmla_f32_x(pred_full, v542, v505, v905); svfloat32_t v384 = svmla_f32_x(pred_full, v347, v344, v896); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v369), "w"(v376)); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v369, v376); svfloat32_t v388 = 
svcmla_f32_x(pred_full, v376, v900, v343, 90); svfloat32_t v437 = svmla_f32_x(pred_full, v400, v397, v896); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v422), "w"(v429)); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v422, v429); svfloat32_t v441 = svcmla_f32_x(pred_full, v429, v900, v396, 90); svfloat32_t v491 = svmla_f32_x(pred_full, v490, v451, v897); svfloat32_t v492 = svmls_f32_x(pred_full, v490, v451, v897); svfloat32_t v543 = svcmla_f32_x(pred_full, v513, v902, v503, 90); - svfloat32_t v552; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v453), "w"(v513)); - svfloat32_t v553; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v453), "w"(v513)); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v453, v513); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v453, v513); svst1_f64(pred_full, (double *)(v914), svreinterpret_f64_f32(v347)); svst1_f64(pred_full, (double *)(v932), svreinterpret_f64_f32(v400)); svfloat32_t v385 = svmla_f32_x(pred_full, v384, v345, v897); svfloat32_t v386 = svmls_f32_x(pred_full, v384, v345, v897); svfloat32_t v438 = svmla_f32_x(pred_full, v437, v398, v897); svfloat32_t v439 = svmls_f32_x(pred_full, v437, v398, v897); - svfloat32_t v495; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v491), "w"(v493)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v491), "w"(v493)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v492), "w"(v494)); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v492), "w"(v494)); - svfloat32_t v544; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v544) : "w"(v543), "w"(v527)); - svfloat32_t v545; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v543), "w"(v527)); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v492, v494); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v492, v494); + svfloat32_t v544 = 
svadd_f32_x(svptrue_b32(), v543, v527); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v543, v527); svst1_f64(pred_full, (double *)(v923), svreinterpret_f64_f32(v553)); svst1_f64(pred_full, (double *)(v941), svreinterpret_f64_f32(v552)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v385), "w"(v387)); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v385), "w"(v387)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v386), "w"(v388)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v386), "w"(v388)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v438), "w"(v440)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v438), "w"(v440)); - svfloat32_t v444; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v439), "w"(v441)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v439), "w"(v441)); - svfloat32_t v548; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v544), "w"(v546)); - svfloat32_t v549; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v544), "w"(v546)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v545), "w"(v547)); - svfloat32_t v551; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v545), "w"(v547)); - svfloat32_t v582; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v582) : "w"(v496), "w"(v549)); - svfloat32_t v583; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v496), "w"(v549)); - svfloat32_t v612; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v498), "w"(v551)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v498), "w"(v551)); - svfloat32_t v642; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v497), "w"(v550)); - svfloat32_t v643; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v643) : "w"(v497), "w"(v550)); - svfloat32_t v672; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v495), "w"(v548)); - svfloat32_t v673; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v495), "w"(v548)); 
+ svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v439, v441); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v439, v441); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v544, v546); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v544, v546); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v545, v547); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v545, v547); + svfloat32_t v582 = svadd_f32_x(svptrue_b32(), v496, v549); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v496, v549); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v498, v551); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v498, v551); + svfloat32_t v642 = svadd_f32_x(svptrue_b32(), v497, v550); + svfloat32_t v643 = svsub_f32_x(svptrue_b32(), v497, v550); + svfloat32_t v672 = svadd_f32_x(svptrue_b32(), v495, v548); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v495, v548); svst1_f64(pred_full, (double *)(v950), svreinterpret_f64_f32(v390)); svst1_f64(pred_full, (double *)(v968), svreinterpret_f64_f32(v443)); svst1_f64(pred_full, (double *)(v986), svreinterpret_f64_f32(v392)); @@ -16653,43 +12385,23 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t *v1019 = &v6[v2]; int64_t v19 = v0 * 7; int64_t v30 = v0 * 14; - float32x2_t v41 = v7[6]; - float32x2_t v45 = v7[13]; int64_t v49 = v0 * 10; int64_t v60 = v0 * 17; - float32x2_t v71 = v7[9]; - float32x2_t v75 = v7[16]; int64_t v79 = v0 * 3; - float32x2_t v86 = v7[2]; int64_t v90 = v0 * 13; int64_t v101 = v0 * 20; - float32x2_t v112 = v7[12]; - float32x2_t v116 = v7[19]; int64_t v120 = v0 * 6; - float32x2_t v127 = v7[5]; int64_t v131 = v0 * 16; int64_t 
v142 = v0 * 2; - float32x2_t v153 = v7[15]; - float32x2_t v157 = v7[1]; int64_t v161 = v0 * 9; - float32x2_t v168 = v7[8]; int64_t v172 = v0 * 19; int64_t v183 = v0 * 5; - float32x2_t v194 = v7[18]; - float32x2_t v198 = v7[4]; int64_t v202 = v0 * 12; - float32x2_t v209 = v7[11]; int64_t v224 = v0 * 8; - float32x2_t v235 = v7[0]; - float32x2_t v239 = v7[7]; int64_t v243 = v0 * 15; - float32x2_t v250 = v7[14]; int64_t v254 = v0 * 4; int64_t v265 = v0 * 11; - float32x2_t v276 = v7[3]; - float32x2_t v280 = v7[10]; int64_t v284 = v0 * 18; - float32x2_t v291 = v7[17]; float v368 = v4 * v365; float v375 = v4 * v372; float v382 = v4 * v379; @@ -16739,46 +12451,46 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t *v983 = &v6[0]; svfloat32_t v1195 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v891)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v113; - asm("mov %0.d, %d1" : "=w"(v113) : "w"(v112)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v128; - asm("mov %0.d, %d1" : "=w"(v128) : "w"(v127)); - svfloat32_t v154; - asm("mov %0.d, %d1" : "=w"(v154) : "w"(v153)); - svfloat32_t v158; - asm("mov %0.d, %d1" : "=w"(v158) : "w"(v157)); - svfloat32_t v169; - asm("mov %0.d, %d1" : "=w"(v169) : "w"(v168)); - svfloat32_t v195; - asm("mov %0.d, %d1" : "=w"(v195) : "w"(v194)); - svfloat32_t v199; - asm("mov %0.d, %d1" : "=w"(v199) : "w"(v198)); - svfloat32_t v210; - asm("mov %0.d, %d1" : "=w"(v210) : "w"(v209)); - svfloat32_t v236; - asm("mov %0.d, %d1" : "=w"(v236) : "w"(v235)); - svfloat32_t v240; - asm("mov %0.d, %d1" : "=w"(v240) : "w"(v239)); - svfloat32_t v251; - asm("mov %0.d, %d1" : 
"=w"(v251) : "w"(v250)); - svfloat32_t v277; - asm("mov %0.d, %d1" : "=w"(v277) : "w"(v276)); - svfloat32_t v281; - asm("mov %0.d, %d1" : "=w"(v281) : "w"(v280)); - svfloat32_t v292; - asm("mov %0.d, %d1" : "=w"(v292) : "w"(v291)); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v46 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v72 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v76 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v113 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[19])); + svfloat32_t v128 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v154 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v158 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v169 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v195 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[18])); + svfloat32_t v199 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v210 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v236 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v240 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v251 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v277 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v281 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v292 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[17])); const float32x2_t *v765 = &v5[v19]; const 
float32x2_t *v774 = &v5[v30]; const float32x2_t *v783 = &v5[v49]; @@ -16832,8 +12544,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, float32x2_t *v1163 = &v6[v753]; svfloat32_t v1207 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v946)[0])); - svfloat32_t zero237; - asm volatile("mov %0.s, #0" : "=w"(zero237)); + svfloat32_t zero237 = svdup_n_f32(0); svfloat32_t v237 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero237, v1195, v236, 0), v1195, v236, 90); @@ -16875,101 +12586,73 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v927)[0])); svfloat32_t v1205 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v936)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); + svfloat32_t zero43 = svdup_n_f32(0); svfloat32_t v43 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v1167, v42, 0), v1167, v42, 90); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); + svfloat32_t zero47 = svdup_n_f32(0); svfloat32_t v47 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero47, v1169, v46, 0), v1169, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1171, v72, 0), v1171, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); + svfloat32_t zero77 = svdup_n_f32(0); svfloat32_t v77 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v1173, v76, 0), v1173, v76, 90); - svfloat32_t zero114; - asm volatile("mov %0.s, #0" : "=w"(zero114)); + svfloat32_t zero114 = svdup_n_f32(0); svfloat32_t v114 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero114, v1177, v113, 0), v1177, v113, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 
= svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero118, v1179, v117, 0), v1179, v117, 90); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero155, v1183, v154, 0), v1183, v154, 90); - svfloat32_t zero159; - asm volatile("mov %0.s, #0" : "=w"(zero159)); + svfloat32_t zero159 = svdup_n_f32(0); svfloat32_t v159 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero159, v1185, v158, 0), v1185, v158, 90); - svfloat32_t zero196; - asm volatile("mov %0.s, #0" : "=w"(zero196)); + svfloat32_t zero196 = svdup_n_f32(0); svfloat32_t v196 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero196, v1189, v195, 0), v1189, v195, 90); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero200, v1191, v199, 0), v1191, v199, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero241, v1197, v240, 0), v1197, v240, 90); - svfloat32_t zero278; - asm volatile("mov %0.s, #0" : "=w"(zero278)); + svfloat32_t zero278 = svdup_n_f32(0); svfloat32_t v278 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero278, v1201, v277, 0), v1201, v277, 90); - svfloat32_t zero282; - asm volatile("mov %0.s, #0" : "=w"(zero282)); + svfloat32_t zero282 = svdup_n_f32(0); svfloat32_t v282 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero282, v1203, v281, 0), v1203, v281, 90); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v43), "w"(v47)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v43), "w"(v47)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v73), "w"(v77)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v73), "w"(v77)); 
- svfloat32_t v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v114), "w"(v118)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v114), "w"(v118)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v155), "w"(v159)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v155), "w"(v159)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v196), "w"(v200)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v196), "w"(v200)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v237), "w"(v241)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v237), "w"(v241)); - svfloat32_t v319; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v278), "w"(v282)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v278), "w"(v282)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v294), "w"(v1207)); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v114, v118); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v114, v118); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v196, v200); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v196, v200); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v237, v241); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v237, v241); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v278, v282); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v278, v282); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v294, v1207); svfloat32_t v306 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v304, v1175, v87, 0), v1175, v87, 90); @@ -16988,173 
+12671,98 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v321 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v319, v1205, v292, 0), v1205, v292, 90); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v304), "w"(v319)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v304), "w"(v319)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v313), "w"(v310)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v313), "w"(v310)); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v307), "w"(v316)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v307), "w"(v316)); - svfloat32_t v500; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v305), "w"(v320)); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v305), "w"(v320)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v314), "w"(v311)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v314), "w"(v311)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v308), "w"(v317)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v308), "w"(v317)); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v306), "w"(v321)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v306), "w"(v321)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v315), "w"(v312)); - svfloat32_t v325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v315), "w"(v312)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v309), "w"(v318)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v309), "w"(v318)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v411), "w"(v413)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v411), "w"(v413)); - svfloat32_t v421; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v421) : "w"(v413), "w"(v415)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v415), "w"(v411)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v412), "w"(v414)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v412), "w"(v414)); - svfloat32_t v426; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v414), "w"(v416)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v416), "w"(v412)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v500), "w"(v502)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v500), "w"(v502)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v502), "w"(v504)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v504), "w"(v500)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v501), "w"(v503)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v501), "w"(v503)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v503), "w"(v505)); - svfloat32_t v516; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v505), "w"(v501)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v322), "w"(v324)); - svfloat32_t v331; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v322), "w"(v324)); - svfloat32_t v332; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v324), "w"(v326)); - svfloat32_t v333; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v326), "w"(v322)); - svfloat32_t v334; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v323), "w"(v325)); - svfloat32_t v336; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v323), "w"(v325)); - svfloat32_t v337; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v325), "w"(v327)); - svfloat32_t v338; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v327), "w"(v323)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v417), "w"(v415)); - svfloat32_t 
v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v423), "w"(v416)); - svfloat32_t zero466; - asm volatile("mov %0.s, #0" : "=w"(zero466)); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v304, v319); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v304, v319); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v313, v310); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v313, v310); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v307, v316); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v307, v316); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v305, v320); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v305, v320); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v314, v311); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v314, v311); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v308, v317); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v308, v317); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v306, v321); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v306, v321); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v315, v312); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v315, v312); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v309, v318); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v309, v318); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v411, v413); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v411, v413); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v415, v411); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v414, v416); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v416, v412); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v502, v504); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v504, v500); + svfloat32_t v512 = 
svadd_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v503, v505); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v505, v501); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v324, v326); + svfloat32_t v333 = svsub_f32_x(svptrue_b32(), v326, v322); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v325, v327); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v327, v323); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v417, v415); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v423, v416); + svfloat32_t zero466 = svdup_n_f32(0); svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v964, v425, 90); - svfloat32_t zero473; - asm volatile("mov %0.s, #0" : "=w"(zero473)); + svfloat32_t zero473 = svdup_n_f32(0); svfloat32_t v473 = svcmla_f32_x(pred_full, zero473, v965, v426, 90); - svfloat32_t zero480; - asm volatile("mov %0.s, #0" : "=w"(zero480)); + svfloat32_t zero480 = svdup_n_f32(0); svfloat32_t v480 = svcmla_f32_x(pred_full, zero480, v966, v427, 90); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v506), "w"(v504)); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v505)); - svfloat32_t zero537; - asm volatile("mov %0.s, #0" : "=w"(zero537)); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v504); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v505); + svfloat32_t zero537 = svdup_n_f32(0); svfloat32_t v537 = svcmla_f32_x(pred_full, zero537, v969, v509, 90); - svfloat32_t zero544; - asm volatile("mov %0.s, #0" : "=w"(zero544)); + svfloat32_t zero544 = svdup_n_f32(0); svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v970, v510, 90); - svfloat32_t zero551; - asm 
volatile("mov %0.s, #0" : "=w"(zero551)); + svfloat32_t zero551 = svdup_n_f32(0); svfloat32_t v551 = svcmla_f32_x(pred_full, zero551, v971, v511, 90); - svfloat32_t v561; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v514), "w"(v973)); - svfloat32_t v566; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v515), "w"(v974)); - svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v328), "w"(v326)); - svfloat32_t v335; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v334), "w"(v327)); - svfloat32_t zero377; - asm volatile("mov %0.s, #0" : "=w"(zero377)); + svfloat32_t v561 = svmul_f32_x(svptrue_b32(), v514, v973); + svfloat32_t v566 = svmul_f32_x(svptrue_b32(), v515, v974); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v328, v326); + svfloat32_t v335 = svadd_f32_x(svptrue_b32(), v334, v327); + svfloat32_t zero377 = svdup_n_f32(0); svfloat32_t v377 = svcmla_f32_x(pred_full, zero377, v955, v336, 90); - svfloat32_t zero384; - asm volatile("mov %0.s, #0" : "=w"(zero384)); + svfloat32_t zero384 = svdup_n_f32(0); svfloat32_t v384 = svcmla_f32_x(pred_full, zero384, v956, v337, 90); - svfloat32_t zero391; - asm volatile("mov %0.s, #0" : "=w"(zero391)); + svfloat32_t zero391 = svdup_n_f32(0); svfloat32_t v391 = svcmla_f32_x(pred_full, zero391, v957, v338, 90); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v418), "w"(v294)); - svfloat32_t v437; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v418), "w"(v959)); - svfloat32_t zero459; - asm volatile("mov %0.s, #0" : "=w"(zero459)); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v418, v294); + svfloat32_t v437 = svmul_f32_x(svptrue_b32(), v418, v959); + svfloat32_t zero459 = svdup_n_f32(0); svfloat32_t v459 = svcmla_f32_x(pred_full, zero459, v963, v424, 90); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v507), "w"(v295)); - svfloat32_t v330; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v329), "w"(v303)); - svfloat32_t zero370; - asm volatile("mov %0.s, #0" : 
"=w"(zero370)); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v507, v295); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v329, v303); + svfloat32_t zero370 = svdup_n_f32(0); svfloat32_t v370 = svcmla_f32_x(pred_full, zero370, v954, v335, 90); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v459), "w"(v466)); - svfloat32_t v490; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v459), "w"(v466)); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v459), "w"(v473)); - svfloat32_t zero523; - asm volatile("mov %0.s, #0" : "=w"(zero523)); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v459, v466); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v459, v466); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v459, v473); + svfloat32_t zero523 = svdup_n_f32(0); svfloat32_t v523 = svcmla_f32_x(pred_full, zero523, v967, v508, 90); svfloat32_t v579 = svmla_f32_x(pred_full, v561, v513, v972); svfloat32_t v581 = svnmls_f32_x(pred_full, v561, v513, v972); svfloat32_t v583 = svnmls_f32_x(pred_full, v566, v513, v972); svfloat32_t v392 = svmla_f32_x(pred_full, v330, v329, v950); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v370), "w"(v377)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v370), "w"(v377)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v370), "w"(v384)); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v370, v377); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v370, v377); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v370, v384); svfloat32_t v481 = svmla_f32_x(pred_full, v437, v419, v958); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v488), "w"(v473)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v490), "w"(v480)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v492), "w"(v480)); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v488, v473); + svfloat32_t v491 = 
svsub_f32_x(svptrue_b32(), v490, v480); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v492, v480); svfloat32_t v572 = svcmla_f32_x(pred_full, v523, v968, v507, 90); svfloat32_t v580 = svmla_f32_x(pred_full, v579, v515, v974); svfloat32_t v582 = svmls_f32_x(pred_full, v581, v516, v975); @@ -17164,117 +12772,70 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v393 = svmla_f32_x(pred_full, v392, v331, v951); svfloat32_t v395 = svmls_f32_x(pred_full, v392, v331, v951); svfloat32_t v397 = svmls_f32_x(pred_full, v392, v332, v952); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v399), "w"(v384)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v401), "w"(v391)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v403), "w"(v391)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v399, v384); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v401, v391); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v403, v391); svfloat32_t v482 = svmla_f32_x(pred_full, v481, v420, v960); svfloat32_t v484 = svmls_f32_x(pred_full, v481, v420, v960); svfloat32_t v486 = svmls_f32_x(pred_full, v481, v421, v961); - svfloat32_t v573; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v573) : "w"(v572), "w"(v537)); - svfloat32_t v575; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v572), "w"(v537)); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v572), "w"(v544)); - svfloat32_t v592; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v591), "w"(v523)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v591), "w"(v523)); + svfloat32_t v573 = svadd_f32_x(svptrue_b32(), v572, v537); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v572, v537); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v572, v544); + svfloat32_t v592 = svadd_f32_x(svptrue_b32(), v591, v523); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v591, v523); svfloat32_t v394 = svmla_f32_x(pred_full, 
v393, v332, v952); svfloat32_t v396 = svmls_f32_x(pred_full, v395, v333, v953); svfloat32_t v398 = svmla_f32_x(pred_full, v397, v333, v953); svfloat32_t v483 = svmla_f32_x(pred_full, v482, v421, v961); svfloat32_t v485 = svmls_f32_x(pred_full, v484, v422, v962); svfloat32_t v487 = svmla_f32_x(pred_full, v486, v422, v962); - svfloat32_t v574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v573), "w"(v544)); - svfloat32_t v576; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v575), "w"(v551)); - svfloat32_t v578; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v577), "w"(v551)); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v573, v544); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v575, v551); + svfloat32_t v578 = svadd_f32_x(svptrue_b32(), v577, v551); svst1_f64(pred_full, (double *)(v992), svreinterpret_f64_f32(v593)); svst1_f64(pred_full, (double *)(v1001), svreinterpret_f64_f32(v592)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v394), "w"(v400)); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v394), "w"(v400)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v396), "w"(v402)); - svfloat32_t v408; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v396), "w"(v402)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v398), "w"(v404)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v398), "w"(v404)); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v483), "w"(v489)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v483), "w"(v489)); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v485), "w"(v491)); - svfloat32_t v497; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v485), "w"(v491)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v487), "w"(v493)); - svfloat32_t v499; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v487), "w"(v493)); - svfloat32_t v585; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v585) : "w"(v574), "w"(v580)); - svfloat32_t v586; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v574), "w"(v580)); - svfloat32_t v587; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v576), "w"(v582)); - svfloat32_t v588; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v576), "w"(v582)); - svfloat32_t v589; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v589) : "w"(v578), "w"(v584)); - svfloat32_t v590; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v578), "w"(v584)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v406), "w"(v495)); - svfloat32_t v639; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v408), "w"(v497)); - svfloat32_t v663; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v409), "w"(v498)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v410), "w"(v499)); - svfloat32_t v711; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v407), "w"(v496)); - svfloat32_t v735; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v735) : "w"(v405), "w"(v494)); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v394, v400); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v394, v400); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v396, v402); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v396, v402); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v483, v489); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v483, v489); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v485, v491); + svfloat32_t v497 = svsub_f32_x(svptrue_b32(), v485, v491); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v487, v493); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v487, v493); + svfloat32_t v585 = svadd_f32_x(svptrue_b32(), v574, v580); + svfloat32_t v586 = svsub_f32_x(svptrue_b32(), v574, v580); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v576, v582); + svfloat32_t v588 = svsub_f32_x(svptrue_b32(), v576, v582); 
+ svfloat32_t v589 = svadd_f32_x(svptrue_b32(), v578, v584); + svfloat32_t v590 = svsub_f32_x(svptrue_b32(), v578, v584); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v406, v495); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v408, v497); + svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v409, v498); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v410, v499); + svfloat32_t v711 = svadd_f32_x(svptrue_b32(), v407, v496); + svfloat32_t v735 = svadd_f32_x(svptrue_b32(), v405, v494); svst1_f64(pred_full, (double *)(v1010), svreinterpret_f64_f32(v406)); svst1_f64(pred_full, (double *)(v1037), svreinterpret_f64_f32(v408)); svst1_f64(pred_full, (double *)(v1064), svreinterpret_f64_f32(v409)); svst1_f64(pred_full, (double *)(v1091), svreinterpret_f64_f32(v410)); svst1_f64(pred_full, (double *)(v1118), svreinterpret_f64_f32(v407)); svst1_f64(pred_full, (double *)(v1145), svreinterpret_f64_f32(v405)); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v615), "w"(v586)); - svfloat32_t v617; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v615), "w"(v586)); - svfloat32_t v640; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v640) : "w"(v639), "w"(v588)); - svfloat32_t v641; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v641) : "w"(v639), "w"(v588)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v663), "w"(v589)); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v663), "w"(v589)); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v687), "w"(v590)); - svfloat32_t v689; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v687), "w"(v590)); - svfloat32_t v712; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v712) : "w"(v711), "w"(v587)); - svfloat32_t v713; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v711), "w"(v587)); - svfloat32_t v736; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v736) : "w"(v735), "w"(v585)); - svfloat32_t v737; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v737) : "w"(v735), "w"(v585)); + svfloat32_t v616 = 
svadd_f32_x(svptrue_b32(), v615, v586); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v615, v586); + svfloat32_t v640 = svadd_f32_x(svptrue_b32(), v639, v588); + svfloat32_t v641 = svsub_f32_x(svptrue_b32(), v639, v588); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v663, v589); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v663, v589); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v687, v590); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v687, v590); + svfloat32_t v712 = svadd_f32_x(svptrue_b32(), v711, v587); + svfloat32_t v713 = svsub_f32_x(svptrue_b32(), v711, v587); + svfloat32_t v736 = svadd_f32_x(svptrue_b32(), v735, v585); + svfloat32_t v737 = svsub_f32_x(svptrue_b32(), v735, v585); svst1_f64(pred_full, (double *)(v1019), svreinterpret_f64_f32(v617)); svst1_f64(pred_full, (double *)(v1028), svreinterpret_f64_f32(v616)); svst1_f64(pred_full, (double *)(v1046), svreinterpret_f64_f32(v641)); @@ -18425,46 +13986,25 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, const float32x2_t *v1048 = &v5[v0]; float32x2_t *v1209 = &v6[v2]; int64_t v19 = v0 * 11; - float32x2_t v30 = v7[10]; int64_t v34 = v0 * 2; int64_t v45 = v0 * 13; - float32x2_t v56 = v7[1]; - float32x2_t v60 = v7[12]; int64_t v64 = v0 * 4; int64_t v75 = v0 * 15; - float32x2_t v86 = v7[3]; - float32x2_t v90 = v7[14]; int64_t v94 = v0 * 6; int64_t v105 = v0 * 17; - float32x2_t v116 = v7[5]; - float32x2_t v120 = v7[16]; int64_t v124 = v0 * 8; int64_t v135 = v0 * 19; - float32x2_t v146 = v7[7]; - float32x2_t v150 = v7[18]; int64_t v154 = v0 * 10; int64_t v165 = v0 * 21; - float32x2_t v176 = v7[9]; - float32x2_t v180 = v7[20]; int64_t v184 = v0 * 12; - float32x2_t v206 = v7[11]; - float32x2_t v210 = v7[0]; int64_t v214 = v0 * 14; int64_t v225 = v0 * 3; - float32x2_t v236 = v7[13]; - float32x2_t v240 = v7[2]; int64_t v244 = v0 * 16; int64_t v255 = v0 * 5; - float32x2_t v266 = v7[15]; - float32x2_t v270 = v7[4]; int64_t v274 = v0 * 18; int64_t v285 = v0 * 7; - 
float32x2_t v296 = v7[17]; - float32x2_t v300 = v7[6]; int64_t v304 = v0 * 20; int64_t v315 = v0 * 9; - float32x2_t v326 = v7[19]; - float32x2_t v330 = v7[8]; float v622 = v4 * v619; float v674 = v4 * v671; float v681 = v4 * v678; @@ -18509,48 +14049,48 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float32x2_t *v1182 = &v6[0]; svfloat32_t v1399 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1048)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v57; - asm("mov %0.d, %d1" : "=w"(v57) : "w"(v56)); - svfloat32_t v61; - asm("mov %0.d, %d1" : "=w"(v61) : "w"(v60)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v91; - asm("mov %0.d, %d1" : "=w"(v91) : "w"(v90)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v121; - asm("mov %0.d, %d1" : "=w"(v121) : "w"(v120)); - svfloat32_t v147; - asm("mov %0.d, %d1" : "=w"(v147) : "w"(v146)); - svfloat32_t v151; - asm("mov %0.d, %d1" : "=w"(v151) : "w"(v150)); - svfloat32_t v177; - asm("mov %0.d, %d1" : "=w"(v177) : "w"(v176)); - svfloat32_t v181; - asm("mov %0.d, %d1" : "=w"(v181) : "w"(v180)); - svfloat32_t v207; - asm("mov %0.d, %d1" : "=w"(v207) : "w"(v206)); - svfloat32_t v211; - asm("mov %0.d, %d1" : "=w"(v211) : "w"(v210)); - svfloat32_t v237; - asm("mov %0.d, %d1" : "=w"(v237) : "w"(v236)); - svfloat32_t v241; - asm("mov %0.d, %d1" : "=w"(v241) : "w"(v240)); - svfloat32_t v267; - asm("mov %0.d, %d1" : "=w"(v267) : "w"(v266)); - svfloat32_t v271; - asm("mov %0.d, %d1" : "=w"(v271) : "w"(v270)); - svfloat32_t v297; - asm("mov %0.d, %d1" : "=w"(v297) : "w"(v296)); - svfloat32_t v301; - asm("mov %0.d, %d1" : "=w"(v301) : "w"(v300)); - svfloat32_t v327; - asm("mov %0.d, %d1" : "=w"(v327) : "w"(v326)); - svfloat32_t v331; - asm("mov %0.d, %d1" : "=w"(v331) : "w"(v330)); + svfloat32_t v31 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v57 = 
+ svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v61 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v91 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v121 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v147 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v151 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[18])); + svfloat32_t v177 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v181 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[20])); + svfloat32_t v207 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v211 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v237 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v241 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v267 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v271 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v297 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[17])); + svfloat32_t v301 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v327 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[19])); + svfloat32_t v331 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); const float32x2_t *v940 = &v5[v19]; const float32x2_t *v949 = &v5[v34]; const float32x2_t *v958 = &v5[v45]; @@ -18603,8 +14143,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, float32x2_t *v1371 = &v6[v928]; svfloat32_t v1417 = svreinterpret_f32_f64( 
svld1_f64(pred_full, &((const double *)v1130)[0])); - svfloat32_t zero212; - asm volatile("mov %0.s, #0" : "=w"(zero212)); + svfloat32_t zero212 = svdup_n_f32(0); svfloat32_t v212 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero212, v1399, v211, 0), v1399, v211, 90); @@ -18648,340 +14187,208 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v1111)[0])); svfloat32_t v1415 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1120)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); + svfloat32_t zero32 = svdup_n_f32(0); svfloat32_t v32 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v1375, v31, 0), v1375, v31, 90); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); + svfloat32_t zero58 = svdup_n_f32(0); svfloat32_t v58 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero58, v1377, v57, 0), v1377, v57, 90); - svfloat32_t zero62; - asm volatile("mov %0.s, #0" : "=w"(zero62)); + svfloat32_t zero62 = svdup_n_f32(0); svfloat32_t v62 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero62, v1379, v61, 0), v1379, v61, 90); - svfloat32_t zero88; - asm volatile("mov %0.s, #0" : "=w"(zero88)); + svfloat32_t zero88 = svdup_n_f32(0); svfloat32_t v88 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero88, v1381, v87, 0), v1381, v87, 90); - svfloat32_t zero92; - asm volatile("mov %0.s, #0" : "=w"(zero92)); + svfloat32_t zero92 = svdup_n_f32(0); svfloat32_t v92 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero92, v1383, v91, 0), v1383, v91, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero118, v1385, v117, 0), v1385, v117, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x( pred_full, 
svcmla_f32_x(pred_full, zero122, v1387, v121, 0), v1387, v121, 90); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero148, v1389, v147, 0), v1389, v147, 90); - svfloat32_t zero152; - asm volatile("mov %0.s, #0" : "=w"(zero152)); + svfloat32_t zero152 = svdup_n_f32(0); svfloat32_t v152 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero152, v1391, v151, 0), v1391, v151, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero178, v1393, v177, 0), v1393, v177, 90); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); + svfloat32_t zero182 = svdup_n_f32(0); svfloat32_t v182 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero182, v1395, v181, 0), v1395, v181, 90); - svfloat32_t zero208; - asm volatile("mov %0.s, #0" : "=w"(zero208)); + svfloat32_t zero208 = svdup_n_f32(0); svfloat32_t v208 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero208, v1397, v207, 0), v1397, v207, 90); - svfloat32_t zero238; - asm volatile("mov %0.s, #0" : "=w"(zero238)); + svfloat32_t zero238 = svdup_n_f32(0); svfloat32_t v238 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero238, v1401, v237, 0), v1401, v237, 90); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero242, v1403, v241, 0), v1403, v241, 90); - svfloat32_t zero268; - asm volatile("mov %0.s, #0" : "=w"(zero268)); + svfloat32_t zero268 = svdup_n_f32(0); svfloat32_t v268 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero268, v1405, v267, 0), v1405, v267, 90); - svfloat32_t zero272; - asm volatile("mov %0.s, #0" : "=w"(zero272)); + svfloat32_t zero272 = svdup_n_f32(0); svfloat32_t v272 = svcmla_f32_x( 
pred_full, svcmla_f32_x(pred_full, zero272, v1407, v271, 0), v1407, v271, 90); - svfloat32_t zero298; - asm volatile("mov %0.s, #0" : "=w"(zero298)); + svfloat32_t zero298 = svdup_n_f32(0); svfloat32_t v298 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero298, v1409, v297, 0), v1409, v297, 90); - svfloat32_t zero302; - asm volatile("mov %0.s, #0" : "=w"(zero302)); + svfloat32_t zero302 = svdup_n_f32(0); svfloat32_t v302 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero302, v1411, v301, 0), v1411, v301, 90); - svfloat32_t zero328; - asm volatile("mov %0.s, #0" : "=w"(zero328)); + svfloat32_t zero328 = svdup_n_f32(0); svfloat32_t v328 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero328, v1413, v327, 0), v1413, v327, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero332, v1415, v331, 0), v1415, v331, 90); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v1417), "w"(v32)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v1417), "w"(v32)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v58), "w"(v62)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v58), "w"(v62)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v88), "w"(v92)); - svfloat32_t v345; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v88), "w"(v92)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v118), "w"(v122)); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v118), "w"(v122)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v148), "w"(v152)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v148), "w"(v152)); - svfloat32_t v350; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v178), "w"(v182)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : 
"w"(v178), "w"(v182)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v208), "w"(v212)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v208), "w"(v212)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v238), "w"(v242)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v238), "w"(v242)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v268), "w"(v272)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v268), "w"(v272)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v298), "w"(v302)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v298), "w"(v302)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v328), "w"(v332)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v328), "w"(v332)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v342), "w"(v360)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v344), "w"(v358)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v346), "w"(v356)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v348), "w"(v354)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v350), "w"(v352)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v342), "w"(v360)); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v344), "w"(v358)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v346), "w"(v356)); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v348), "w"(v354)); - svfloat32_t v371; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v350), "w"(v352)); - svfloat32_t v571; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v343), "w"(v361)); - svfloat32_t v572; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v572) : "w"(v345), "w"(v359)); - svfloat32_t v573; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v573) : "w"(v347), "w"(v357)); - svfloat32_t v574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v349), "w"(v355)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v351), "w"(v353)); - svfloat32_t v576; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v343), "w"(v361)); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v345), "w"(v359)); - svfloat32_t v578; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v347), "w"(v357)); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v349), "w"(v355)); - svfloat32_t v580; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v351), "w"(v353)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v362), "w"(v363)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v364), "w"(v366)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v368), "w"(v369)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v367), "w"(v371)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v363), "w"(v365)); - svfloat32_t v382; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v362), "w"(v365)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v363), "w"(v362)); - svfloat32_t v384; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v366), "w"(v365)); - svfloat32_t v385; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v364), "w"(v365)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v366), "w"(v364)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v363), "w"(v366)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v362), "w"(v364)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v368), "w"(v370)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v367), "w"(v370)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v367), "w"(v368)); - svfloat32_t 
v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v370), "w"(v371)); - svfloat32_t v394; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v369), "w"(v370)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v369), "w"(v371)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v368), "w"(v371)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v367), "w"(v369)); - svfloat32_t v581; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v581) : "w"(v571), "w"(v572)); - svfloat32_t v582; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v582) : "w"(v573), "w"(v575)); - svfloat32_t v584; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v584) : "w"(v577), "w"(v578)); - svfloat32_t v585; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v576), "w"(v580)); - svfloat32_t v590; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v572), "w"(v574)); - svfloat32_t v591; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v571), "w"(v574)); - svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v572), "w"(v571)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v575), "w"(v574)); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v573), "w"(v574)); - svfloat32_t v595; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v575), "w"(v573)); - svfloat32_t v596; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v572), "w"(v575)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v571), "w"(v573)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v577), "w"(v579)); - svfloat32_t v600; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v576), "w"(v579)); - svfloat32_t v601; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v576), "w"(v577)); - svfloat32_t v602; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v579), "w"(v580)); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v578), "w"(v579)); - svfloat32_t v604; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v578), 
"w"(v580)); - svfloat32_t v605; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v577), "w"(v580)); - svfloat32_t v606; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v576), "w"(v578)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v365), "w"(v372)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v375), "w"(v376)); - svfloat32_t v389; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v373), "w"(v372)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v375), "w"(v376)); - svfloat32_t v425; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v382), "w"(v1158)); - svfloat32_t v430; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v383), "w"(v1159)); - svfloat32_t v440; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v385), "w"(v1161)); - svfloat32_t v445; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v386), "w"(v1162)); - svfloat32_t zero467; - asm volatile("mov %0.s, #0" : "=w"(zero467)); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v1417, v32); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v1417, v32); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v58, v62); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v88, v92); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v118, v122); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v148, v152); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v208, v212); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v238, v242); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v268, v272); + 
svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v268, v272); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v298, v302); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v298, v302); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v328, v332); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v328, v332); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v342, v360); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v344, v358); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v346, v356); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v348, v354); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v350, v352); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v342, v360); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v344, v358); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v346, v356); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v348, v354); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v350, v352); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v343, v361); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v345, v359); + svfloat32_t v573 = svadd_f32_x(svptrue_b32(), v347, v357); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v343, v361); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v345, v359); + svfloat32_t v578 = svsub_f32_x(svptrue_b32(), v347, v357); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v580 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v362, v363); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v368, v369); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v367, v371); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v363, v365); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v362, v365); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v363, v362); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), 
v366, v365); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v364, v365); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v366, v364); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v363, v366); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v362, v364); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v368, v370); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v367, v370); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v367, v368); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v370, v371); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v369, v370); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v369, v371); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v368, v371); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v581 = svadd_f32_x(svptrue_b32(), v571, v572); + svfloat32_t v582 = svadd_f32_x(svptrue_b32(), v573, v575); + svfloat32_t v584 = svsub_f32_x(svptrue_b32(), v577, v578); + svfloat32_t v585 = svadd_f32_x(svptrue_b32(), v576, v580); + svfloat32_t v590 = svsub_f32_x(svptrue_b32(), v572, v574); + svfloat32_t v591 = svsub_f32_x(svptrue_b32(), v571, v574); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v572, v571); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v575, v574); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v573, v574); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v575, v573); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v572, v575); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v571, v573); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v577, v579); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v576, v579); + svfloat32_t v601 = svadd_f32_x(svptrue_b32(), v576, v577); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v579, v580); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v578, v579); + svfloat32_t v604 = svsub_f32_x(svptrue_b32(), v578, v580); + svfloat32_t v605 = svadd_f32_x(svptrue_b32(), v577, v580); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v576, v578); + svfloat32_t v374 = 
svadd_f32_x(svptrue_b32(), v365, v372); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v375, v376); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v373, v372); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v375, v376); + svfloat32_t v425 = svmul_f32_x(svptrue_b32(), v382, v1158); + svfloat32_t v430 = svmul_f32_x(svptrue_b32(), v383, v1159); + svfloat32_t v440 = svmul_f32_x(svptrue_b32(), v385, v1161); + svfloat32_t v445 = svmul_f32_x(svptrue_b32(), v386, v1162); + svfloat32_t zero467 = svdup_n_f32(0); svfloat32_t v467 = svcmla_f32_x(pred_full, zero467, v1166, v390, 90); - svfloat32_t zero481; - asm volatile("mov %0.s, #0" : "=w"(zero481)); + svfloat32_t zero481 = svdup_n_f32(0); svfloat32_t v481 = svcmla_f32_x(pred_full, zero481, v1168, v392, 90); - svfloat32_t zero488; - asm volatile("mov %0.s, #0" : "=w"(zero488)); + svfloat32_t zero488 = svdup_n_f32(0); svfloat32_t v488 = svcmla_f32_x(pred_full, zero488, v1169, v393, 90); - svfloat32_t zero502; - asm volatile("mov %0.s, #0" : "=w"(zero502)); + svfloat32_t zero502 = svdup_n_f32(0); svfloat32_t v502 = svcmla_f32_x(pred_full, zero502, v1171, v395, 90); - svfloat32_t zero509; - asm volatile("mov %0.s, #0" : "=w"(zero509)); + svfloat32_t zero509 = svdup_n_f32(0); svfloat32_t v509 = svcmla_f32_x(pred_full, zero509, v1172, v396, 90); - svfloat32_t v583; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v574), "w"(v581)); - svfloat32_t v588; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v584), "w"(v585)); - svfloat32_t v598; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v582), "w"(v581)); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v584), "w"(v585)); - svfloat32_t v634; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v591), "w"(v1158)); - svfloat32_t v639; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v592), "w"(v1159)); - svfloat32_t v649; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v649) : "w"(v594), "w"(v1161)); - svfloat32_t v654; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v654) : "w"(v595), 
"w"(v1162)); - svfloat32_t zero676; - asm volatile("mov %0.s, #0" : "=w"(zero676)); + svfloat32_t v583 = svadd_f32_x(svptrue_b32(), v574, v581); + svfloat32_t v588 = svsub_f32_x(svptrue_b32(), v584, v585); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v582, v581); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v584, v585); + svfloat32_t v634 = svmul_f32_x(svptrue_b32(), v591, v1158); + svfloat32_t v639 = svmul_f32_x(svptrue_b32(), v592, v1159); + svfloat32_t v649 = svmul_f32_x(svptrue_b32(), v594, v1161); + svfloat32_t v654 = svmul_f32_x(svptrue_b32(), v595, v1162); + svfloat32_t zero676 = svdup_n_f32(0); svfloat32_t v676 = svcmla_f32_x(pred_full, zero676, v1166, v599, 90); - svfloat32_t zero690; - asm volatile("mov %0.s, #0" : "=w"(zero690)); + svfloat32_t zero690 = svdup_n_f32(0); svfloat32_t v690 = svcmla_f32_x(pred_full, zero690, v1168, v601, 90); - svfloat32_t zero697; - asm volatile("mov %0.s, #0" : "=w"(zero697)); + svfloat32_t zero697 = svdup_n_f32(0); svfloat32_t v697 = svcmla_f32_x(pred_full, zero697, v1169, v602, 90); - svfloat32_t zero711; - asm volatile("mov %0.s, #0" : "=w"(zero711)); + svfloat32_t zero711 = svdup_n_f32(0); svfloat32_t v711 = svcmla_f32_x(pred_full, zero711, v1171, v604, 90); - svfloat32_t zero718; - asm volatile("mov %0.s, #0" : "=w"(zero718)); + svfloat32_t zero718 = svdup_n_f32(0); svfloat32_t v718 = svcmla_f32_x(pred_full, zero718, v1172, v605, 90); - svfloat32_t v377; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v374), "w"(v373)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v379), "w"(v370)); - svfloat32_t v460; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v389), "w"(v1165)); - svfloat32_t zero523; - asm volatile("mov %0.s, #0" : "=w"(zero523)); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v374, v373); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v379, v370); + svfloat32_t v460 = svmul_f32_x(svptrue_b32(), v389, v1165); + svfloat32_t zero523 = svdup_n_f32(0); svfloat32_t v523 = 
svcmla_f32_x(pred_full, zero523, v1174, v398, 90); svfloat32_t v525 = svmla_f32_x(pred_full, v425, v381, v1157); svfloat32_t v526 = svmla_f32_x(pred_full, v430, v382, v1158); @@ -18990,19 +14397,13 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v529 = svmla_f32_x(pred_full, v445, v385, v1161); svfloat32_t v530 = svnmls_f32_x(pred_full, v445, v384, v1160); svfloat32_t v533 = svcmla_f32_x(pred_full, v481, v1167, v391, 90); - svfloat32_t v534; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v534) : "w"(v467), "w"(v481)); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v467, v481); svfloat32_t v535 = svcmla_f32_x(pred_full, v502, v1170, v394, 90); - svfloat32_t v536; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v488), "w"(v502)); - svfloat32_t v586; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v583), "w"(v582)); - svfloat32_t v589; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v589) : "w"(v588), "w"(v579)); - svfloat32_t v669; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v598), "w"(v1165)); - svfloat32_t zero732; - asm volatile("mov %0.s, #0" : "=w"(zero732)); + svfloat32_t v536 = svsub_f32_x(svptrue_b32(), v488, v502); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v583, v582); + svfloat32_t v589 = svsub_f32_x(svptrue_b32(), v588, v579); + svfloat32_t v669 = svmul_f32_x(svptrue_b32(), v598, v1165); + svfloat32_t zero732 = svdup_n_f32(0); svfloat32_t v732 = svcmla_f32_x(pred_full, zero732, v1174, v607, 90); svfloat32_t v734 = svmla_f32_x(pred_full, v634, v590, v1157); svfloat32_t v735 = svmla_f32_x(pred_full, v639, v591, v1158); @@ -19011,163 +14412,91 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v738 = svmla_f32_x(pred_full, v654, v594, v1161); svfloat32_t v739 = svnmls_f32_x(pred_full, v654, v593, v1160); svfloat32_t v742 = svcmla_f32_x(pred_full, v690, v1167, v600, 90); - svfloat32_t v743; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v743) : "w"(v676), "w"(v690)); + svfloat32_t v743 = 
svsub_f32_x(svptrue_b32(), v676, v690); svfloat32_t v744 = svcmla_f32_x(pred_full, v711, v1170, v603, 90); - svfloat32_t v745; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v745) : "w"(v697), "w"(v711)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v340), "w"(v377)); - svfloat32_t zero415; - asm volatile("mov %0.s, #0" : "=w"(zero415)); + svfloat32_t v745 = svsub_f32_x(svptrue_b32(), v697, v711); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v340, v377); + svfloat32_t zero415 = svdup_n_f32(0); svfloat32_t v415 = svcmla_f32_x(pred_full, zero415, v1156, v380, 90); svfloat32_t v531 = svmla_f32_x(pred_full, v460, v388, v1164); svfloat32_t v532 = svmla_f32_x(pred_full, v460, v387, v1163); svfloat32_t v537 = svcmla_f32_x(pred_full, v523, v1173, v397, 90); - svfloat32_t v538; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v509), "w"(v523)); - svfloat32_t v557; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v533), "w"(v534)); - svfloat32_t v587; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v341), "w"(v586)); - svfloat32_t zero624; - asm volatile("mov %0.s, #0" : "=w"(zero624)); + svfloat32_t v538 = svsub_f32_x(svptrue_b32(), v509, v523); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v533, v534); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v341, v586); + svfloat32_t zero624 = svdup_n_f32(0); svfloat32_t v624 = svcmla_f32_x(pred_full, zero624, v1156, v589, 90); svfloat32_t v740 = svmla_f32_x(pred_full, v669, v597, v1164); svfloat32_t v741 = svmla_f32_x(pred_full, v669, v596, v1163); svfloat32_t v746 = svcmla_f32_x(pred_full, v732, v1173, v606, 90); - svfloat32_t v747; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v718), "w"(v732)); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v742), "w"(v743)); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v718, v732); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v742, v743); svfloat32_t v524 = svmls_f32_x(pred_full, v378, v377, v1155); - svfloat32_t v539; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v539) : "w"(v529), "w"(v531)); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v415), "w"(v535)); - svfloat32_t v551; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v537), "w"(v533)); - svfloat32_t v553; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v415), "w"(v538)); - svfloat32_t v555; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v538), "w"(v534)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v557), "w"(v535)); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v529, v531); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v415, v535); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v537, v533); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v415, v538); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v538, v534); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v557, v535); svfloat32_t v733 = svmls_f32_x(pred_full, v587, v586, v1155); - svfloat32_t v748; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v748) : "w"(v738), "w"(v740)); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v624), "w"(v744)); - svfloat32_t v760; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v746), "w"(v742)); - svfloat32_t v762; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v762) : "w"(v624), "w"(v747)); - svfloat32_t v764; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v764) : "w"(v747), "w"(v743)); - svfloat32_t v767; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v766), "w"(v744)); + svfloat32_t v748 = svadd_f32_x(svptrue_b32(), v738, v740); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v624, v744); + svfloat32_t v760 = svsub_f32_x(svptrue_b32(), v746, v742); + svfloat32_t v762 = svadd_f32_x(svptrue_b32(), v624, v747); + svfloat32_t v764 = svsub_f32_x(svptrue_b32(), v747, v743); + svfloat32_t v767 = svadd_f32_x(svptrue_b32(), v766, v744); svst1_f64(pred_full, (double *)(v1182), svreinterpret_f64_f32(v378)); svst1_f64(pred_full, (double *)(v1191), svreinterpret_f64_f32(v587)); - svfloat32_t v540; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v540) : "w"(v539), "w"(v524)); - svfloat32_t v541; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v541) : "w"(v524), "w"(v526)); - svfloat32_t v543; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v543) : "w"(v524), "w"(v530)); - svfloat32_t v545; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v524), "w"(v527)); - svfloat32_t v547; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v524), "w"(v525)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v549), "w"(v537)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v551), "w"(v415)); - svfloat32_t v554; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v553), "w"(v536)); - svfloat32_t v556; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v555), "w"(v415)); - svfloat32_t v559; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v558), "w"(v536)); - svfloat32_t v749; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v749) : "w"(v748), "w"(v733)); - svfloat32_t v750; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v750) : "w"(v733), "w"(v735)); - svfloat32_t v752; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v752) : "w"(v733), "w"(v739)); - svfloat32_t v754; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v754) : "w"(v733), "w"(v736)); - svfloat32_t v756; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v756) : "w"(v733), "w"(v734)); - svfloat32_t v759; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v758), "w"(v746)); - svfloat32_t v761; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v761) : "w"(v760), "w"(v624)); - svfloat32_t v763; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v762), "w"(v745)); - svfloat32_t v765; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v764), "w"(v624)); - svfloat32_t v768; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v767), "w"(v745)); - svfloat32_t v542; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v542) : "w"(v541), "w"(v531)); - svfloat32_t v544; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v544) : "w"(v543), "w"(v532)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v545), "w"(v532)); - svfloat32_t v548; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v547), "w"(v528)); - svfloat32_t v560; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v559), "w"(v415)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v540), "w"(v550)); - svfloat32_t v569; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v540), "w"(v550)); - svfloat32_t v751; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v751) : "w"(v750), "w"(v740)); - svfloat32_t v753; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v753) : "w"(v752), "w"(v741)); - svfloat32_t v755; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v754), "w"(v741)); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v756), "w"(v737)); - svfloat32_t v769; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v769) : "w"(v768), "w"(v624)); - svfloat32_t v771; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v749), "w"(v759)); - svfloat32_t v778; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v778) : "w"(v749), "w"(v759)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v548), "w"(v560)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v542), "w"(v552)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v544), "w"(v554)); - svfloat32_t v565; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v546), "w"(v556)); - svfloat32_t v566; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v546), "w"(v556)); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v544), "w"(v554)); - svfloat32_t v568; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v542), "w"(v552)); - svfloat32_t v570; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v548), "w"(v560)); - svfloat32_t v770; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v770) : "w"(v757), "w"(v769)); - svfloat32_t v772; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v772) : "w"(v751), "w"(v761)); - svfloat32_t v773; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v773) : "w"(v753), "w"(v763)); - svfloat32_t v774; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v774) : "w"(v755), 
"w"(v765)); - svfloat32_t v775; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v755), "w"(v765)); - svfloat32_t v776; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v753), "w"(v763)); - svfloat32_t v777; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v777) : "w"(v751), "w"(v761)); - svfloat32_t v779; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v779) : "w"(v757), "w"(v769)); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v539, v524); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v524, v526); + svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v524, v530); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v524, v527); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v524, v525); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v549, v537); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v551, v415); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v553, v536); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v555, v415); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v558, v536); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v748, v733); + svfloat32_t v750 = svsub_f32_x(svptrue_b32(), v733, v735); + svfloat32_t v752 = svadd_f32_x(svptrue_b32(), v733, v739); + svfloat32_t v754 = svsub_f32_x(svptrue_b32(), v733, v736); + svfloat32_t v756 = svadd_f32_x(svptrue_b32(), v733, v734); + svfloat32_t v759 = svadd_f32_x(svptrue_b32(), v758, v746); + svfloat32_t v761 = svsub_f32_x(svptrue_b32(), v760, v624); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v762, v745); + svfloat32_t v765 = svsub_f32_x(svptrue_b32(), v764, v624); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v767, v745); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v541, v531); + svfloat32_t v544 = svadd_f32_x(svptrue_b32(), v543, v532); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v545, v532); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v547, v528); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v559, v415); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v540, v550); + svfloat32_t v569 = svsub_f32_x(svptrue_b32(), 
v540, v550); + svfloat32_t v751 = svsub_f32_x(svptrue_b32(), v750, v740); + svfloat32_t v753 = svadd_f32_x(svptrue_b32(), v752, v741); + svfloat32_t v755 = svsub_f32_x(svptrue_b32(), v754, v741); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v756, v737); + svfloat32_t v769 = svsub_f32_x(svptrue_b32(), v768, v624); + svfloat32_t v771 = svadd_f32_x(svptrue_b32(), v749, v759); + svfloat32_t v778 = svsub_f32_x(svptrue_b32(), v749, v759); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v542, v552); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v544, v554); + svfloat32_t v565 = svadd_f32_x(svptrue_b32(), v546, v556); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v546, v556); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v544, v554); + svfloat32_t v568 = svsub_f32_x(svptrue_b32(), v542, v552); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v770 = svadd_f32_x(svptrue_b32(), v757, v769); + svfloat32_t v772 = svadd_f32_x(svptrue_b32(), v751, v761); + svfloat32_t v773 = svsub_f32_x(svptrue_b32(), v753, v763); + svfloat32_t v774 = svadd_f32_x(svptrue_b32(), v755, v765); + svfloat32_t v775 = svsub_f32_x(svptrue_b32(), v755, v765); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v753, v763); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v751, v761); + svfloat32_t v779 = svsub_f32_x(svptrue_b32(), v757, v769); svst1_f64(pred_full, (double *)(v1218), svreinterpret_f64_f32(v569)); svst1_f64(pred_full, (double *)(v1227), svreinterpret_f64_f32(v778)); svst1_f64(pred_full, (double *)(v1344), svreinterpret_f64_f32(v562)); @@ -20099,49 +15428,26 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x2_t *v1062 = &v6[v2]; int64_t v19 = v0 * 8; int64_t v30 = v0 * 16; - float32x2_t v41 = v7[7]; - float32x2_t v45 = v7[15]; int64_t v49 = v0 * 11; int64_t v60 = v0 * 19; - float32x2_t v71 = v7[10]; - float32x2_t v75 = v7[18]; int64_t v79 = v0 * 3; - float32x2_t 
v86 = v7[2]; int64_t v90 = v0 * 14; int64_t v101 = v0 * 22; - float32x2_t v112 = v7[13]; - float32x2_t v116 = v7[21]; int64_t v120 = v0 * 6; - float32x2_t v127 = v7[5]; int64_t v131 = v0 * 17; - float32x2_t v153 = v7[16]; - float32x2_t v157 = v7[0]; int64_t v161 = v0 * 9; - float32x2_t v168 = v7[8]; int64_t v172 = v0 * 20; int64_t v183 = v0 * 4; - float32x2_t v194 = v7[19]; - float32x2_t v198 = v7[3]; int64_t v202 = v0 * 12; - float32x2_t v209 = v7[11]; int64_t v213 = v0 * 23; int64_t v224 = v0 * 7; - float32x2_t v235 = v7[22]; - float32x2_t v239 = v7[6]; int64_t v243 = v0 * 15; - float32x2_t v250 = v7[14]; int64_t v254 = v0 * 2; int64_t v265 = v0 * 10; - float32x2_t v276 = v7[1]; - float32x2_t v280 = v7[9]; int64_t v284 = v0 * 18; - float32x2_t v291 = v7[17]; int64_t v295 = v0 * 5; int64_t v306 = v0 * 13; - float32x2_t v317 = v7[4]; - float32x2_t v321 = v7[12]; int64_t v325 = v0 * 21; - float32x2_t v332 = v7[20]; float v413 = v4 * v410; float v420 = v4 * v417; float v485 = v4 * v482; @@ -20179,52 +15485,52 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x2_t *v1026 = &v6[0]; svfloat32_t v1255 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v865)[0])); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v46; - asm("mov %0.d, %d1" : "=w"(v46) : "w"(v45)); - svfloat32_t v72; - asm("mov %0.d, %d1" : "=w"(v72) : "w"(v71)); - svfloat32_t v76; - asm("mov %0.d, %d1" : "=w"(v76) : "w"(v75)); - svfloat32_t v87; - asm("mov %0.d, %d1" : "=w"(v87) : "w"(v86)); - svfloat32_t v113; - asm("mov %0.d, %d1" : "=w"(v113) : "w"(v112)); - svfloat32_t v117; - asm("mov %0.d, %d1" : "=w"(v117) : "w"(v116)); - svfloat32_t v128; - asm("mov %0.d, %d1" : "=w"(v128) : "w"(v127)); - svfloat32_t v154; - asm("mov %0.d, %d1" : "=w"(v154) : "w"(v153)); - svfloat32_t v158; - asm("mov %0.d, %d1" : "=w"(v158) : "w"(v157)); - svfloat32_t v169; - asm("mov %0.d, %d1" : "=w"(v169) : "w"(v168)); - svfloat32_t v195; - 
asm("mov %0.d, %d1" : "=w"(v195) : "w"(v194)); - svfloat32_t v199; - asm("mov %0.d, %d1" : "=w"(v199) : "w"(v198)); - svfloat32_t v210; - asm("mov %0.d, %d1" : "=w"(v210) : "w"(v209)); - svfloat32_t v236; - asm("mov %0.d, %d1" : "=w"(v236) : "w"(v235)); - svfloat32_t v240; - asm("mov %0.d, %d1" : "=w"(v240) : "w"(v239)); - svfloat32_t v251; - asm("mov %0.d, %d1" : "=w"(v251) : "w"(v250)); - svfloat32_t v277; - asm("mov %0.d, %d1" : "=w"(v277) : "w"(v276)); - svfloat32_t v281; - asm("mov %0.d, %d1" : "=w"(v281) : "w"(v280)); - svfloat32_t v292; - asm("mov %0.d, %d1" : "=w"(v292) : "w"(v291)); - svfloat32_t v318; - asm("mov %0.d, %d1" : "=w"(v318) : "w"(v317)); - svfloat32_t v322; - asm("mov %0.d, %d1" : "=w"(v322) : "w"(v321)); - svfloat32_t v333; - asm("mov %0.d, %d1" : "=w"(v333) : "w"(v332)); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v46 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v72 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v76 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[18])); + svfloat32_t v87 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v113 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v117 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[21])); + svfloat32_t v128 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v154 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v158 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v169 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v195 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[19])); + svfloat32_t v199 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v210 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t 
*)v7)[11])); + svfloat32_t v236 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[22])); + svfloat32_t v240 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v251 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v277 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v281 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v292 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[17])); + svfloat32_t v318 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v322 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v333 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[20])); const float32x2_t *v784 = &v5[v19]; const float32x2_t *v793 = &v5[v30]; const float32x2_t *v802 = &v5[v49]; @@ -20277,8 +15583,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, float32x2_t *v1233 = &v6[v772]; svfloat32_t v1283 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v992)[0])); - svfloat32_t zero159; - asm volatile("mov %0.s, #0" : "=w"(zero159)); + svfloat32_t zero159 = svdup_n_f32(0); svfloat32_t v159 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero159, v1255, v158, 0), v1255, v158, 90); @@ -20326,115 +15631,83 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v973)[0])); svfloat32_t v1281 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v982)[0])); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); + svfloat32_t zero43 = svdup_n_f32(0); svfloat32_t v43 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v1237, v42, 0), v1237, v42, 90); - svfloat32_t zero47; - asm volatile("mov %0.s, #0" : "=w"(zero47)); + svfloat32_t zero47 = svdup_n_f32(0); svfloat32_t v47 = svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero47, v1239, v46, 0), v1239, v46, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1241, v72, 0), v1241, v72, 90); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); + svfloat32_t zero77 = svdup_n_f32(0); svfloat32_t v77 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero77, v1243, v76, 0), v1243, v76, 90); - svfloat32_t zero114; - asm volatile("mov %0.s, #0" : "=w"(zero114)); + svfloat32_t zero114 = svdup_n_f32(0); svfloat32_t v114 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero114, v1247, v113, 0), v1247, v113, 90); - svfloat32_t zero118; - asm volatile("mov %0.s, #0" : "=w"(zero118)); + svfloat32_t zero118 = svdup_n_f32(0); svfloat32_t v118 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero118, v1249, v117, 0), v1249, v117, 90); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero155, v1253, v154, 0), v1253, v154, 90); - svfloat32_t zero196; - asm volatile("mov %0.s, #0" : "=w"(zero196)); + svfloat32_t zero196 = svdup_n_f32(0); svfloat32_t v196 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero196, v1259, v195, 0), v1259, v195, 90); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero200, v1261, v199, 0), v1261, v199, 90); - svfloat32_t zero237; - asm volatile("mov %0.s, #0" : "=w"(zero237)); + svfloat32_t zero237 = svdup_n_f32(0); svfloat32_t v237 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero237, v1265, v236, 0), v1265, v236, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, 
svcmla_f32_x(pred_full, zero241, v1267, v240, 0), v1267, v240, 90); - svfloat32_t zero278; - asm volatile("mov %0.s, #0" : "=w"(zero278)); + svfloat32_t zero278 = svdup_n_f32(0); svfloat32_t v278 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero278, v1271, v277, 0), v1271, v277, 90); - svfloat32_t zero282; - asm volatile("mov %0.s, #0" : "=w"(zero282)); + svfloat32_t zero282 = svdup_n_f32(0); svfloat32_t v282 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero282, v1273, v281, 0), v1273, v281, 90); - svfloat32_t zero319; - asm volatile("mov %0.s, #0" : "=w"(zero319)); + svfloat32_t zero319 = svdup_n_f32(0); svfloat32_t v319 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero319, v1277, v318, 0), v1277, v318, 90); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero323, v1279, v322, 0), v1279, v322, 90); - svfloat32_t v335; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v43), "w"(v47)); - svfloat32_t v336; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v43), "w"(v47)); - svfloat32_t v345; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v73), "w"(v77)); - svfloat32_t v346; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v73), "w"(v77)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v114), "w"(v118)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v114), "w"(v118)); - svfloat32_t v351; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v155), "w"(v159)); - svfloat32_t v352; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v155), "w"(v159)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v196), "w"(v200)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v196), "w"(v200)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v237), "w"(v241)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v237), 
"w"(v241)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v278), "w"(v282)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v278), "w"(v282)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v319), "w"(v323)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v319), "w"(v323)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v335), "w"(v1283)); + svfloat32_t v335 = svadd_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v43, v47); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v346 = svsub_f32_x(svptrue_b32(), v73, v77); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v114, v118); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v114, v118); + svfloat32_t v351 = svadd_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v352 = svsub_f32_x(svptrue_b32(), v155, v159); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v196, v200); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v196, v200); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v237, v241); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v237, v241); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v278, v282); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v278, v282); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v319, v323); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v319, v323); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v335, v1283); svfloat32_t v347 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v345, v1245, v87, 0), v1245, v87, 90); @@ -20456,223 +15729,128 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, svfloat32_t v365 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v363, v1281, v333, 0), v1281, v333, 90); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v335), "w"(v354)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v335), "w"(v354)); - svfloat32_t v440; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v348), "w"(v360)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v348), "w"(v360)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v345), "w"(v357)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v345), "w"(v357)); - svfloat32_t v444; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v351), "w"(v363)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v351), "w"(v363)); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v336), "w"(v355)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v336), "w"(v355)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v349), "w"(v361)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v349), "w"(v361)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v346), "w"(v358)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v346), "w"(v358)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v352), "w"(v364)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v352), "w"(v364)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v344), "w"(v356)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v344), "w"(v356)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v350), "w"(v362)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v350), "w"(v362)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v347), "w"(v359)); - svfloat32_t v371; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v347), "w"(v359)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v353), "w"(v365)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v353), "w"(v365)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v438), 
"w"(v440)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v438), "w"(v440)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v442), "w"(v444)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v442), "w"(v444)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v443), "w"(v445)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v443), "w"(v445)); - svfloat32_t zero487; - asm volatile("mov %0.s, #0" : "=w"(zero487)); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v335, v354); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v335, v354); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v348, v360); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v348, v360); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v345, v357); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v345, v357); + svfloat32_t v444 = svadd_f32_x(svptrue_b32(), v351, v363); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v351, v363); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v336, v355); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v336, v355); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v349, v361); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v349, v361); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v346, v358); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v346, v358); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v352, v364); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v352, v364); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v344, v356); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v344, v356); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v350, v362); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v350, v362); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v347, v359); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v347, v359); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v353, v365); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v353, v365); + 
svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v438, v440); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v443, v445); + svfloat32_t zero487 = svdup_n_f32(0); svfloat32_t v487 = svcmla_f32_x(pred_full, zero487, v1008, v441, 90); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v510), "w"(v512)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v510), "w"(v512)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v514), "w"(v516)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v514), "w"(v516)); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v515), "w"(v517)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v515), "w"(v517)); - svfloat32_t zero558; - asm volatile("mov %0.s, #0" : "=w"(zero558)); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v515, v517); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v515, v517); + svfloat32_t zero558 = svdup_n_f32(0); svfloat32_t v558 = svcmla_f32_x(pred_full, zero558, v1015, v511, 90); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v366), "w"(v368)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v366), "w"(v368)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v370), "w"(v372)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v370), "w"(v372)); - svfloat32_t v380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v371), "w"(v373)); - 
svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v371), "w"(v373)); - svfloat32_t zero415; - asm volatile("mov %0.s, #0" : "=w"(zero415)); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t zero415 = svdup_n_f32(0); svfloat32_t v415 = svcmla_f32_x(pred_full, zero415, v1000, v369, 90); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v446), "w"(v448)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v446), "w"(v448)); - svfloat32_t zero475; - asm volatile("mov %0.s, #0" : "=w"(zero475)); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v446, v448); + svfloat32_t zero475 = svdup_n_f32(0); svfloat32_t v475 = svcmla_f32_x(pred_full, zero475, v1008, v449, 90); - svfloat32_t zero494; - asm volatile("mov %0.s, #0" : "=w"(zero494)); + svfloat32_t zero494 = svdup_n_f32(0); svfloat32_t v494 = svcmla_f32_x(pred_full, zero494, v1009, v452, 90); - svfloat32_t v499; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v453), "w"(v1010)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v518), "w"(v520)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v518), "w"(v520)); - svfloat32_t zero546; - asm volatile("mov %0.s, #0" : "=w"(zero546)); + svfloat32_t v499 = svmul_f32_x(svptrue_b32(), v453, v1010); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v518, v520); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v518, v520); + svfloat32_t zero546 = svdup_n_f32(0); svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v1015, v519, 90); - svfloat32_t v568; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v524), 
"w"(v1017)); - svfloat32_t zero575; - asm volatile("mov %0.s, #0" : "=w"(zero575)); + svfloat32_t v568 = svmul_f32_x(svptrue_b32(), v524, v1017); + svfloat32_t zero575 = svdup_n_f32(0); svfloat32_t v575 = svcmla_f32_x(pred_full, zero575, v1018, v525, 90); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v374), "w"(v376)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v374), "w"(v376)); - svfloat32_t zero403; - asm volatile("mov %0.s, #0" : "=w"(zero403)); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v374, v376); + svfloat32_t zero403 = svdup_n_f32(0); svfloat32_t v403 = svcmla_f32_x(pred_full, zero403, v1000, v377, 90); - svfloat32_t zero422; - asm volatile("mov %0.s, #0" : "=w"(zero422)); + svfloat32_t zero422 = svdup_n_f32(0); svfloat32_t v422 = svcmla_f32_x(pred_full, zero422, v1001, v380, 90); svfloat32_t v500 = svmla_f32_x(pred_full, v475, v447, v1007); svfloat32_t v501 = svnmls_f32_x(pred_full, v475, v447, v1007); svfloat32_t v502 = svmla_f32_x(pred_full, v499, v439, v1007); svfloat32_t v503 = svnmls_f32_x(pred_full, v499, v439, v1007); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v487), "w"(v494)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v487), "w"(v494)); - svfloat32_t zero532; - asm volatile("mov %0.s, #0" : "=w"(zero532)); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v487, v494); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v487, v494); + svfloat32_t zero532 = svdup_n_f32(0); svfloat32_t v532 = svcmla_f32_x(pred_full, zero532, v1015, v522, 90); - svfloat32_t zero539; - asm volatile("mov %0.s, #0" : "=w"(zero539)); + svfloat32_t zero539 = svdup_n_f32(0); svfloat32_t v539 = svcmla_f32_x(pred_full, zero539, v1015, v523, 90); svfloat32_t v576 = svmla_f32_x(pred_full, v546, v521, v1016); svfloat32_t v577 = svmls_f32_x(pred_full, v546, v521, v1016); - svfloat32_t v578; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v578) : "w"(v558), "w"(v575)); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v558), "w"(v575)); + svfloat32_t v578 = svadd_f32_x(svptrue_b32(), v558, v575); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v558, v575); svfloat32_t v580 = svmla_f32_x(pred_full, v568, v513, v1016); svfloat32_t v581 = svnmls_f32_x(pred_full, v568, v513, v1016); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v375), "w"(v403)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v375), "w"(v403)); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v375, v403); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v375, v403); svfloat32_t v430 = svmla_f32_x(pred_full, v367, v381, v1002); svfloat32_t v431 = svmls_f32_x(pred_full, v367, v381, v1002); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v415), "w"(v422)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v415), "w"(v422)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v502), "w"(v504)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v502), "w"(v504)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v503), "w"(v505)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v503), "w"(v505)); - svfloat32_t v582; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v582) : "w"(v578), "w"(v580)); - svfloat32_t v583; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v578), "w"(v580)); - svfloat32_t v584; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v584) : "w"(v579), "w"(v581)); - svfloat32_t v585; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v579), "w"(v581)); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v415, v422); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v415, v422); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v502, v504); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v502, v504); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v503, v505); + svfloat32_t v509 
= svsub_f32_x(svptrue_b32(), v503, v505); + svfloat32_t v582 = svadd_f32_x(svptrue_b32(), v578, v580); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v578, v580); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v579, v581); + svfloat32_t v585 = svsub_f32_x(svptrue_b32(), v579, v581); svfloat32_t v586 = svmla_f32_x(pred_full, v378, v450, v1007); svfloat32_t v682 = svmla_f32_x(pred_full, v379, v451, v1007); svst1_f64(pred_full, (double *)(v1026), svreinterpret_f64_f32(v378)); svst1_f64(pred_full, (double *)(v1134), svreinterpret_f64_f32(v379)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v430), "w"(v432)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v430), "w"(v432)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v431), "w"(v433)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v431), "w"(v433)); - svfloat32_t v587; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v586), "w"(v532)); - svfloat32_t v588; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v586), "w"(v532)); - svfloat32_t v634; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v429), "w"(v501)); - svfloat32_t v683; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v682), "w"(v539)); - svfloat32_t v684; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v684) : "w"(v682), "w"(v539)); - svfloat32_t v730; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v428), "w"(v500)); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v431, v433); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v431, v433); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v586, v532); + svfloat32_t v588 = svsub_f32_x(svptrue_b32(), v586, v532); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v429, v501); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v682, v539); + svfloat32_t v684 = svsub_f32_x(svptrue_b32(), v682, v539); + svfloat32_t v730 = 
svadd_f32_x(svptrue_b32(), v428, v500); svst1_f64(pred_full, (double *)(v1080), svreinterpret_f64_f32(v429)); svst1_f64(pred_full, (double *)(v1188), svreinterpret_f64_f32(v428)); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v435), "w"(v507)); - svfloat32_t v635; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v634), "w"(v577)); - svfloat32_t v636; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v634), "w"(v577)); - svfloat32_t v658; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v436), "w"(v508)); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v437), "w"(v509)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v730), "w"(v576)); - svfloat32_t v732; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v730), "w"(v576)); - svfloat32_t v754; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v754) : "w"(v434), "w"(v506)); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v435, v507); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v634, v577); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v634, v577); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v436, v508); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v437, v509); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v730, v576); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v730, v576); + svfloat32_t v754 = svadd_f32_x(svptrue_b32(), v434, v506); svst1_f64(pred_full, (double *)(v1035), svreinterpret_f64_f32(v588)); svst1_f64(pred_full, (double *)(v1044), svreinterpret_f64_f32(v587)); svst1_f64(pred_full, (double *)(v1053), svreinterpret_f64_f32(v435)); @@ -20681,22 +15859,14 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu24(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v1152), svreinterpret_f64_f32(v683)); svst1_f64(pred_full, (double *)(v1161), svreinterpret_f64_f32(v437)); svst1_f64(pred_full, (double *)(v1215), svreinterpret_f64_f32(v434)); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v610), "w"(v583)); - svfloat32_t 
v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v610), "w"(v583)); - svfloat32_t v659; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v658), "w"(v584)); - svfloat32_t v660; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v658), "w"(v584)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v706), "w"(v585)); - svfloat32_t v708; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v706), "w"(v585)); - svfloat32_t v755; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v754), "w"(v582)); - svfloat32_t v756; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v756) : "w"(v754), "w"(v582)); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v610, v583); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v610, v583); + svfloat32_t v659 = svadd_f32_x(svptrue_b32(), v658, v584); + svfloat32_t v660 = svsub_f32_x(svptrue_b32(), v658, v584); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v706, v585); + svfloat32_t v708 = svsub_f32_x(svptrue_b32(), v706, v585); + svfloat32_t v755 = svadd_f32_x(svptrue_b32(), v754, v582); + svfloat32_t v756 = svsub_f32_x(svptrue_b32(), v754, v582); svst1_f64(pred_full, (double *)(v1089), svreinterpret_f64_f32(v636)); svst1_f64(pred_full, (double *)(v1098), svreinterpret_f64_f32(v635)); svst1_f64(pred_full, (double *)(v1197), svreinterpret_f64_f32(v732)); @@ -20728,7 +15898,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, int64_t v12 = howmany - 1; int64_t v1989 = howmany / 2; for (int j = 0; j < v12; j += 2) { - float v1204 = 0.0000000000000000e+00F; float v1318 = 9.6858316112863108e-01F; float v1322 = -2.4868988716485479e-01F; float v1323 = 2.4868988716485479e-01F; @@ -20762,7 +15931,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float v1977 = 2.0000000000000000e+00F; const float32x2_t *v3677 = &v5[istride]; float32x2_t *v3932 = &v6[ostride]; - float v1207 = dir * v1204; float32x2_t v1319 = (float32x2_t){v1318, v1318}; float32x2_t v1324 = (float32x2_t){v1322, v1323}; 
float32x2_t v1487 = (float32x2_t){v1486, v1486}; @@ -20787,7 +15955,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1978 = (float32x2_t){v1977, v1977}; const float32x2_t *v3877 = &v5[0]; float32x2_t *v3887 = &v6[0]; - float32x4_t v4115 = vld1q_f32((const float32_t *)v3677); + float32x4_t v4163 = vld1q_f32((const float32_t *)v3677); float32x4_t v35 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[8])); float32x4_t v37 = @@ -20804,8 +15972,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[38])); float32x4_t v94 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[39])); - float32x4_t v106 = vtrn1q_f32(v4115, v4115); - float32x4_t v107 = vtrn2q_f32(v4115, v4115); + float32x4_t v106 = vtrn1q_f32(v4163, v4163); + float32x4_t v107 = vtrn2q_f32(v4163, v4163); float32x4_t v111 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[0])); float32x4_t v113 = @@ -20886,7 +16054,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[46])); float32x4_t v474 = vreinterpretq_f32_u64(vld1q_dup_u64((const uint64_t *)&v7[47])); - float32x2_t v1205 = (float32x2_t){v1204, v1207}; float32x4_t v1320 = vcombine_f32(v1319, v1319); float32x2_t v1326 = vmul_f32(v1950, v1324); float32x4_t v1488 = vcombine_f32(v1487, v1487); @@ -20955,9 +16122,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t *v4085 = &v6[ostride * 14]; float32x2_t *v4094 = &v6[ostride * 19]; float32x2_t *v4103 = &v6[ostride * 24]; - float32x4_t v4155 = vld1q_f32((const float32_t *)v3877); + float32x4_t v4203 = vld1q_f32((const float32_t *)v3877); float32x4_t v112 = vmulq_f32(v106, v111); - float32x4_t v1209 = vcombine_f32(v1205, v1205); float32x4_t v1328 = vcombine_f32(v1326, v1326); float32x4_t v1496 = vcombine_f32(v1494, v1494); 
float32x4_t v1664 = vcombine_f32(v1662, v1662); @@ -20968,76 +16134,76 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1860 = vcombine_f32(v1858, v1858); float32x4_t v1881 = vcombine_f32(v1879, v1879); float32x4_t v1953 = vcombine_f32(v1951, v1951); - float32x4_t v4107 = vld1q_f32((const float32_t *)v3637); - float32x4_t v4109 = vld1q_f32((const float32_t *)v3647); - float32x4_t v4111 = vld1q_f32((const float32_t *)v3657); - float32x4_t v4113 = vld1q_f32((const float32_t *)v3667); - float32x4_t v4117 = vld1q_f32((const float32_t *)v3686); - float32x4_t v4119 = vld1q_f32((const float32_t *)v3696); - float32x4_t v4121 = vld1q_f32((const float32_t *)v3706); - float32x4_t v4123 = vld1q_f32((const float32_t *)v3716); - float32x4_t v4125 = vld1q_f32((const float32_t *)v3726); - float32x4_t v4127 = vld1q_f32((const float32_t *)v3736); - float32x4_t v4129 = vld1q_f32((const float32_t *)v3746); - float32x4_t v4131 = vld1q_f32((const float32_t *)v3756); - float32x4_t v4133 = vld1q_f32((const float32_t *)v3766); - float32x4_t v4135 = vld1q_f32((const float32_t *)v3776); - float32x4_t v4137 = vld1q_f32((const float32_t *)v3786); - float32x4_t v4139 = vld1q_f32((const float32_t *)v3796); - float32x4_t v4141 = vld1q_f32((const float32_t *)v3806); - float32x4_t v4143 = vld1q_f32((const float32_t *)v3816); - float32x4_t v4145 = vld1q_f32((const float32_t *)v3826); - float32x4_t v4147 = vld1q_f32((const float32_t *)v3836); - float32x4_t v4149 = vld1q_f32((const float32_t *)v3846); - float32x4_t v4151 = vld1q_f32((const float32_t *)v3856); - float32x4_t v4153 = vld1q_f32((const float32_t *)v3866); - float32x4_t v30 = vtrn1q_f32(v4107, v4107); - float32x4_t v31 = vtrn2q_f32(v4107, v4107); - float32x4_t v49 = vtrn1q_f32(v4109, v4109); - float32x4_t v50 = vtrn2q_f32(v4109, v4109); - float32x4_t v68 = vtrn1q_f32(v4111, v4111); - float32x4_t v69 = vtrn2q_f32(v4111, v4111); - float32x4_t v87 = vtrn1q_f32(v4113, v4113); - float32x4_t v88 = 
vtrn2q_f32(v4113, v4113); + float32x4_t v4155 = vld1q_f32((const float32_t *)v3637); + float32x4_t v4157 = vld1q_f32((const float32_t *)v3647); + float32x4_t v4159 = vld1q_f32((const float32_t *)v3657); + float32x4_t v4161 = vld1q_f32((const float32_t *)v3667); + float32x4_t v4165 = vld1q_f32((const float32_t *)v3686); + float32x4_t v4167 = vld1q_f32((const float32_t *)v3696); + float32x4_t v4169 = vld1q_f32((const float32_t *)v3706); + float32x4_t v4171 = vld1q_f32((const float32_t *)v3716); + float32x4_t v4173 = vld1q_f32((const float32_t *)v3726); + float32x4_t v4175 = vld1q_f32((const float32_t *)v3736); + float32x4_t v4177 = vld1q_f32((const float32_t *)v3746); + float32x4_t v4179 = vld1q_f32((const float32_t *)v3756); + float32x4_t v4181 = vld1q_f32((const float32_t *)v3766); + float32x4_t v4183 = vld1q_f32((const float32_t *)v3776); + float32x4_t v4185 = vld1q_f32((const float32_t *)v3786); + float32x4_t v4187 = vld1q_f32((const float32_t *)v3796); + float32x4_t v4189 = vld1q_f32((const float32_t *)v3806); + float32x4_t v4191 = vld1q_f32((const float32_t *)v3816); + float32x4_t v4193 = vld1q_f32((const float32_t *)v3826); + float32x4_t v4195 = vld1q_f32((const float32_t *)v3836); + float32x4_t v4197 = vld1q_f32((const float32_t *)v3846); + float32x4_t v4199 = vld1q_f32((const float32_t *)v3856); + float32x4_t v4201 = vld1q_f32((const float32_t *)v3866); + float32x4_t v30 = vtrn1q_f32(v4155, v4155); + float32x4_t v31 = vtrn2q_f32(v4155, v4155); + float32x4_t v49 = vtrn1q_f32(v4157, v4157); + float32x4_t v50 = vtrn2q_f32(v4157, v4157); + float32x4_t v68 = vtrn1q_f32(v4159, v4159); + float32x4_t v69 = vtrn2q_f32(v4159, v4159); + float32x4_t v87 = vtrn1q_f32(v4161, v4161); + float32x4_t v88 = vtrn2q_f32(v4161, v4161); float32x4_t v115 = vfmaq_f32(v112, v107, v113); - float32x4_t v125 = vtrn1q_f32(v4117, v4117); - float32x4_t v126 = vtrn2q_f32(v4117, v4117); - float32x4_t v144 = vtrn1q_f32(v4119, v4119); - float32x4_t v145 = vtrn2q_f32(v4119, v4119); - 
float32x4_t v163 = vtrn1q_f32(v4121, v4121); - float32x4_t v164 = vtrn2q_f32(v4121, v4121); - float32x4_t v182 = vtrn1q_f32(v4123, v4123); - float32x4_t v183 = vtrn2q_f32(v4123, v4123); - float32x4_t v201 = vtrn1q_f32(v4125, v4125); - float32x4_t v202 = vtrn2q_f32(v4125, v4125); - float32x4_t v220 = vtrn1q_f32(v4127, v4127); - float32x4_t v221 = vtrn2q_f32(v4127, v4127); - float32x4_t v239 = vtrn1q_f32(v4129, v4129); - float32x4_t v240 = vtrn2q_f32(v4129, v4129); - float32x4_t v258 = vtrn1q_f32(v4131, v4131); - float32x4_t v259 = vtrn2q_f32(v4131, v4131); - float32x4_t v277 = vtrn1q_f32(v4133, v4133); - float32x4_t v278 = vtrn2q_f32(v4133, v4133); - float32x4_t v296 = vtrn1q_f32(v4135, v4135); - float32x4_t v297 = vtrn2q_f32(v4135, v4135); - float32x4_t v315 = vtrn1q_f32(v4137, v4137); - float32x4_t v316 = vtrn2q_f32(v4137, v4137); - float32x4_t v334 = vtrn1q_f32(v4139, v4139); - float32x4_t v335 = vtrn2q_f32(v4139, v4139); - float32x4_t v353 = vtrn1q_f32(v4141, v4141); - float32x4_t v354 = vtrn2q_f32(v4141, v4141); - float32x4_t v372 = vtrn1q_f32(v4143, v4143); - float32x4_t v373 = vtrn2q_f32(v4143, v4143); - float32x4_t v391 = vtrn1q_f32(v4145, v4145); - float32x4_t v392 = vtrn2q_f32(v4145, v4145); - float32x4_t v410 = vtrn1q_f32(v4147, v4147); - float32x4_t v411 = vtrn2q_f32(v4147, v4147); - float32x4_t v429 = vtrn1q_f32(v4149, v4149); - float32x4_t v430 = vtrn2q_f32(v4149, v4149); - float32x4_t v448 = vtrn1q_f32(v4151, v4151); - float32x4_t v449 = vtrn2q_f32(v4151, v4151); - float32x4_t v467 = vtrn1q_f32(v4153, v4153); - float32x4_t v468 = vtrn2q_f32(v4153, v4153); + float32x4_t v125 = vtrn1q_f32(v4165, v4165); + float32x4_t v126 = vtrn2q_f32(v4165, v4165); + float32x4_t v144 = vtrn1q_f32(v4167, v4167); + float32x4_t v145 = vtrn2q_f32(v4167, v4167); + float32x4_t v163 = vtrn1q_f32(v4169, v4169); + float32x4_t v164 = vtrn2q_f32(v4169, v4169); + float32x4_t v182 = vtrn1q_f32(v4171, v4171); + float32x4_t v183 = vtrn2q_f32(v4171, v4171); + float32x4_t v201 = 
vtrn1q_f32(v4173, v4173); + float32x4_t v202 = vtrn2q_f32(v4173, v4173); + float32x4_t v220 = vtrn1q_f32(v4175, v4175); + float32x4_t v221 = vtrn2q_f32(v4175, v4175); + float32x4_t v239 = vtrn1q_f32(v4177, v4177); + float32x4_t v240 = vtrn2q_f32(v4177, v4177); + float32x4_t v258 = vtrn1q_f32(v4179, v4179); + float32x4_t v259 = vtrn2q_f32(v4179, v4179); + float32x4_t v277 = vtrn1q_f32(v4181, v4181); + float32x4_t v278 = vtrn2q_f32(v4181, v4181); + float32x4_t v296 = vtrn1q_f32(v4183, v4183); + float32x4_t v297 = vtrn2q_f32(v4183, v4183); + float32x4_t v315 = vtrn1q_f32(v4185, v4185); + float32x4_t v316 = vtrn2q_f32(v4185, v4185); + float32x4_t v334 = vtrn1q_f32(v4187, v4187); + float32x4_t v335 = vtrn2q_f32(v4187, v4187); + float32x4_t v353 = vtrn1q_f32(v4189, v4189); + float32x4_t v354 = vtrn2q_f32(v4189, v4189); + float32x4_t v372 = vtrn1q_f32(v4191, v4191); + float32x4_t v373 = vtrn2q_f32(v4191, v4191); + float32x4_t v391 = vtrn1q_f32(v4193, v4193); + float32x4_t v392 = vtrn2q_f32(v4193, v4193); + float32x4_t v410 = vtrn1q_f32(v4195, v4195); + float32x4_t v411 = vtrn2q_f32(v4195, v4195); + float32x4_t v429 = vtrn1q_f32(v4197, v4197); + float32x4_t v430 = vtrn2q_f32(v4197, v4197); + float32x4_t v448 = vtrn1q_f32(v4199, v4199); + float32x4_t v449 = vtrn2q_f32(v4199, v4199); + float32x4_t v467 = vtrn1q_f32(v4201, v4201); + float32x4_t v468 = vtrn2q_f32(v4201, v4201); float32x4_t v36 = vmulq_f32(v30, v35); float32x4_t v55 = vmulq_f32(v49, v54); float32x4_t v74 = vmulq_f32(v68, v73); @@ -21084,86 +16250,26 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v438 = vfmaq_f32(v435, v430, v436); float32x4_t v457 = vfmaq_f32(v454, v449, v455); float32x4_t v476 = vfmaq_f32(v473, v468, v474); - float32x4_t v494 = vrev64q_f32(v39); - float32x4_t v508 = vrev64q_f32(v58); - float32x4_t v522 = vrev64q_f32(v96); - float32x4_t v543 = vrev64q_f32(v77); - float32x4_t v627 = vrev64q_f32(v134); - float32x4_t v641 = vrev64q_f32(v153); - 
float32x4_t v655 = vrev64q_f32(v191); - float32x4_t v676 = vrev64q_f32(v172); - float32x4_t v760 = vrev64q_f32(v229); - float32x4_t v774 = vrev64q_f32(v248); - float32x4_t v788 = vrev64q_f32(v286); - float32x4_t v809 = vrev64q_f32(v267); - float32x4_t v893 = vrev64q_f32(v324); - float32x4_t v907 = vrev64q_f32(v343); - float32x4_t v921 = vrev64q_f32(v381); - float32x4_t v942 = vrev64q_f32(v362); - float32x4_t v1026 = vrev64q_f32(v419); - float32x4_t v1040 = vrev64q_f32(v438); - float32x4_t v1054 = vrev64q_f32(v476); - float32x4_t v1075 = vrev64q_f32(v457); - float32x4_t v496 = vmulq_f32(v494, v1209); - float32x4_t v510 = vmulq_f32(v508, v1209); - float32x4_t v524 = vmulq_f32(v522, v1209); - float32x4_t v545 = vmulq_f32(v543, v1209); - float32x4_t v629 = vmulq_f32(v627, v1209); - float32x4_t v643 = vmulq_f32(v641, v1209); - float32x4_t v657 = vmulq_f32(v655, v1209); - float32x4_t v678 = vmulq_f32(v676, v1209); - float32x4_t v762 = vmulq_f32(v760, v1209); - float32x4_t v776 = vmulq_f32(v774, v1209); - float32x4_t v790 = vmulq_f32(v788, v1209); - float32x4_t v811 = vmulq_f32(v809, v1209); - float32x4_t v895 = vmulq_f32(v893, v1209); - float32x4_t v909 = vmulq_f32(v907, v1209); - float32x4_t v923 = vmulq_f32(v921, v1209); - float32x4_t v944 = vmulq_f32(v942, v1209); - float32x4_t v1028 = vmulq_f32(v1026, v1209); - float32x4_t v1042 = vmulq_f32(v1040, v1209); - float32x4_t v1056 = vmulq_f32(v1054, v1209); - float32x4_t v1077 = vmulq_f32(v1075, v1209); - float32x4_t v497 = vaddq_f32(v496, v39); - float32x4_t v511 = vaddq_f32(v510, v58); - float32x4_t v525 = vaddq_f32(v524, v96); - float32x4_t v546 = vaddq_f32(v545, v77); - float32x4_t v630 = vaddq_f32(v629, v134); - float32x4_t v644 = vaddq_f32(v643, v153); - float32x4_t v658 = vaddq_f32(v657, v191); - float32x4_t v679 = vaddq_f32(v678, v172); - float32x4_t v763 = vaddq_f32(v762, v229); - float32x4_t v777 = vaddq_f32(v776, v248); - float32x4_t v791 = vaddq_f32(v790, v286); - float32x4_t v812 = vaddq_f32(v811, v267); - 
float32x4_t v896 = vaddq_f32(v895, v324); - float32x4_t v910 = vaddq_f32(v909, v343); - float32x4_t v924 = vaddq_f32(v923, v381); - float32x4_t v945 = vaddq_f32(v944, v362); - float32x4_t v1029 = vaddq_f32(v1028, v419); - float32x4_t v1043 = vaddq_f32(v1042, v438); - float32x4_t v1057 = vaddq_f32(v1056, v476); - float32x4_t v1078 = vaddq_f32(v1077, v457); - float32x4_t v526 = vsubq_f32(v497, v525); - float32x4_t v531 = vmulq_f32(v497, v1979); - float32x4_t v547 = vsubq_f32(v511, v546); - float32x4_t v552 = vmulq_f32(v511, v1979); - float32x4_t v659 = vsubq_f32(v630, v658); - float32x4_t v664 = vmulq_f32(v630, v1979); - float32x4_t v680 = vsubq_f32(v644, v679); - float32x4_t v685 = vmulq_f32(v644, v1979); - float32x4_t v792 = vsubq_f32(v763, v791); - float32x4_t v797 = vmulq_f32(v763, v1979); - float32x4_t v813 = vsubq_f32(v777, v812); - float32x4_t v818 = vmulq_f32(v777, v1979); - float32x4_t v925 = vsubq_f32(v896, v924); - float32x4_t v930 = vmulq_f32(v896, v1979); - float32x4_t v946 = vsubq_f32(v910, v945); - float32x4_t v951 = vmulq_f32(v910, v1979); - float32x4_t v1058 = vsubq_f32(v1029, v1057); - float32x4_t v1063 = vmulq_f32(v1029, v1979); - float32x4_t v1079 = vsubq_f32(v1043, v1078); - float32x4_t v1084 = vmulq_f32(v1043, v1979); + float32x4_t v526 = vsubq_f32(v39, v96); + float32x4_t v531 = vmulq_f32(v39, v1979); + float32x4_t v547 = vsubq_f32(v58, v77); + float32x4_t v552 = vmulq_f32(v58, v1979); + float32x4_t v659 = vsubq_f32(v134, v191); + float32x4_t v664 = vmulq_f32(v134, v1979); + float32x4_t v680 = vsubq_f32(v153, v172); + float32x4_t v685 = vmulq_f32(v153, v1979); + float32x4_t v792 = vsubq_f32(v229, v286); + float32x4_t v797 = vmulq_f32(v229, v1979); + float32x4_t v813 = vsubq_f32(v248, v267); + float32x4_t v818 = vmulq_f32(v248, v1979); + float32x4_t v925 = vsubq_f32(v324, v381); + float32x4_t v930 = vmulq_f32(v324, v1979); + float32x4_t v946 = vsubq_f32(v343, v362); + float32x4_t v951 = vmulq_f32(v343, v1979); + float32x4_t v1058 = 
vsubq_f32(v419, v476); + float32x4_t v1063 = vmulq_f32(v419, v1979); + float32x4_t v1079 = vsubq_f32(v438, v457); + float32x4_t v1084 = vmulq_f32(v438, v1979); float32x4_t v532 = vsubq_f32(v531, v526); float32x4_t v553 = vsubq_f32(v552, v547); float32x4_t v566 = vmulq_f32(v547, v1920); @@ -21206,7 +16312,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1117 = vsubq_f32(v1116, v1079); float32x4_t v560 = vmulq_f32(v554, v1896); float32x4_t v572 = vmulq_f32(v555, v1908); - float32x4_t v586 = vaddq_f32(v4155, v554); + float32x4_t v586 = vaddq_f32(v4203, v554); float32x4_t v592 = vrev64q_f32(v567); float32x4_t v601 = vrev64q_f32(v585); float32x4_t v693 = vmulq_f32(v687, v1896); @@ -21229,7 +16335,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1118 = vaddq_f32(v400, v1086); float32x4_t v1124 = vrev64q_f32(v1099); float32x4_t v1133 = vrev64q_f32(v1117); - float32x4_t v561 = vsubq_f32(v4155, v560); + float32x4_t v561 = vsubq_f32(v4203, v560); float32x4_t v594 = vmulq_f32(v592, v1953); float32x4_t v603 = vmulq_f32(v601, v1953); float32x4_t v694 = vsubq_f32(v115, v693); @@ -21244,10 +16350,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1093 = vsubq_f32(v400, v1092); float32x4_t v1126 = vmulq_f32(v1124, v1953); float32x4_t v1135 = vmulq_f32(v1133, v1953); - float32x4_t v1159 = vrev64q_f32(v719); - float32x4_t v1173 = vrev64q_f32(v852); - float32x4_t v1187 = vrev64q_f32(v1118); - float32x4_t v1208 = vrev64q_f32(v985); + float32x4_t v1191 = vsubq_f32(v719, v1118); + float32x4_t v1196 = vmulq_f32(v719, v1979); + float32x4_t v1212 = vsubq_f32(v852, v985); + float32x4_t v1217 = vmulq_f32(v852, v1979); float32x4_t v573 = vsubq_f32(v561, v572); float32x4_t v578 = vmulq_f32(v561, v1979); float32x4_t v706 = vsubq_f32(v694, v705); @@ -21258,10 +16364,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, 
float32x4_t v977 = vmulq_f32(v960, v1979); float32x4_t v1105 = vsubq_f32(v1093, v1104); float32x4_t v1110 = vmulq_f32(v1093, v1979); - float32x4_t v1161 = vmulq_f32(v1159, v1209); - float32x4_t v1175 = vmulq_f32(v1173, v1209); - float32x4_t v1189 = vmulq_f32(v1187, v1209); - float32x4_t v1210 = vmulq_f32(v1208, v1209); + float32x4_t v1197 = vsubq_f32(v1196, v1191); + float32x4_t v1218 = vsubq_f32(v1217, v1212); + float32x4_t v1231 = vmulq_f32(v1212, v1920); + float32x4_t v1249 = vmulq_f32(v1191, v1920); float32x4_t v579 = vsubq_f32(v578, v573); float32x4_t v604 = vsubq_f32(v573, v603); float32x4_t v609 = vmulq_f32(v573, v1979); @@ -21277,10 +16383,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1111 = vsubq_f32(v1110, v1105); float32x4_t v1136 = vsubq_f32(v1105, v1135); float32x4_t v1141 = vmulq_f32(v1105, v1979); - float32x4_t v1162 = vaddq_f32(v1161, v719); - float32x4_t v1176 = vaddq_f32(v1175, v852); - float32x4_t v1190 = vaddq_f32(v1189, v1118); - float32x4_t v1211 = vaddq_f32(v1210, v985); + float32x4_t v1219 = vaddq_f32(v1197, v1218); + float32x4_t v1220 = vsubq_f32(v1197, v1218); + float32x4_t v1232 = vaddq_f32(v1191, v1231); + float32x4_t v1250 = vsubq_f32(v1249, v1212); float32x4_t v595 = vsubq_f32(v579, v594); float32x4_t v610 = vsubq_f32(v609, v604); float32x4_t v615 = vmulq_f32(v579, v1979); @@ -21296,10 +16402,11 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1127 = vsubq_f32(v1111, v1126); float32x4_t v1142 = vsubq_f32(v1141, v1136); float32x4_t v1147 = vmulq_f32(v1111, v1979); - float32x4_t v1191 = vsubq_f32(v1162, v1190); - float32x4_t v1196 = vmulq_f32(v1162, v1979); - float32x4_t v1212 = vsubq_f32(v1176, v1211); - float32x4_t v1217 = vmulq_f32(v1176, v1979); + float32x4_t v1225 = vmulq_f32(v1219, v1896); + float32x4_t v1237 = vmulq_f32(v1220, v1908); + float32x4_t v1251 = vaddq_f32(v586, v1219); + float32x4_t v1264 = vrev64q_f32(v1232); + float32x4_t 
v1280 = vrev64q_f32(v1250); float32x4_t v1495 = vrev64q_f32(v737); float32x4_t v1509 = vrev64q_f32(v870); float32x4_t v1523 = vrev64q_f32(v1136); @@ -21309,10 +16416,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v882 = vsubq_f32(v881, v861); float32x4_t v1015 = vsubq_f32(v1014, v994); float32x4_t v1148 = vsubq_f32(v1147, v1127); - float32x4_t v1197 = vsubq_f32(v1196, v1191); - float32x4_t v1218 = vsubq_f32(v1217, v1212); - float32x4_t v1231 = vmulq_f32(v1212, v1920); - float32x4_t v1249 = vmulq_f32(v1191, v1920); + float32x4_t v1226 = vsubq_f32(v586, v1225); + float32x4_t v1266 = vmulq_f32(v1264, v1953); + float32x4_t v1282 = vmulq_f32(v1280, v1953); float32x4_t v1327 = vrev64q_f32(v728); float32x4_t v1341 = vrev64q_f32(v861); float32x4_t v1355 = vrev64q_f32(v1127); @@ -21325,10 +16431,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1677 = vrev64q_f32(v876); float32x4_t v1691 = vrev64q_f32(v1142); float32x4_t v1712 = vrev64q_f32(v1009); - float32x4_t v1219 = vaddq_f32(v1197, v1218); - float32x4_t v1220 = vsubq_f32(v1197, v1218); - float32x4_t v1232 = vaddq_f32(v1191, v1231); - float32x4_t v1250 = vsubq_f32(v1249, v1212); + vst1q_f32((float32_t *)v3887, v1251); + float32x4_t v1238 = vsubq_f32(v1226, v1237); + float32x4_t v1243 = vmulq_f32(v1226, v1979); float32x4_t v1329 = vmulq_f32(v1327, v1328); float32x4_t v1343 = vmulq_f32(v1341, v1496); float32x4_t v1357 = vmulq_f32(v1355, v1832); @@ -21345,11 +16450,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1845 = vrev64q_f32(v882); float32x4_t v1859 = vrev64q_f32(v1148); float32x4_t v1880 = vrev64q_f32(v1015); - float32x4_t v1225 = vmulq_f32(v1219, v1896); - float32x4_t v1237 = vmulq_f32(v1220, v1908); - float32x4_t v1251 = vaddq_f32(v586, v1219); - float32x4_t v1264 = vrev64q_f32(v1232); - float32x4_t v1280 = vrev64q_f32(v1250); + float32x4_t v1244 = vsubq_f32(v1243, 
v1238); + float32x4_t v1283 = vsubq_f32(v1238, v1282); + float32x4_t v1295 = vmulq_f32(v1238, v1979); float32x4_t v1330 = vfmaq_f32(v1329, v728, v1320); float32x4_t v1344 = vfmaq_f32(v1343, v861, v1488); float32x4_t v1358 = vfmaq_f32(v1357, v1127, v1824); @@ -21366,9 +16469,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1847 = vmulq_f32(v1845, v1846); float32x4_t v1861 = vmulq_f32(v1859, v1860); float32x4_t v1882 = vmulq_f32(v1880, v1881); - float32x4_t v1226 = vsubq_f32(v586, v1225); - float32x4_t v1266 = vmulq_f32(v1264, v1953); - float32x4_t v1282 = vmulq_f32(v1280, v1953); + float32x4_t v1267 = vsubq_f32(v1244, v1266); + float32x4_t v1296 = vsubq_f32(v1295, v1283); + float32x4_t v1308 = vmulq_f32(v1244, v1979); float32x4_t v1359 = vsubq_f32(v1330, v1358); float32x4_t v1364 = vmulq_f32(v1330, v1979); float32x4_t v1380 = vsubq_f32(v1344, v1379); @@ -21385,9 +16488,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1848 = vfmaq_f32(v1847, v882, v1838); float32x4_t v1862 = vfmaq_f32(v1861, v1148, v1852); float32x4_t v1883 = vfmaq_f32(v1882, v1015, v1873); - vst1q_f32((float32_t *)v3887, v1251); - float32x4_t v1238 = vsubq_f32(v1226, v1237); - float32x4_t v1243 = vmulq_f32(v1226, v1979); + vst1q_f32((float32_t *)v3905, v1283); + float32x4_t v1309 = vsubq_f32(v1308, v1267); float32x4_t v1365 = vsubq_f32(v1364, v1359); float32x4_t v1386 = vsubq_f32(v1385, v1380); float32x4_t v1399 = vmulq_f32(v1380, v1920); @@ -21404,9 +16506,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1868 = vmulq_f32(v1834, v1979); float32x4_t v1884 = vsubq_f32(v1848, v1883); float32x4_t v1889 = vmulq_f32(v1848, v1979); - float32x4_t v1244 = vsubq_f32(v1243, v1238); - float32x4_t v1283 = vsubq_f32(v1238, v1282); - float32x4_t v1295 = vmulq_f32(v1238, v1979); + vst1q_f32((float32_t *)v3896, v1267); + vst1q_f32((float32_t *)v3914, v1296); float32x4_t 
v1387 = vaddq_f32(v1365, v1386); float32x4_t v1388 = vsubq_f32(v1365, v1386); float32x4_t v1400 = vaddq_f32(v1359, v1399); @@ -21424,9 +16525,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1890 = vsubq_f32(v1889, v1884); float32x4_t v1903 = vmulq_f32(v1884, v1920); float32x4_t v1921 = vmulq_f32(v1863, v1920); - float32x4_t v1267 = vsubq_f32(v1244, v1266); - float32x4_t v1296 = vsubq_f32(v1295, v1283); - float32x4_t v1308 = vmulq_f32(v1244, v1979); + vst1q_f32((float32_t *)v3923, v1309); float32x4_t v1393 = vmulq_f32(v1387, v1896); float32x4_t v1405 = vmulq_f32(v1388, v1908); float32x4_t v1419 = vaddq_f32(v595, v1387); @@ -21444,9 +16543,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1892 = vsubq_f32(v1869, v1890); float32x4_t v1904 = vaddq_f32(v1863, v1903); float32x4_t v1922 = vsubq_f32(v1921, v1884); - vst1q_f32((float32_t *)v3905, v1283); vst1q_f32((float32_t *)v3977, v1587); - float32x4_t v1309 = vsubq_f32(v1308, v1267); float32x4_t v1394 = vsubq_f32(v595, v1393); float32x4_t v1434 = vmulq_f32(v1432, v1953); float32x4_t v1450 = vmulq_f32(v1448, v1953); @@ -21460,8 +16557,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1923 = vaddq_f32(v616, v1891); float32x4_t v1936 = vrev64q_f32(v1904); float32x4_t v1952 = vrev64q_f32(v1922); - vst1q_f32((float32_t *)v3896, v1267); - vst1q_f32((float32_t *)v3914, v1296); vst1q_f32((float32_t *)v3932, v1419); vst1q_f32((float32_t *)v4022, v1755); float32x4_t v1406 = vsubq_f32(v1394, v1405); @@ -21474,7 +16569,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x4_t v1898 = vsubq_f32(v616, v1897); float32x4_t v1938 = vmulq_f32(v1936, v1953); float32x4_t v1954 = vmulq_f32(v1952, v1953); - vst1q_f32((float32_t *)v3923, v1309); vst1q_f32((float32_t *)v4067, v1923); float32x4_t v1412 = vsubq_f32(v1411, v1406); float32x4_t v1451 = vsubq_f32(v1406, 
v1450); @@ -21524,7 +16618,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, } for (int j = v1989 * 2; j < howmany; j += 1) { float32x2_t v2061 = v5[istride]; - float v2979 = 0.0000000000000000e+00F; float v3072 = 9.6858316112863108e-01F; float v3075 = -2.4868988716485479e-01F; float v3076 = 2.4868988716485479e-01F; @@ -21607,7 +16700,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2348 = v7[46]; float32x2_t v2353 = v7[47]; float32x2_t v2361 = v5[0]; - float v2982 = dir * v2979; float32x2_t v3073 = (float32x2_t){v3072, v3072}; float32x2_t v3077 = (float32x2_t){v3075, v3076}; float32x2_t v3212 = (float32x2_t){v3211, v3211}; @@ -21654,7 +16746,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2316 = v5[istride * 14]; float32x2_t v2331 = v5[istride * 19]; float32x2_t v2346 = v5[istride * 24]; - float32x2_t v2980 = (float32x2_t){v2979, v2982}; float32x2_t v3079 = vmul_f32(v3597, v3077); float32x2_t v3218 = vmul_f32(v3597, v3216); float32x2_t v3357 = vmul_f32(v3597, v3355); @@ -21758,86 +16849,26 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2326 = vfma_f32(v2324, v2320, v2323); float32x2_t v2341 = vfma_f32(v2339, v2335, v2338); float32x2_t v2356 = vfma_f32(v2354, v2350, v2353); - float32x2_t v2371 = vrev64_f32(v2011); - float32x2_t v2383 = vrev64_f32(v2026); - float32x2_t v2395 = vrev64_f32(v2056); - float32x2_t v2413 = vrev64_f32(v2041); - float32x2_t v2485 = vrev64_f32(v2086); - float32x2_t v2497 = vrev64_f32(v2101); - float32x2_t v2509 = vrev64_f32(v2131); - float32x2_t v2527 = vrev64_f32(v2116); - float32x2_t v2599 = vrev64_f32(v2161); - float32x2_t v2611 = vrev64_f32(v2176); - float32x2_t v2623 = vrev64_f32(v2206); - float32x2_t v2641 = vrev64_f32(v2191); - float32x2_t v2713 = vrev64_f32(v2236); - float32x2_t v2725 = vrev64_f32(v2251); - float32x2_t v2737 = vrev64_f32(v2281); - 
float32x2_t v2755 = vrev64_f32(v2266); - float32x2_t v2827 = vrev64_f32(v2311); - float32x2_t v2839 = vrev64_f32(v2326); - float32x2_t v2851 = vrev64_f32(v2356); - float32x2_t v2869 = vrev64_f32(v2341); - float32x2_t v2372 = vmul_f32(v2371, v2980); - float32x2_t v2384 = vmul_f32(v2383, v2980); - float32x2_t v2396 = vmul_f32(v2395, v2980); - float32x2_t v2414 = vmul_f32(v2413, v2980); - float32x2_t v2486 = vmul_f32(v2485, v2980); - float32x2_t v2498 = vmul_f32(v2497, v2980); - float32x2_t v2510 = vmul_f32(v2509, v2980); - float32x2_t v2528 = vmul_f32(v2527, v2980); - float32x2_t v2600 = vmul_f32(v2599, v2980); - float32x2_t v2612 = vmul_f32(v2611, v2980); - float32x2_t v2624 = vmul_f32(v2623, v2980); - float32x2_t v2642 = vmul_f32(v2641, v2980); - float32x2_t v2714 = vmul_f32(v2713, v2980); - float32x2_t v2726 = vmul_f32(v2725, v2980); - float32x2_t v2738 = vmul_f32(v2737, v2980); - float32x2_t v2756 = vmul_f32(v2755, v2980); - float32x2_t v2828 = vmul_f32(v2827, v2980); - float32x2_t v2840 = vmul_f32(v2839, v2980); - float32x2_t v2852 = vmul_f32(v2851, v2980); - float32x2_t v2870 = vmul_f32(v2869, v2980); - float32x2_t v2373 = vadd_f32(v2372, v2011); - float32x2_t v2385 = vadd_f32(v2384, v2026); - float32x2_t v2397 = vadd_f32(v2396, v2056); - float32x2_t v2415 = vadd_f32(v2414, v2041); - float32x2_t v2487 = vadd_f32(v2486, v2086); - float32x2_t v2499 = vadd_f32(v2498, v2101); - float32x2_t v2511 = vadd_f32(v2510, v2131); - float32x2_t v2529 = vadd_f32(v2528, v2116); - float32x2_t v2601 = vadd_f32(v2600, v2161); - float32x2_t v2613 = vadd_f32(v2612, v2176); - float32x2_t v2625 = vadd_f32(v2624, v2206); - float32x2_t v2643 = vadd_f32(v2642, v2191); - float32x2_t v2715 = vadd_f32(v2714, v2236); - float32x2_t v2727 = vadd_f32(v2726, v2251); - float32x2_t v2739 = vadd_f32(v2738, v2281); - float32x2_t v2757 = vadd_f32(v2756, v2266); - float32x2_t v2829 = vadd_f32(v2828, v2311); - float32x2_t v2841 = vadd_f32(v2840, v2326); - float32x2_t v2853 = vadd_f32(v2852, v2356); - 
float32x2_t v2871 = vadd_f32(v2870, v2341); - float32x2_t v2398 = vsub_f32(v2373, v2397); - float32x2_t v2402 = vmul_f32(v2373, v3619); - float32x2_t v2416 = vsub_f32(v2385, v2415); - float32x2_t v2420 = vmul_f32(v2385, v3619); - float32x2_t v2512 = vsub_f32(v2487, v2511); - float32x2_t v2516 = vmul_f32(v2487, v3619); - float32x2_t v2530 = vsub_f32(v2499, v2529); - float32x2_t v2534 = vmul_f32(v2499, v3619); - float32x2_t v2626 = vsub_f32(v2601, v2625); - float32x2_t v2630 = vmul_f32(v2601, v3619); - float32x2_t v2644 = vsub_f32(v2613, v2643); - float32x2_t v2648 = vmul_f32(v2613, v3619); - float32x2_t v2740 = vsub_f32(v2715, v2739); - float32x2_t v2744 = vmul_f32(v2715, v3619); - float32x2_t v2758 = vsub_f32(v2727, v2757); - float32x2_t v2762 = vmul_f32(v2727, v3619); - float32x2_t v2854 = vsub_f32(v2829, v2853); - float32x2_t v2858 = vmul_f32(v2829, v3619); - float32x2_t v2872 = vsub_f32(v2841, v2871); - float32x2_t v2876 = vmul_f32(v2841, v3619); + float32x2_t v2398 = vsub_f32(v2011, v2056); + float32x2_t v2402 = vmul_f32(v2011, v3619); + float32x2_t v2416 = vsub_f32(v2026, v2041); + float32x2_t v2420 = vmul_f32(v2026, v3619); + float32x2_t v2512 = vsub_f32(v2086, v2131); + float32x2_t v2516 = vmul_f32(v2086, v3619); + float32x2_t v2530 = vsub_f32(v2101, v2116); + float32x2_t v2534 = vmul_f32(v2101, v3619); + float32x2_t v2626 = vsub_f32(v2161, v2206); + float32x2_t v2630 = vmul_f32(v2161, v3619); + float32x2_t v2644 = vsub_f32(v2176, v2191); + float32x2_t v2648 = vmul_f32(v2176, v3619); + float32x2_t v2740 = vsub_f32(v2236, v2281); + float32x2_t v2744 = vmul_f32(v2236, v3619); + float32x2_t v2758 = vsub_f32(v2251, v2266); + float32x2_t v2762 = vmul_f32(v2251, v3619); + float32x2_t v2854 = vsub_f32(v2311, v2356); + float32x2_t v2858 = vmul_f32(v2311, v3619); + float32x2_t v2872 = vsub_f32(v2326, v2341); + float32x2_t v2876 = vmul_f32(v2326, v3619); float32x2_t v2403 = vsub_f32(v2402, v2398); float32x2_t v2421 = vsub_f32(v2420, v2416); float32x2_t v2432 = 
vmul_f32(v2416, v3572); @@ -21918,10 +16949,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2884 = vsub_f32(v2296, v2883); float32x2_t v2912 = vmul_f32(v2911, v3598); float32x2_t v2920 = vmul_f32(v2919, v3598); - float32x2_t v2941 = vrev64_f32(v2563); - float32x2_t v2953 = vrev64_f32(v2677); - float32x2_t v2965 = vrev64_f32(v2905); - float32x2_t v2983 = vrev64_f32(v2791); + float32x2_t v2968 = vsub_f32(v2563, v2905); + float32x2_t v2972 = vmul_f32(v2563, v3619); + float32x2_t v2986 = vsub_f32(v2677, v2791); + float32x2_t v2990 = vmul_f32(v2677, v3619); float32x2_t v2438 = vsub_f32(v2428, v2437); float32x2_t v2442 = vmul_f32(v2428, v3619); float32x2_t v2552 = vsub_f32(v2542, v2551); @@ -21932,10 +16963,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2784 = vmul_f32(v2770, v3619); float32x2_t v2894 = vsub_f32(v2884, v2893); float32x2_t v2898 = vmul_f32(v2884, v3619); - float32x2_t v2942 = vmul_f32(v2941, v2980); - float32x2_t v2954 = vmul_f32(v2953, v2980); - float32x2_t v2966 = vmul_f32(v2965, v2980); - float32x2_t v2984 = vmul_f32(v2983, v2980); + float32x2_t v2973 = vsub_f32(v2972, v2968); + float32x2_t v2991 = vsub_f32(v2990, v2986); + float32x2_t v3002 = vmul_f32(v2986, v3572); + float32x2_t v3017 = vmul_f32(v2968, v3572); float32x2_t v2443 = vsub_f32(v2442, v2438); float32x2_t v2465 = vsub_f32(v2438, v2464); float32x2_t v2469 = vmul_f32(v2438, v3619); @@ -21951,10 +16982,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2899 = vsub_f32(v2898, v2894); float32x2_t v2921 = vsub_f32(v2894, v2920); float32x2_t v2925 = vmul_f32(v2894, v3619); - float32x2_t v2943 = vadd_f32(v2942, v2563); - float32x2_t v2955 = vadd_f32(v2954, v2677); - float32x2_t v2967 = vadd_f32(v2966, v2905); - float32x2_t v2985 = vadd_f32(v2984, v2791); + float32x2_t v2992 = vadd_f32(v2973, v2991); + float32x2_t v2993 = vsub_f32(v2973, v2991); + 
float32x2_t v3003 = vadd_f32(v2968, v3002); + float32x2_t v3018 = vsub_f32(v3017, v2986); float32x2_t v2457 = vsub_f32(v2443, v2456); float32x2_t v2470 = vsub_f32(v2469, v2465); float32x2_t v2474 = vmul_f32(v2443, v3619); @@ -21970,10 +17001,11 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2913 = vsub_f32(v2899, v2912); float32x2_t v2926 = vsub_f32(v2925, v2921); float32x2_t v2930 = vmul_f32(v2899, v3619); - float32x2_t v2968 = vsub_f32(v2943, v2967); - float32x2_t v2972 = vmul_f32(v2943, v3619); - float32x2_t v2986 = vsub_f32(v2955, v2985); - float32x2_t v2990 = vmul_f32(v2955, v3619); + float32x2_t v2997 = vmul_f32(v2992, v3552); + float32x2_t v3007 = vmul_f32(v2993, v3562); + float32x2_t v3019 = vadd_f32(v2449, v2992); + float32x2_t v3030 = vrev64_f32(v3003); + float32x2_t v3043 = vrev64_f32(v3018); float32x2_t v3219 = vrev64_f32(v2579); float32x2_t v3231 = vrev64_f32(v2693); float32x2_t v3243 = vrev64_f32(v2921); @@ -21983,10 +17015,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v2703 = vsub_f32(v2702, v2685); float32x2_t v2817 = vsub_f32(v2816, v2799); float32x2_t v2931 = vsub_f32(v2930, v2913); - float32x2_t v2973 = vsub_f32(v2972, v2968); - float32x2_t v2991 = vsub_f32(v2990, v2986); - float32x2_t v3002 = vmul_f32(v2986, v3572); - float32x2_t v3017 = vmul_f32(v2968, v3572); + float32x2_t v2998 = vsub_f32(v2449, v2997); + v6[0] = v3019; + float32x2_t v3031 = vmul_f32(v3030, v3598); + float32x2_t v3044 = vmul_f32(v3043, v3598); float32x2_t v3080 = vrev64_f32(v2571); float32x2_t v3092 = vrev64_f32(v2685); float32x2_t v3104 = vrev64_f32(v2913); @@ -21999,10 +17031,8 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3370 = vrev64_f32(v2698); float32x2_t v3382 = vrev64_f32(v2926); float32x2_t v3400 = vrev64_f32(v2812); - float32x2_t v2992 = vadd_f32(v2973, v2991); - float32x2_t v2993 = vsub_f32(v2973, v2991); - float32x2_t 
v3003 = vadd_f32(v2968, v3002); - float32x2_t v3018 = vsub_f32(v3017, v2986); + float32x2_t v3008 = vsub_f32(v2998, v3007); + float32x2_t v3012 = vmul_f32(v2998, v3619); float32x2_t v3081 = vmul_f32(v3080, v3079); float32x2_t v3093 = vmul_f32(v3092, v3218); float32x2_t v3105 = vmul_f32(v3104, v3496); @@ -22019,11 +17049,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3509 = vrev64_f32(v2703); float32x2_t v3521 = vrev64_f32(v2931); float32x2_t v3539 = vrev64_f32(v2817); - float32x2_t v2997 = vmul_f32(v2992, v3552); - float32x2_t v3007 = vmul_f32(v2993, v3562); - float32x2_t v3019 = vadd_f32(v2449, v2992); - float32x2_t v3030 = vrev64_f32(v3003); - float32x2_t v3043 = vrev64_f32(v3018); + float32x2_t v3013 = vsub_f32(v3012, v3008); + float32x2_t v3045 = vsub_f32(v3008, v3044); + float32x2_t v3054 = vmul_f32(v3008, v3619); float32x2_t v3082 = vfma_f32(v3081, v2571, v3073); float32x2_t v3094 = vfma_f32(v3093, v2685, v3212); float32x2_t v3106 = vfma_f32(v3105, v2913, v3490); @@ -22040,10 +17068,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3510 = vmul_f32(v3509, v3508); float32x2_t v3522 = vmul_f32(v3521, v3520); float32x2_t v3540 = vmul_f32(v3539, v3538); - float32x2_t v2998 = vsub_f32(v2449, v2997); - v6[0] = v3019; - float32x2_t v3031 = vmul_f32(v3030, v3598); - float32x2_t v3044 = vmul_f32(v3043, v3598); + float32x2_t v3032 = vsub_f32(v3013, v3031); + v6[ostride * 10] = v3045; + float32x2_t v3055 = vsub_f32(v3054, v3045); + float32x2_t v3064 = vmul_f32(v3013, v3619); float32x2_t v3107 = vsub_f32(v3082, v3106); float32x2_t v3111 = vmul_f32(v3082, v3619); float32x2_t v3125 = vsub_f32(v3094, v3124); @@ -22060,8 +17088,9 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3511 = vfma_f32(v3510, v2703, v3502); float32x2_t v3523 = vfma_f32(v3522, v2931, v3514); float32x2_t v3541 = vfma_f32(v3540, v2817, v3532); - float32x2_t v3008 
= vsub_f32(v2998, v3007); - float32x2_t v3012 = vmul_f32(v2998, v3619); + v6[ostride * 5] = v3032; + v6[ostride * 15] = v3055; + float32x2_t v3065 = vsub_f32(v3064, v3032); float32x2_t v3112 = vsub_f32(v3111, v3107); float32x2_t v3130 = vsub_f32(v3129, v3125); float32x2_t v3141 = vmul_f32(v3125, v3572); @@ -22078,9 +17107,7 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3528 = vmul_f32(v3499, v3619); float32x2_t v3542 = vsub_f32(v3511, v3541); float32x2_t v3546 = vmul_f32(v3511, v3619); - float32x2_t v3013 = vsub_f32(v3012, v3008); - float32x2_t v3045 = vsub_f32(v3008, v3044); - float32x2_t v3054 = vmul_f32(v3008, v3619); + v6[ostride * 20] = v3065; float32x2_t v3131 = vadd_f32(v3112, v3130); float32x2_t v3132 = vsub_f32(v3112, v3130); float32x2_t v3142 = vadd_f32(v3107, v3141); @@ -22098,10 +17125,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3547 = vsub_f32(v3546, v3542); float32x2_t v3558 = vmul_f32(v3542, v3572); float32x2_t v3573 = vmul_f32(v3524, v3572); - float32x2_t v3032 = vsub_f32(v3013, v3031); - v6[ostride * 10] = v3045; - float32x2_t v3055 = vsub_f32(v3054, v3045); - float32x2_t v3064 = vmul_f32(v3013, v3619); float32x2_t v3136 = vmul_f32(v3131, v3552); float32x2_t v3146 = vmul_f32(v3132, v3562); float32x2_t v3158 = vadd_f32(v2457, v3131); @@ -22120,9 +17143,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3549 = vsub_f32(v3529, v3547); float32x2_t v3559 = vadd_f32(v3524, v3558); float32x2_t v3574 = vsub_f32(v3573, v3542); - v6[ostride * 5] = v3032; - v6[ostride * 15] = v3055; - float32x2_t v3065 = vsub_f32(v3064, v3032); float32x2_t v3137 = vsub_f32(v2457, v3136); v6[ostride] = v3158; float32x2_t v3170 = vmul_f32(v3169, v3598); @@ -22138,7 +17158,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t v3575 = vadd_f32(v2475, v3548); float32x2_t v3586 = 
vrev64_f32(v3559); float32x2_t v3599 = vrev64_f32(v3574); - v6[ostride * 20] = v3065; float32x2_t v3147 = vsub_f32(v3137, v3146); float32x2_t v3151 = vmul_f32(v3137, v3619); float32x2_t v3291 = vsub_f32(v3290, v3286); @@ -22237,59 +17256,33 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float v1641 = 2.5000000000000000e-01F; float v1653 = 5.5901699437494745e-01F; float v1665 = 6.1803398874989490e-01F; - float v1693 = 0.0000000000000000e+00F; float v1694 = -9.5105651629515353e-01F; float v1722 = 2.0000000000000000e+00F; const float32x2_t *v1776 = &v5[v0]; float32x2_t *v2141 = &v6[v2]; int64_t v19 = v0 * 5; - float32x2_t v26 = v7[4]; int64_t v30 = v0 * 10; - float32x2_t v37 = v7[9]; int64_t v41 = v0 * 15; - float32x2_t v48 = v7[14]; int64_t v52 = v0 * 20; - float32x2_t v59 = v7[19]; - float32x2_t v70 = v7[0]; int64_t v74 = v0 * 6; - float32x2_t v81 = v7[5]; int64_t v85 = v0 * 11; - float32x2_t v92 = v7[10]; int64_t v96 = v0 * 16; - float32x2_t v103 = v7[15]; int64_t v107 = v0 * 21; - float32x2_t v114 = v7[20]; int64_t v118 = v0 * 2; - float32x2_t v125 = v7[1]; int64_t v129 = v0 * 7; - float32x2_t v136 = v7[6]; int64_t v140 = v0 * 12; - float32x2_t v147 = v7[11]; int64_t v151 = v0 * 17; - float32x2_t v158 = v7[16]; int64_t v162 = v0 * 22; - float32x2_t v169 = v7[21]; int64_t v173 = v0 * 3; - float32x2_t v180 = v7[2]; int64_t v184 = v0 * 8; - float32x2_t v191 = v7[7]; int64_t v195 = v0 * 13; - float32x2_t v202 = v7[12]; int64_t v206 = v0 * 18; - float32x2_t v213 = v7[17]; int64_t v217 = v0 * 23; - float32x2_t v224 = v7[22]; int64_t v228 = v0 * 4; - float32x2_t v235 = v7[3]; int64_t v239 = v0 * 9; - float32x2_t v246 = v7[8]; int64_t v250 = v0 * 14; - float32x2_t v257 = v7[13]; int64_t v261 = v0 * 19; - float32x2_t v268 = v7[18]; int64_t v272 = v0 * 24; - float32x2_t v279 = v7[23]; - float v979 = v4 * v1693; int64_t v1039 = v2 * 5; int64_t v1054 = v2 * 10; int64_t v1067 = v2 * 15; @@ -22324,6 +17317,7 @@ void 
armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, int64_t v1715 = v2 * 19; int64_t v1728 = v2 * 24; const float32x2_t *v1957 = &v5[0]; + svfloat32_t v2063 = svdup_n_f32(0); float32x2_t *v2077 = &v6[0]; svfloat32_t v2120 = svdup_n_f32(v1087); svfloat32_t v2184 = svdup_n_f32(v1249); @@ -22337,56 +17331,56 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v2324 = svdup_n_f32(v1653); svfloat32_t v2326 = svdup_n_f32(v1665); svfloat32_t v2366 = svdup_n_f32(v1722); - svfloat32_t v2385 = svreinterpret_f32_f64( + svfloat32_t v2409 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1776)[0])); - svfloat32_t v27; - asm("mov %0.d, %d1" : "=w"(v27) : "w"(v26)); - svfloat32_t v38; - asm("mov %0.d, %d1" : "=w"(v38) : "w"(v37)); - svfloat32_t v49; - asm("mov %0.d, %d1" : "=w"(v49) : "w"(v48)); - svfloat32_t v60; - asm("mov %0.d, %d1" : "=w"(v60) : "w"(v59)); - svfloat32_t v71; - asm("mov %0.d, %d1" : "=w"(v71) : "w"(v70)); - svfloat32_t v82; - asm("mov %0.d, %d1" : "=w"(v82) : "w"(v81)); - svfloat32_t v93; - asm("mov %0.d, %d1" : "=w"(v93) : "w"(v92)); - svfloat32_t v104; - asm("mov %0.d, %d1" : "=w"(v104) : "w"(v103)); - svfloat32_t v115; - asm("mov %0.d, %d1" : "=w"(v115) : "w"(v114)); - svfloat32_t v126; - asm("mov %0.d, %d1" : "=w"(v126) : "w"(v125)); - svfloat32_t v137; - asm("mov %0.d, %d1" : "=w"(v137) : "w"(v136)); - svfloat32_t v148; - asm("mov %0.d, %d1" : "=w"(v148) : "w"(v147)); - svfloat32_t v159; - asm("mov %0.d, %d1" : "=w"(v159) : "w"(v158)); - svfloat32_t v170; - asm("mov %0.d, %d1" : "=w"(v170) : "w"(v169)); - svfloat32_t v181; - asm("mov %0.d, %d1" : "=w"(v181) : "w"(v180)); - svfloat32_t v192; - asm("mov %0.d, %d1" : "=w"(v192) : "w"(v191)); - svfloat32_t v203; - asm("mov %0.d, %d1" : "=w"(v203) : "w"(v202)); - svfloat32_t v214; - asm("mov %0.d, %d1" : "=w"(v214) : "w"(v213)); - svfloat32_t v225; - asm("mov %0.d, %d1" : "=w"(v225) : "w"(v224)); - svfloat32_t v236; - asm("mov %0.d, 
%d1" : "=w"(v236) : "w"(v235)); - svfloat32_t v247; - asm("mov %0.d, %d1" : "=w"(v247) : "w"(v246)); - svfloat32_t v258; - asm("mov %0.d, %d1" : "=w"(v258) : "w"(v257)); - svfloat32_t v269; - asm("mov %0.d, %d1" : "=w"(v269) : "w"(v268)); - svfloat32_t v280; - asm("mov %0.d, %d1" : "=w"(v280) : "w"(v279)); + svfloat32_t v27 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v38 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v49 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v60 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[19])); + svfloat32_t v71 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v82 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v93 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v104 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v115 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[20])); + svfloat32_t v126 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v137 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v148 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[11])); + svfloat32_t v159 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v170 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[21])); + svfloat32_t v181 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v192 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v203 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v214 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[17])); + svfloat32_t v225 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[22])); + svfloat32_t v236 = + 
svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v247 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v258 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v269 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[18])); + svfloat32_t v280 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[23])); const float32x2_t *v1740 = &v5[v19]; const float32x2_t *v1749 = &v5[v30]; const float32x2_t *v1758 = &v5[v41]; @@ -22410,7 +17404,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v1929 = &v5[v250]; const float32x2_t *v1938 = &v5[v261]; const float32x2_t *v1947 = &v5[v272]; - svfloat32_t v2063 = svdup_n_f32(v979); float32x2_t *v2087 = &v6[v1039]; float32x2_t *v2097 = &v6[v1054]; float32x2_t *v2107 = &v6[v1067]; @@ -22444,173 +17437,149 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, float32x2_t *v2353 = &v6[v1702]; float32x2_t *v2363 = &v6[v1715]; float32x2_t *v2373 = &v6[v1728]; - svfloat32_t v2425 = svreinterpret_f32_f64( + svfloat32_t v2449 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1957)[0])); - svfloat32_t zero72; - asm volatile("mov %0.s, #0" : "=w"(zero72)); + svfloat32_t zero72 = svdup_n_f32(0); svfloat32_t v72 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero72, v2385, v71, 0), - v2385, v71, 90); - svfloat32_t v2377 = svreinterpret_f32_f64( + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero72, v2409, v71, 0), + v2409, v71, 90); + svfloat32_t v2401 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1740)[0])); - svfloat32_t v2379 = svreinterpret_f32_f64( + svfloat32_t v2403 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1749)[0])); - svfloat32_t v2381 = svreinterpret_f32_f64( + svfloat32_t v2405 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1758)[0])); - svfloat32_t v2383 = 
svreinterpret_f32_f64( + svfloat32_t v2407 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1767)[0])); - svfloat32_t v2387 = svreinterpret_f32_f64( + svfloat32_t v2411 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1785)[0])); - svfloat32_t v2389 = svreinterpret_f32_f64( + svfloat32_t v2413 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1794)[0])); - svfloat32_t v2391 = svreinterpret_f32_f64( + svfloat32_t v2415 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1803)[0])); - svfloat32_t v2393 = svreinterpret_f32_f64( + svfloat32_t v2417 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1812)[0])); - svfloat32_t v2395 = svreinterpret_f32_f64( + svfloat32_t v2419 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1821)[0])); - svfloat32_t v2397 = svreinterpret_f32_f64( + svfloat32_t v2421 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1830)[0])); - svfloat32_t v2399 = svreinterpret_f32_f64( + svfloat32_t v2423 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1839)[0])); - svfloat32_t v2401 = svreinterpret_f32_f64( + svfloat32_t v2425 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1848)[0])); - svfloat32_t v2403 = svreinterpret_f32_f64( + svfloat32_t v2427 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1857)[0])); - svfloat32_t v2405 = svreinterpret_f32_f64( + svfloat32_t v2429 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1866)[0])); - svfloat32_t v2407 = svreinterpret_f32_f64( + svfloat32_t v2431 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1875)[0])); - svfloat32_t v2409 = svreinterpret_f32_f64( + svfloat32_t v2433 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1884)[0])); - svfloat32_t v2411 = svreinterpret_f32_f64( + svfloat32_t v2435 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1893)[0])); - svfloat32_t v2413 = 
svreinterpret_f32_f64( + svfloat32_t v2437 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1902)[0])); - svfloat32_t v2415 = svreinterpret_f32_f64( + svfloat32_t v2439 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1911)[0])); - svfloat32_t v2417 = svreinterpret_f32_f64( + svfloat32_t v2441 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1920)[0])); - svfloat32_t v2419 = svreinterpret_f32_f64( + svfloat32_t v2443 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1929)[0])); - svfloat32_t v2421 = svreinterpret_f32_f64( + svfloat32_t v2445 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1938)[0])); - svfloat32_t v2423 = svreinterpret_f32_f64( + svfloat32_t v2447 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1947)[0])); - svfloat32_t zero28; - asm volatile("mov %0.s, #0" : "=w"(zero28)); + svfloat32_t zero28 = svdup_n_f32(0); svfloat32_t v28 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero28, v2377, v27, 0), - v2377, v27, 90); - svfloat32_t zero39; - asm volatile("mov %0.s, #0" : "=w"(zero39)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero28, v2401, v27, 0), + v2401, v27, 90); + svfloat32_t zero39 = svdup_n_f32(0); svfloat32_t v39 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero39, v2379, v38, 0), - v2379, v38, 90); - svfloat32_t zero50; - asm volatile("mov %0.s, #0" : "=w"(zero50)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero39, v2403, v38, 0), + v2403, v38, 90); + svfloat32_t zero50 = svdup_n_f32(0); svfloat32_t v50 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero50, v2381, v49, 0), - v2381, v49, 90); - svfloat32_t zero61; - asm volatile("mov %0.s, #0" : "=w"(zero61)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero50, v2405, v49, 0), + v2405, v49, 90); + svfloat32_t zero61 = svdup_n_f32(0); svfloat32_t v61 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero61, v2383, v60, 0), - v2383, v60, 90); - svfloat32_t 
zero83; - asm volatile("mov %0.s, #0" : "=w"(zero83)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero61, v2407, v60, 0), + v2407, v60, 90); + svfloat32_t zero83 = svdup_n_f32(0); svfloat32_t v83 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero83, v2387, v82, 0), - v2387, v82, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero83, v2411, v82, 0), + v2411, v82, 90); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = - svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v2389, v93, 0), - v2389, v93, 90); - svfloat32_t zero105; - asm volatile("mov %0.s, #0" : "=w"(zero105)); + svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v2413, v93, 0), + v2413, v93, 90); + svfloat32_t zero105 = svdup_n_f32(0); svfloat32_t v105 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero105, v2391, v104, 0), v2391, + pred_full, svcmla_f32_x(pred_full, zero105, v2415, v104, 0), v2415, v104, 90); - svfloat32_t zero116; - asm volatile("mov %0.s, #0" : "=w"(zero116)); + svfloat32_t zero116 = svdup_n_f32(0); svfloat32_t v116 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero116, v2393, v115, 0), v2393, + pred_full, svcmla_f32_x(pred_full, zero116, v2417, v115, 0), v2417, v115, 90); - svfloat32_t zero127; - asm volatile("mov %0.s, #0" : "=w"(zero127)); + svfloat32_t zero127 = svdup_n_f32(0); svfloat32_t v127 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero127, v2395, v126, 0), v2395, + pred_full, svcmla_f32_x(pred_full, zero127, v2419, v126, 0), v2419, v126, 90); - svfloat32_t zero138; - asm volatile("mov %0.s, #0" : "=w"(zero138)); + svfloat32_t zero138 = svdup_n_f32(0); svfloat32_t v138 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero138, v2397, v137, 0), v2397, + pred_full, svcmla_f32_x(pred_full, zero138, v2421, v137, 0), v2421, v137, 90); - svfloat32_t zero149; - asm volatile("mov %0.s, #0" : "=w"(zero149)); + svfloat32_t zero149 = svdup_n_f32(0); 
svfloat32_t v149 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero149, v2399, v148, 0), v2399, + pred_full, svcmla_f32_x(pred_full, zero149, v2423, v148, 0), v2423, v148, 90); - svfloat32_t zero160; - asm volatile("mov %0.s, #0" : "=w"(zero160)); + svfloat32_t zero160 = svdup_n_f32(0); svfloat32_t v160 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero160, v2401, v159, 0), v2401, + pred_full, svcmla_f32_x(pred_full, zero160, v2425, v159, 0), v2425, v159, 90); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero171, v2403, v170, 0), v2403, + pred_full, svcmla_f32_x(pred_full, zero171, v2427, v170, 0), v2427, v170, 90); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); + svfloat32_t zero182 = svdup_n_f32(0); svfloat32_t v182 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero182, v2405, v181, 0), v2405, + pred_full, svcmla_f32_x(pred_full, zero182, v2429, v181, 0), v2429, v181, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); + svfloat32_t zero193 = svdup_n_f32(0); svfloat32_t v193 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero193, v2407, v192, 0), v2407, + pred_full, svcmla_f32_x(pred_full, zero193, v2431, v192, 0), v2431, v192, 90); - svfloat32_t zero204; - asm volatile("mov %0.s, #0" : "=w"(zero204)); + svfloat32_t zero204 = svdup_n_f32(0); svfloat32_t v204 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero204, v2409, v203, 0), v2409, + pred_full, svcmla_f32_x(pred_full, zero204, v2433, v203, 0), v2433, v203, 90); - svfloat32_t zero215; - asm volatile("mov %0.s, #0" : "=w"(zero215)); + svfloat32_t zero215 = svdup_n_f32(0); svfloat32_t v215 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero215, v2411, v214, 0), v2411, + pred_full, svcmla_f32_x(pred_full, zero215, v2435, v214, 0), v2435, v214, 90); - svfloat32_t zero226; - asm volatile("mov 
%0.s, #0" : "=w"(zero226)); + svfloat32_t zero226 = svdup_n_f32(0); svfloat32_t v226 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero226, v2413, v225, 0), v2413, + pred_full, svcmla_f32_x(pred_full, zero226, v2437, v225, 0), v2437, v225, 90); - svfloat32_t zero237; - asm volatile("mov %0.s, #0" : "=w"(zero237)); + svfloat32_t zero237 = svdup_n_f32(0); svfloat32_t v237 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero237, v2415, v236, 0), v2415, + pred_full, svcmla_f32_x(pred_full, zero237, v2439, v236, 0), v2439, v236, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero248, v2417, v247, 0), v2417, + pred_full, svcmla_f32_x(pred_full, zero248, v2441, v247, 0), v2441, v247, 90); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero259, v2419, v258, 0), v2419, + pred_full, svcmla_f32_x(pred_full, zero259, v2443, v258, 0), v2443, v258, 90); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero270, v2421, v269, 0), v2421, + pred_full, svcmla_f32_x(pred_full, zero270, v2445, v269, 0), v2445, v269, 90); - svfloat32_t zero281; - asm volatile("mov %0.s, #0" : "=w"(zero281)); + svfloat32_t zero281 = svdup_n_f32(0); svfloat32_t v281 = svcmla_f32_x( - pred_full, svcmla_f32_x(pred_full, zero281, v2423, v280, 0), v2423, + pred_full, svcmla_f32_x(pred_full, zero281, v2447, v280, 0), v2447, v280, 90); svfloat32_t v301 = svcmla_f32_x(pred_full, v28, v2063, v28, 90); svfloat32_t v314 = svcmla_f32_x(pred_full, v39, v2063, v39, 90); @@ -22632,26 +17601,16 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v822 = 
svcmla_f32_x(pred_full, v259, v2063, v259, 90); svfloat32_t v835 = svcmla_f32_x(pred_full, v281, v2063, v281, 90); svfloat32_t v855 = svcmla_f32_x(pred_full, v270, v2063, v270, 90); - svfloat32_t v328; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v301), "w"(v327)); - svfloat32_t v348; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v314), "w"(v347)); - svfloat32_t v455; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v428), "w"(v454)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v441), "w"(v474)); - svfloat32_t v582; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v582) : "w"(v555), "w"(v581)); - svfloat32_t v602; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v568), "w"(v601)); - svfloat32_t v709; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v682), "w"(v708)); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v695), "w"(v728)); - svfloat32_t v836; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v836) : "w"(v809), "w"(v835)); - svfloat32_t v856; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v822), "w"(v855)); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v301, v327); + svfloat32_t v348 = svsub_f32_x(svptrue_b32(), v314, v347); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v428, v454); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v441, v474); + svfloat32_t v582 = svsub_f32_x(svptrue_b32(), v555, v581); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v568, v601); + svfloat32_t v709 = svsub_f32_x(svptrue_b32(), v682, v708); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v695, v728); + svfloat32_t v836 = svsub_f32_x(svptrue_b32(), v809, v835); + svfloat32_t v856 = svsub_f32_x(svptrue_b32(), v822, v855); svfloat32_t v334 = svnmls_f32_x(pred_full, v328, v301, v2366); svfloat32_t v354 = svnmls_f32_x(pred_full, v348, v314, v2366); svfloat32_t v461 = svnmls_f32_x(pred_full, v455, v428, v2366); @@ -22662,77 +17621,52 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v735 = 
svnmls_f32_x(pred_full, v729, v695, v2366); svfloat32_t v842 = svnmls_f32_x(pred_full, v836, v809, v2366); svfloat32_t v862 = svnmls_f32_x(pred_full, v856, v822, v2366); - svfloat32_t v355; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v334), "w"(v354)); - svfloat32_t v356; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v334), "w"(v354)); + svfloat32_t v355 = svadd_f32_x(svptrue_b32(), v334, v354); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v334, v354); svfloat32_t v368 = svmla_f32_x(pred_full, v328, v348, v2326); svfloat32_t v386 = svnmls_f32_x(pred_full, v348, v328, v2326); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v461), "w"(v481)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v461), "w"(v481)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v461, v481); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v461, v481); svfloat32_t v495 = svmla_f32_x(pred_full, v455, v475, v2326); svfloat32_t v513 = svnmls_f32_x(pred_full, v475, v455, v2326); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v588), "w"(v608)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v588), "w"(v608)); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v588, v608); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v588, v608); svfloat32_t v622 = svmla_f32_x(pred_full, v582, v602, v2326); svfloat32_t v640 = svnmls_f32_x(pred_full, v602, v582, v2326); - svfloat32_t v736; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v736) : "w"(v715), "w"(v735)); - svfloat32_t v737; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v737) : "w"(v715), "w"(v735)); + svfloat32_t v736 = svadd_f32_x(svptrue_b32(), v715, v735); + svfloat32_t v737 = svsub_f32_x(svptrue_b32(), v715, v735); svfloat32_t v749 = svmla_f32_x(pred_full, v709, v729, v2326); svfloat32_t v767 = svnmls_f32_x(pred_full, v729, v709, v2326); - svfloat32_t v863; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v863) : "w"(v842), "w"(v862)); - svfloat32_t v864; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v864) : "w"(v842), "w"(v862)); + svfloat32_t v863 = svadd_f32_x(svptrue_b32(), v842, v862); + svfloat32_t v864 = svsub_f32_x(svptrue_b32(), v842, v862); svfloat32_t v876 = svmla_f32_x(pred_full, v836, v856, v2326); svfloat32_t v894 = svnmls_f32_x(pred_full, v856, v836, v2326); - svfloat32_t v387; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v2425), "w"(v355)); - svfloat32_t zero394; - asm volatile("mov %0.s, #0" : "=w"(zero394)); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v2449, v355); + svfloat32_t zero394 = svdup_n_f32(0); svfloat32_t v394 = svcmla_f32_x(pred_full, zero394, v2346, v368, 90); - svfloat32_t zero402; - asm volatile("mov %0.s, #0" : "=w"(zero402)); + svfloat32_t zero402 = svdup_n_f32(0); svfloat32_t v402 = svcmla_f32_x(pred_full, zero402, v2346, v386, 90); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v72), "w"(v482)); - svfloat32_t zero521; - asm volatile("mov %0.s, #0" : "=w"(zero521)); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v72, v482); + svfloat32_t zero521 = svdup_n_f32(0); svfloat32_t v521 = svcmla_f32_x(pred_full, zero521, v2346, v495, 90); - svfloat32_t zero529; - asm volatile("mov %0.s, #0" : "=w"(zero529)); + svfloat32_t zero529 = svdup_n_f32(0); svfloat32_t v529 = svcmla_f32_x(pred_full, zero529, v2346, v513, 90); - svfloat32_t v641; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v641) : "w"(v127), "w"(v609)); - svfloat32_t zero648; - asm volatile("mov %0.s, #0" : "=w"(zero648)); + svfloat32_t v641 = svadd_f32_x(svptrue_b32(), v127, v609); + svfloat32_t zero648 = svdup_n_f32(0); svfloat32_t v648 = svcmla_f32_x(pred_full, zero648, v2346, v622, 90); - svfloat32_t zero656; - asm volatile("mov %0.s, #0" : "=w"(zero656)); + svfloat32_t zero656 = svdup_n_f32(0); svfloat32_t v656 = svcmla_f32_x(pred_full, zero656, v2346, v640, 90); - svfloat32_t v768; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v182), "w"(v736)); - svfloat32_t zero775; - asm volatile("mov %0.s, #0" : "=w"(zero775)); + svfloat32_t v768 = 
svadd_f32_x(svptrue_b32(), v182, v736); + svfloat32_t zero775 = svdup_n_f32(0); svfloat32_t v775 = svcmla_f32_x(pred_full, zero775, v2346, v749, 90); - svfloat32_t zero783; - asm volatile("mov %0.s, #0" : "=w"(zero783)); + svfloat32_t zero783 = svdup_n_f32(0); svfloat32_t v783 = svcmla_f32_x(pred_full, zero783, v2346, v767, 90); - svfloat32_t v895; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v237), "w"(v863)); - svfloat32_t zero902; - asm volatile("mov %0.s, #0" : "=w"(zero902)); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v237, v863); + svfloat32_t zero902 = svdup_n_f32(0); svfloat32_t v902 = svcmla_f32_x(pred_full, zero902, v2346, v876, 90); - svfloat32_t zero910; - asm volatile("mov %0.s, #0" : "=w"(zero910)); + svfloat32_t zero910 = svdup_n_f32(0); svfloat32_t v910 = svcmla_f32_x(pred_full, zero910, v2346, v894, 90); - svfloat32_t v362 = svmls_f32_x(pred_full, v2425, v355, v2322); + svfloat32_t v362 = svmls_f32_x(pred_full, v2449, v355, v2322); svfloat32_t v489 = svmls_f32_x(pred_full, v72, v482, v2322); svfloat32_t v616 = svmls_f32_x(pred_full, v127, v609, v2322); svfloat32_t v743 = svmls_f32_x(pred_full, v182, v736, v2322); @@ -22747,119 +17681,80 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v962 = svcmla_f32_x(pred_full, v895, v2063, v895, 90); svfloat32_t v982 = svcmla_f32_x(pred_full, v768, v2063, v768, 90); svfloat32_t v380 = svnmls_f32_x(pred_full, v374, v362, v2366); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v374), "w"(v402)); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v374, v402); svfloat32_t v507 = svnmls_f32_x(pred_full, v501, v489, v2366); - svfloat32_t v530; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v501), "w"(v529)); + svfloat32_t v530 = svsub_f32_x(svptrue_b32(), v501, v529); svfloat32_t v634 = svnmls_f32_x(pred_full, v628, v616, v2366); - svfloat32_t v657; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v657) : "w"(v628), "w"(v656)); + svfloat32_t v657 = 
svsub_f32_x(svptrue_b32(), v628, v656); svfloat32_t v761 = svnmls_f32_x(pred_full, v755, v743, v2366); - svfloat32_t v784; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v784) : "w"(v755), "w"(v783)); + svfloat32_t v784 = svsub_f32_x(svptrue_b32(), v755, v783); svfloat32_t v888 = svnmls_f32_x(pred_full, v882, v870, v2366); - svfloat32_t v911; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v911) : "w"(v882), "w"(v910)); - svfloat32_t v963; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v963) : "w"(v936), "w"(v962)); - svfloat32_t v983; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v983) : "w"(v949), "w"(v982)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v380), "w"(v394)); + svfloat32_t v911 = svsub_f32_x(svptrue_b32(), v882, v910); + svfloat32_t v963 = svsub_f32_x(svptrue_b32(), v936, v962); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v949, v982); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v380, v394); svfloat32_t v409 = svnmls_f32_x(pred_full, v403, v374, v2366); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v507), "w"(v521)); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v507, v521); svfloat32_t v536 = svnmls_f32_x(pred_full, v530, v501, v2366); - svfloat32_t v649; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v649) : "w"(v634), "w"(v648)); + svfloat32_t v649 = svsub_f32_x(svptrue_b32(), v634, v648); svfloat32_t v663 = svnmls_f32_x(pred_full, v657, v628, v2366); - svfloat32_t v776; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v761), "w"(v775)); + svfloat32_t v776 = svsub_f32_x(svptrue_b32(), v761, v775); svfloat32_t v790 = svnmls_f32_x(pred_full, v784, v755, v2366); - svfloat32_t v903; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v903) : "w"(v888), "w"(v902)); + svfloat32_t v903 = svsub_f32_x(svptrue_b32(), v888, v902); svfloat32_t v917 = svnmls_f32_x(pred_full, v911, v882, v2366); svfloat32_t v969 = svnmls_f32_x(pred_full, v963, v936, v2366); svfloat32_t v989 = svnmls_f32_x(pred_full, v983, v949, v2366); - svfloat32_t v1252; - asm("fmul %0.s, %1.s, %2.s" 
: "=w"(v1252) : "w"(v530), "w"(v2184)); - svfloat32_t v1265; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1265) : "w"(v657), "w"(v2312)); - svfloat32_t v1278; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1278) : "w"(v911), "w"(v2314)); - svfloat32_t v1298; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1298) : "w"(v784), "w"(v2250)); + svfloat32_t v1252 = svmul_f32_x(svptrue_b32(), v530, v2184); + svfloat32_t v1265 = svmul_f32_x(svptrue_b32(), v657, v2312); + svfloat32_t v1278 = svmul_f32_x(svptrue_b32(), v911, v2314); + svfloat32_t v1298 = svmul_f32_x(svptrue_b32(), v784, v2250); svfloat32_t v415 = svnmls_f32_x(pred_full, v395, v380, v2366); svfloat32_t v542 = svnmls_f32_x(pred_full, v522, v507, v2366); svfloat32_t v669 = svnmls_f32_x(pred_full, v649, v634, v2366); svfloat32_t v796 = svnmls_f32_x(pred_full, v776, v761, v2366); svfloat32_t v923 = svnmls_f32_x(pred_full, v903, v888, v2366); - svfloat32_t v990; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v990) : "w"(v969), "w"(v989)); - svfloat32_t v991; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v991) : "w"(v969), "w"(v989)); + svfloat32_t v990 = svadd_f32_x(svptrue_b32(), v969, v989); + svfloat32_t v991 = svsub_f32_x(svptrue_b32(), v969, v989); svfloat32_t v1003 = svmla_f32_x(pred_full, v963, v983, v2326); svfloat32_t v1021 = svnmls_f32_x(pred_full, v983, v963, v2326); - svfloat32_t v1090; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1090) : "w"(v522), "w"(v2120)); - svfloat32_t v1103; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1103) : "w"(v649), "w"(v2184)); - svfloat32_t v1116; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1116) : "w"(v903), "w"(v2312)); - svfloat32_t v1136; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1136) : "w"(v776), "w"(v2248)); + svfloat32_t v1090 = svmul_f32_x(svptrue_b32(), v522, v2120); + svfloat32_t v1103 = svmul_f32_x(svptrue_b32(), v649, v2184); + svfloat32_t v1116 = svmul_f32_x(svptrue_b32(), v903, v2312); + svfloat32_t v1136 = svmul_f32_x(svptrue_b32(), v776, v2248); svfloat32_t v1260 = svcmla_f32_x(pred_full, v1252, v2185, v530, 90); 
svfloat32_t v1273 = svcmla_f32_x(pred_full, v1265, v2313, v657, 90); svfloat32_t v1286 = svcmla_f32_x(pred_full, v1278, v2315, v911, 90); svfloat32_t v1306 = svcmla_f32_x(pred_full, v1298, v2251, v784, 90); - svfloat32_t v1414; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1414) : "w"(v536), "w"(v2248)); - svfloat32_t v1427; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1427) : "w"(v663), "w"(v2250)); - svfloat32_t v1440; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1440) : "w"(v917), "w"(v2319)); - svfloat32_t v1460; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1460) : "w"(v790), "w"(v2316)); - svfloat32_t v1022; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1022) : "w"(v387), "w"(v990)); - svfloat32_t zero1036; - asm volatile("mov %0.s, #0" : "=w"(zero1036)); + svfloat32_t v1414 = svmul_f32_x(svptrue_b32(), v536, v2248); + svfloat32_t v1427 = svmul_f32_x(svptrue_b32(), v663, v2250); + svfloat32_t v1440 = svmul_f32_x(svptrue_b32(), v917, v2319); + svfloat32_t v1460 = svmul_f32_x(svptrue_b32(), v790, v2316); + svfloat32_t v1022 = svadd_f32_x(svptrue_b32(), v387, v990); + svfloat32_t zero1036 = svdup_n_f32(0); svfloat32_t v1036 = svcmla_f32_x(pred_full, zero1036, v2346, v1003, 90); - svfloat32_t zero1051; - asm volatile("mov %0.s, #0" : "=w"(zero1051)); + svfloat32_t zero1051 = svdup_n_f32(0); svfloat32_t v1051 = svcmla_f32_x(pred_full, zero1051, v2346, v1021, 90); svfloat32_t v1098 = svcmla_f32_x(pred_full, v1090, v2121, v522, 90); svfloat32_t v1111 = svcmla_f32_x(pred_full, v1103, v2185, v649, 90); svfloat32_t v1124 = svcmla_f32_x(pred_full, v1116, v2313, v903, 90); svfloat32_t v1144 = svcmla_f32_x(pred_full, v1136, v2249, v776, 90); - svfloat32_t v1287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1287) : "w"(v1260), "w"(v1286)); - svfloat32_t v1307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1307) : "w"(v1273), "w"(v1306)); + svfloat32_t v1287 = svsub_f32_x(svptrue_b32(), v1260, v1286); + svfloat32_t v1307 = svsub_f32_x(svptrue_b32(), v1273, v1306); svfloat32_t v1422 = svcmla_f32_x(pred_full, v1414, v2249, 
v536, 90); svfloat32_t v1435 = svcmla_f32_x(pred_full, v1427, v2251, v663, 90); svfloat32_t v1448 = svcmla_f32_x(pred_full, v1440, v2320, v917, 90); svfloat32_t v1468 = svcmla_f32_x(pred_full, v1460, v2256, v790, 90); - svfloat32_t v1576; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1576) : "w"(v542), "w"(v2312)); - svfloat32_t v1589; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1589) : "w"(v669), "w"(v2314)); - svfloat32_t v1602; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1602) : "w"(v923), "w"(v2316)); - svfloat32_t v1622; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1622) : "w"(v796), "w"(v2319)); + svfloat32_t v1576 = svmul_f32_x(svptrue_b32(), v542, v2312); + svfloat32_t v1589 = svmul_f32_x(svptrue_b32(), v669, v2314); + svfloat32_t v1602 = svmul_f32_x(svptrue_b32(), v923, v2316); + svfloat32_t v1622 = svmul_f32_x(svptrue_b32(), v796, v2319); svfloat32_t v997 = svmls_f32_x(pred_full, v387, v990, v2322); - svfloat32_t v1125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1125) : "w"(v1098), "w"(v1124)); - svfloat32_t v1145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1145) : "w"(v1111), "w"(v1144)); + svfloat32_t v1125 = svsub_f32_x(svptrue_b32(), v1098, v1124); + svfloat32_t v1145 = svsub_f32_x(svptrue_b32(), v1111, v1144); svfloat32_t v1293 = svnmls_f32_x(pred_full, v1287, v1260, v2366); svfloat32_t v1313 = svnmls_f32_x(pred_full, v1307, v1273, v2366); - svfloat32_t v1449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1449) : "w"(v1422), "w"(v1448)); - svfloat32_t v1469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1469) : "w"(v1435), "w"(v1468)); + svfloat32_t v1449 = svsub_f32_x(svptrue_b32(), v1422, v1448); + svfloat32_t v1469 = svsub_f32_x(svptrue_b32(), v1435, v1468); svfloat32_t v1584 = svcmla_f32_x(pred_full, v1576, v2313, v542, 90); svfloat32_t v1597 = svcmla_f32_x(pred_full, v1589, v2315, v669, 90); svfloat32_t v1610 = svcmla_f32_x(pred_full, v1602, v2317, v923, 90); @@ -22868,67 +17763,46 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1009 = 
svmls_f32_x(pred_full, v997, v991, v2324); svfloat32_t v1131 = svnmls_f32_x(pred_full, v1125, v1098, v2366); svfloat32_t v1151 = svnmls_f32_x(pred_full, v1145, v1111, v2366); - svfloat32_t v1314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1314) : "w"(v1293), "w"(v1313)); - svfloat32_t v1315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1315) : "w"(v1293), "w"(v1313)); + svfloat32_t v1314 = svadd_f32_x(svptrue_b32(), v1293, v1313); + svfloat32_t v1315 = svsub_f32_x(svptrue_b32(), v1293, v1313); svfloat32_t v1327 = svmla_f32_x(pred_full, v1287, v1307, v2326); svfloat32_t v1345 = svnmls_f32_x(pred_full, v1307, v1287, v2326); svfloat32_t v1455 = svnmls_f32_x(pred_full, v1449, v1422, v2366); svfloat32_t v1475 = svnmls_f32_x(pred_full, v1469, v1435, v2366); - svfloat32_t v1611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1611) : "w"(v1584), "w"(v1610)); - svfloat32_t v1631; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1631) : "w"(v1597), "w"(v1630)); + svfloat32_t v1611 = svsub_f32_x(svptrue_b32(), v1584, v1610); + svfloat32_t v1631 = svsub_f32_x(svptrue_b32(), v1597, v1630); svfloat32_t v1015 = svnmls_f32_x(pred_full, v1009, v997, v2366); - svfloat32_t v1052; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1052) : "w"(v1009), "w"(v1051)); - svfloat32_t v1152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1152) : "w"(v1131), "w"(v1151)); - svfloat32_t v1153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1153) : "w"(v1131), "w"(v1151)); + svfloat32_t v1052 = svsub_f32_x(svptrue_b32(), v1009, v1051); + svfloat32_t v1152 = svadd_f32_x(svptrue_b32(), v1131, v1151); + svfloat32_t v1153 = svsub_f32_x(svptrue_b32(), v1131, v1151); svfloat32_t v1165 = svmla_f32_x(pred_full, v1125, v1145, v2326); svfloat32_t v1183 = svnmls_f32_x(pred_full, v1145, v1125, v2326); - svfloat32_t v1346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1346) : "w"(v403), "w"(v1314)); - svfloat32_t zero1360; - asm volatile("mov %0.s, #0" : "=w"(zero1360)); + svfloat32_t v1346 = svadd_f32_x(svptrue_b32(), v403, v1314); + svfloat32_t zero1360 = svdup_n_f32(0); 
svfloat32_t v1360 = svcmla_f32_x(pred_full, zero1360, v2346, v1327, 90); - svfloat32_t zero1375; - asm volatile("mov %0.s, #0" : "=w"(zero1375)); + svfloat32_t zero1375 = svdup_n_f32(0); svfloat32_t v1375 = svcmla_f32_x(pred_full, zero1375, v2346, v1345, 90); - svfloat32_t v1476; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1476) : "w"(v1455), "w"(v1475)); - svfloat32_t v1477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1477) : "w"(v1455), "w"(v1475)); + svfloat32_t v1476 = svadd_f32_x(svptrue_b32(), v1455, v1475); + svfloat32_t v1477 = svsub_f32_x(svptrue_b32(), v1455, v1475); svfloat32_t v1489 = svmla_f32_x(pred_full, v1449, v1469, v2326); svfloat32_t v1507 = svnmls_f32_x(pred_full, v1469, v1449, v2326); svfloat32_t v1617 = svnmls_f32_x(pred_full, v1611, v1584, v2366); svfloat32_t v1637 = svnmls_f32_x(pred_full, v1631, v1597, v2366); - svfloat32_t v1037; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1037) : "w"(v1015), "w"(v1036)); + svfloat32_t v1037 = svsub_f32_x(svptrue_b32(), v1015, v1036); svfloat32_t v1065 = svnmls_f32_x(pred_full, v1052, v1009, v2366); - svfloat32_t v1184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1184) : "w"(v395), "w"(v1152)); - svfloat32_t zero1198; - asm volatile("mov %0.s, #0" : "=w"(zero1198)); + svfloat32_t v1184 = svadd_f32_x(svptrue_b32(), v395, v1152); + svfloat32_t zero1198 = svdup_n_f32(0); svfloat32_t v1198 = svcmla_f32_x(pred_full, zero1198, v2346, v1165, 90); - svfloat32_t zero1213; - asm volatile("mov %0.s, #0" : "=w"(zero1213)); + svfloat32_t zero1213 = svdup_n_f32(0); svfloat32_t v1213 = svcmla_f32_x(pred_full, zero1213, v2346, v1183, 90); svfloat32_t v1321 = svmls_f32_x(pred_full, v403, v1314, v2322); - svfloat32_t v1508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1508) : "w"(v409), "w"(v1476)); - svfloat32_t zero1522; - asm volatile("mov %0.s, #0" : "=w"(zero1522)); + svfloat32_t v1508 = svadd_f32_x(svptrue_b32(), v409, v1476); + svfloat32_t zero1522 = svdup_n_f32(0); svfloat32_t v1522 = svcmla_f32_x(pred_full, zero1522, v2346, v1489, 90); - 
svfloat32_t zero1537; - asm volatile("mov %0.s, #0" : "=w"(zero1537)); + svfloat32_t zero1537 = svdup_n_f32(0); svfloat32_t v1537 = svcmla_f32_x(pred_full, zero1537, v2346, v1507, 90); - svfloat32_t v1638; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1638) : "w"(v1617), "w"(v1637)); - svfloat32_t v1639; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1639) : "w"(v1617), "w"(v1637)); + svfloat32_t v1638 = svadd_f32_x(svptrue_b32(), v1617, v1637); + svfloat32_t v1639 = svsub_f32_x(svptrue_b32(), v1617, v1637); svfloat32_t v1651 = svmla_f32_x(pred_full, v1611, v1631, v2326); svfloat32_t v1669 = svnmls_f32_x(pred_full, v1631, v1611, v2326); svst1_f64(pred_full, (double *)(v2097), svreinterpret_f64_f32(v1052)); @@ -22937,13 +17811,10 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1159 = svmls_f32_x(pred_full, v395, v1152, v2322); svfloat32_t v1333 = svmls_f32_x(pred_full, v1321, v1315, v2324); svfloat32_t v1483 = svmls_f32_x(pred_full, v409, v1476, v2322); - svfloat32_t v1670; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1670) : "w"(v415), "w"(v1638)); - svfloat32_t zero1684; - asm volatile("mov %0.s, #0" : "=w"(zero1684)); + svfloat32_t v1670 = svadd_f32_x(svptrue_b32(), v415, v1638); + svfloat32_t zero1684 = svdup_n_f32(0); svfloat32_t v1684 = svcmla_f32_x(pred_full, zero1684, v2346, v1651, 90); - svfloat32_t zero1699; - asm volatile("mov %0.s, #0" : "=w"(zero1699)); + svfloat32_t zero1699 = svdup_n_f32(0); svfloat32_t v1699 = svcmla_f32_x(pred_full, zero1699, v2346, v1669, 90); svst1_f64(pred_full, (double *)(v2087), svreinterpret_f64_f32(v1037)); svst1_f64(pred_full, (double *)(v2107), svreinterpret_f64_f32(v1065)); @@ -22951,41 +17822,33 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu25(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v2269), svreinterpret_f64_f32(v1508)); svfloat32_t v1171 = svmls_f32_x(pred_full, v1159, v1153, v2324); svfloat32_t v1339 = svnmls_f32_x(pred_full, v1333, v1321, v2366); - svfloat32_t v1376; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v1376) : "w"(v1333), "w"(v1375)); + svfloat32_t v1376 = svsub_f32_x(svptrue_b32(), v1333, v1375); svfloat32_t v1495 = svmls_f32_x(pred_full, v1483, v1477, v2324); svfloat32_t v1645 = svmls_f32_x(pred_full, v415, v1638, v2322); svst1_f64(pred_full, (double *)(v2117), svreinterpret_f64_f32(v1078)); svst1_f64(pred_full, (double *)(v2333), svreinterpret_f64_f32(v1670)); svfloat32_t v1177 = svnmls_f32_x(pred_full, v1171, v1159, v2366); - svfloat32_t v1214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1214) : "w"(v1171), "w"(v1213)); - svfloat32_t v1361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1361) : "w"(v1339), "w"(v1360)); + svfloat32_t v1214 = svsub_f32_x(svptrue_b32(), v1171, v1213); + svfloat32_t v1361 = svsub_f32_x(svptrue_b32(), v1339, v1360); svfloat32_t v1389 = svnmls_f32_x(pred_full, v1376, v1333, v2366); svfloat32_t v1501 = svnmls_f32_x(pred_full, v1495, v1483, v2366); - svfloat32_t v1538; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1538) : "w"(v1495), "w"(v1537)); + svfloat32_t v1538 = svsub_f32_x(svptrue_b32(), v1495, v1537); svfloat32_t v1657 = svmls_f32_x(pred_full, v1645, v1639, v2324); svst1_f64(pred_full, (double *)(v2225), svreinterpret_f64_f32(v1376)); - svfloat32_t v1199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1199) : "w"(v1177), "w"(v1198)); + svfloat32_t v1199 = svsub_f32_x(svptrue_b32(), v1177, v1198); svfloat32_t v1227 = svnmls_f32_x(pred_full, v1214, v1171, v2366); svfloat32_t v1402 = svnmls_f32_x(pred_full, v1361, v1339, v2366); - svfloat32_t v1523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1523) : "w"(v1501), "w"(v1522)); + svfloat32_t v1523 = svsub_f32_x(svptrue_b32(), v1501, v1522); svfloat32_t v1551 = svnmls_f32_x(pred_full, v1538, v1495, v2366); svfloat32_t v1663 = svnmls_f32_x(pred_full, v1657, v1645, v2366); - svfloat32_t v1700; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1700) : "w"(v1657), "w"(v1699)); + svfloat32_t v1700 = svsub_f32_x(svptrue_b32(), v1657, v1699); svst1_f64(pred_full, (double *)(v2161), 
svreinterpret_f64_f32(v1214)); svst1_f64(pred_full, (double *)(v2215), svreinterpret_f64_f32(v1361)); svst1_f64(pred_full, (double *)(v2235), svreinterpret_f64_f32(v1389)); svst1_f64(pred_full, (double *)(v2289), svreinterpret_f64_f32(v1538)); svfloat32_t v1240 = svnmls_f32_x(pred_full, v1199, v1177, v2366); svfloat32_t v1564 = svnmls_f32_x(pred_full, v1523, v1501, v2366); - svfloat32_t v1685; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1685) : "w"(v1663), "w"(v1684)); + svfloat32_t v1685 = svsub_f32_x(svptrue_b32(), v1663, v1684); svfloat32_t v1713 = svnmls_f32_x(pred_full, v1700, v1657, v2366); svst1_f64(pred_full, (double *)(v2151), svreinterpret_f64_f32(v1199)); svst1_f64(pred_full, (double *)(v2171), svreinterpret_f64_f32(v1227)); @@ -24395,70 +19258,38 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float v1272 = 9.8078528040323043e-01F; float v1279 = -5.5557023301960218e-01F; float v1284 = -8.3146961230254524e-01F; - float v1295 = 1.0000000000000000e+00F; const float32x2_t *v1474 = &v5[v0]; float32x2_t *v1710 = &v6[v2]; int64_t v19 = v0 * 16; - float32x2_t v30 = v7[15]; int64_t v34 = v0 * 8; - float32x2_t v41 = v7[7]; int64_t v45 = v0 * 24; - float32x2_t v52 = v7[23]; int64_t v56 = v0 * 4; int64_t v67 = v0 * 20; - float32x2_t v78 = v7[3]; - float32x2_t v82 = v7[19]; int64_t v86 = v0 * 12; int64_t v97 = v0 * 28; - float32x2_t v108 = v7[11]; - float32x2_t v112 = v7[27]; int64_t v116 = v0 * 2; int64_t v127 = v0 * 18; - float32x2_t v138 = v7[1]; - float32x2_t v142 = v7[17]; int64_t v146 = v0 * 10; - float32x2_t v153 = v7[9]; int64_t v157 = v0 * 26; - float32x2_t v164 = v7[25]; int64_t v168 = v0 * 6; int64_t v179 = v0 * 22; - float32x2_t v190 = v7[5]; - float32x2_t v194 = v7[21]; int64_t v198 = v0 * 14; - float32x2_t v205 = v7[13]; int64_t v209 = v0 * 30; - float32x2_t v216 = v7[29]; int64_t v231 = v0 * 17; - float32x2_t v242 = v7[0]; - float32x2_t v246 = v7[16]; int64_t v250 = v0 * 9; - float32x2_t v257 = v7[8]; int64_t v261 = v0 * 
25; - float32x2_t v268 = v7[24]; int64_t v272 = v0 * 5; int64_t v283 = v0 * 21; - float32x2_t v294 = v7[4]; - float32x2_t v298 = v7[20]; int64_t v302 = v0 * 13; int64_t v313 = v0 * 29; - float32x2_t v324 = v7[12]; - float32x2_t v328 = v7[28]; int64_t v332 = v0 * 3; int64_t v343 = v0 * 19; - float32x2_t v354 = v7[2]; - float32x2_t v358 = v7[18]; int64_t v362 = v0 * 11; - float32x2_t v369 = v7[10]; int64_t v373 = v0 * 27; - float32x2_t v380 = v7[26]; int64_t v384 = v0 * 7; int64_t v395 = v0 * 23; - float32x2_t v406 = v7[6]; - float32x2_t v410 = v7[22]; int64_t v414 = v0 * 15; int64_t v425 = v0 * 31; - float32x2_t v436 = v7[14]; - float32x2_t v440 = v7[30]; int64_t v844 = v2 * 8; int64_t v851 = v2 * 16; int64_t v858 = v2 * 24; @@ -24494,7 +19325,6 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, int64_t v1260 = v2 * 30; float v1275 = v4 * v1272; float v1287 = v4 * v1284; - float v1298 = v4 * v1295; int64_t v1306 = v2 * 7; int64_t v1313 = v2 * 15; int64_t v1320 = v2 * 23; @@ -24513,70 +19343,71 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1906 = svdup_n_f32(v1212); svfloat32_t v1945 = svdup_n_f32(v1267); svfloat32_t v1947 = svdup_n_f32(v1279); + svfloat32_t v1949 = svdup_n_f32(v4); svfloat32_t v2017 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1474)[0])); - svfloat32_t v31; - asm("mov %0.d, %d1" : "=w"(v31) : "w"(v30)); - svfloat32_t v42; - asm("mov %0.d, %d1" : "=w"(v42) : "w"(v41)); - svfloat32_t v53; - asm("mov %0.d, %d1" : "=w"(v53) : "w"(v52)); - svfloat32_t v79; - asm("mov %0.d, %d1" : "=w"(v79) : "w"(v78)); - svfloat32_t v83; - asm("mov %0.d, %d1" : "=w"(v83) : "w"(v82)); - svfloat32_t v109; - asm("mov %0.d, %d1" : "=w"(v109) : "w"(v108)); - svfloat32_t v113; - asm("mov %0.d, %d1" : "=w"(v113) : "w"(v112)); - svfloat32_t v139; - asm("mov %0.d, %d1" : "=w"(v139) : "w"(v138)); - svfloat32_t v143; - asm("mov %0.d, %d1" : "=w"(v143) : "w"(v142)); - svfloat32_t 
v154; - asm("mov %0.d, %d1" : "=w"(v154) : "w"(v153)); - svfloat32_t v165; - asm("mov %0.d, %d1" : "=w"(v165) : "w"(v164)); - svfloat32_t v191; - asm("mov %0.d, %d1" : "=w"(v191) : "w"(v190)); - svfloat32_t v195; - asm("mov %0.d, %d1" : "=w"(v195) : "w"(v194)); - svfloat32_t v206; - asm("mov %0.d, %d1" : "=w"(v206) : "w"(v205)); - svfloat32_t v217; - asm("mov %0.d, %d1" : "=w"(v217) : "w"(v216)); - svfloat32_t v243; - asm("mov %0.d, %d1" : "=w"(v243) : "w"(v242)); - svfloat32_t v247; - asm("mov %0.d, %d1" : "=w"(v247) : "w"(v246)); - svfloat32_t v258; - asm("mov %0.d, %d1" : "=w"(v258) : "w"(v257)); - svfloat32_t v269; - asm("mov %0.d, %d1" : "=w"(v269) : "w"(v268)); - svfloat32_t v295; - asm("mov %0.d, %d1" : "=w"(v295) : "w"(v294)); - svfloat32_t v299; - asm("mov %0.d, %d1" : "=w"(v299) : "w"(v298)); - svfloat32_t v325; - asm("mov %0.d, %d1" : "=w"(v325) : "w"(v324)); - svfloat32_t v329; - asm("mov %0.d, %d1" : "=w"(v329) : "w"(v328)); - svfloat32_t v355; - asm("mov %0.d, %d1" : "=w"(v355) : "w"(v354)); - svfloat32_t v359; - asm("mov %0.d, %d1" : "=w"(v359) : "w"(v358)); - svfloat32_t v370; - asm("mov %0.d, %d1" : "=w"(v370) : "w"(v369)); - svfloat32_t v381; - asm("mov %0.d, %d1" : "=w"(v381) : "w"(v380)); - svfloat32_t v407; - asm("mov %0.d, %d1" : "=w"(v407) : "w"(v406)); - svfloat32_t v411; - asm("mov %0.d, %d1" : "=w"(v411) : "w"(v410)); - svfloat32_t v437; - asm("mov %0.d, %d1" : "=w"(v437) : "w"(v436)); - svfloat32_t v441; - asm("mov %0.d, %d1" : "=w"(v441) : "w"(v440)); + svfloat32_t v31 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[15])); + svfloat32_t v42 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[7])); + svfloat32_t v53 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[23])); + svfloat32_t v79 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[3])); + svfloat32_t v83 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[19])); + svfloat32_t v109 = + svreinterpret_f32_u64(svdup_n_u64(((const 
uint64_t *)v7)[11])); + svfloat32_t v113 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[27])); + svfloat32_t v139 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[1])); + svfloat32_t v143 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[17])); + svfloat32_t v154 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[9])); + svfloat32_t v165 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[25])); + svfloat32_t v191 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[5])); + svfloat32_t v195 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[21])); + svfloat32_t v206 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[13])); + svfloat32_t v217 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[29])); + svfloat32_t v243 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[0])); + svfloat32_t v247 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[16])); + svfloat32_t v258 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[8])); + svfloat32_t v269 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[24])); + svfloat32_t v295 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[4])); + svfloat32_t v299 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[20])); + svfloat32_t v325 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[12])); + svfloat32_t v329 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[28])); + svfloat32_t v355 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[2])); + svfloat32_t v359 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[18])); + svfloat32_t v370 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[10])); + svfloat32_t v381 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[26])); + svfloat32_t v407 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[6])); + svfloat32_t v411 = + svreinterpret_f32_u64(svdup_n_u64(((const 
uint64_t *)v7)[22])); + svfloat32_t v437 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[14])); + svfloat32_t v441 = + svreinterpret_f32_u64(svdup_n_u64(((const uint64_t *)v7)[30])); const float32x2_t *v1339 = &v5[v19]; const float32x2_t *v1348 = &v5[v34]; const float32x2_t *v1357 = &v5[v45]; @@ -24642,15 +19473,13 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, float32x2_t *v1942 = &v6[v1260]; svfloat32_t v1946 = svdup_n_f32(v1275); svfloat32_t v1948 = svdup_n_f32(v1287); - svfloat32_t v1949 = svdup_n_f32(v1298); float32x2_t *v1956 = &v6[v1306]; float32x2_t *v1965 = &v6[v1313]; float32x2_t *v1974 = &v6[v1320]; float32x2_t *v1983 = &v6[v1327]; svfloat32_t v2049 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1619)[0])); - svfloat32_t zero244; - asm volatile("mov %0.s, #0" : "=w"(zero244)); + svfloat32_t zero244 = svdup_n_f32(0); svfloat32_t v244 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero244, v2017, v243, 0), v2017, v243, 90); @@ -24714,443 +19543,282 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v1600)[0])); svfloat32_t v2047 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1609)[0])); - svfloat32_t zero32; - asm volatile("mov %0.s, #0" : "=w"(zero32)); + svfloat32_t zero32 = svdup_n_f32(0); svfloat32_t v32 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero32, v1987, v31, 0), v1987, v31, 90); - svfloat32_t zero43; - asm volatile("mov %0.s, #0" : "=w"(zero43)); + svfloat32_t zero43 = svdup_n_f32(0); svfloat32_t v43 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero43, v1989, v42, 0), v1989, v42, 90); - svfloat32_t zero54; - asm volatile("mov %0.s, #0" : "=w"(zero54)); + svfloat32_t zero54 = svdup_n_f32(0); svfloat32_t v54 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero54, v1991, v53, 0), v1991, v53, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t 
zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v1993, v79, 0), v1993, v79, 90); - svfloat32_t zero84; - asm volatile("mov %0.s, #0" : "=w"(zero84)); + svfloat32_t zero84 = svdup_n_f32(0); svfloat32_t v84 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero84, v1995, v83, 0), v1995, v83, 90); - svfloat32_t zero110; - asm volatile("mov %0.s, #0" : "=w"(zero110)); + svfloat32_t zero110 = svdup_n_f32(0); svfloat32_t v110 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero110, v1997, v109, 0), v1997, v109, 90); - svfloat32_t zero114; - asm volatile("mov %0.s, #0" : "=w"(zero114)); + svfloat32_t zero114 = svdup_n_f32(0); svfloat32_t v114 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero114, v1999, v113, 0), v1999, v113, 90); - svfloat32_t zero140; - asm volatile("mov %0.s, #0" : "=w"(zero140)); + svfloat32_t zero140 = svdup_n_f32(0); svfloat32_t v140 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero140, v2001, v139, 0), v2001, v139, 90); - svfloat32_t zero144; - asm volatile("mov %0.s, #0" : "=w"(zero144)); + svfloat32_t zero144 = svdup_n_f32(0); svfloat32_t v144 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero144, v2003, v143, 0), v2003, v143, 90); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero155, v2005, v154, 0), v2005, v154, 90); - svfloat32_t zero166; - asm volatile("mov %0.s, #0" : "=w"(zero166)); + svfloat32_t zero166 = svdup_n_f32(0); svfloat32_t v166 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero166, v2007, v165, 0), v2007, v165, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero192, v2009, v191, 0), v2009, v191, 90); - svfloat32_t zero196; - asm volatile("mov %0.s, #0" : "=w"(zero196)); + svfloat32_t 
zero196 = svdup_n_f32(0); svfloat32_t v196 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero196, v2011, v195, 0), v2011, v195, 90); - svfloat32_t zero207; - asm volatile("mov %0.s, #0" : "=w"(zero207)); + svfloat32_t zero207 = svdup_n_f32(0); svfloat32_t v207 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero207, v2013, v206, 0), v2013, v206, 90); - svfloat32_t zero218; - asm volatile("mov %0.s, #0" : "=w"(zero218)); + svfloat32_t zero218 = svdup_n_f32(0); svfloat32_t v218 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero218, v2015, v217, 0), v2015, v217, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero248, v2019, v247, 0), v2019, v247, 90); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero259, v2021, v258, 0), v2021, v258, 90); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero270, v2023, v269, 0), v2023, v269, 90); - svfloat32_t zero296; - asm volatile("mov %0.s, #0" : "=w"(zero296)); + svfloat32_t zero296 = svdup_n_f32(0); svfloat32_t v296 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero296, v2025, v295, 0), v2025, v295, 90); - svfloat32_t zero300; - asm volatile("mov %0.s, #0" : "=w"(zero300)); + svfloat32_t zero300 = svdup_n_f32(0); svfloat32_t v300 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero300, v2027, v299, 0), v2027, v299, 90); - svfloat32_t zero326; - asm volatile("mov %0.s, #0" : "=w"(zero326)); + svfloat32_t zero326 = svdup_n_f32(0); svfloat32_t v326 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero326, v2029, v325, 0), v2029, v325, 90); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + 
svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero330, v2031, v329, 0), v2031, v329, 90); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero356, v2033, v355, 0), v2033, v355, 90); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); + svfloat32_t zero360 = svdup_n_f32(0); svfloat32_t v360 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero360, v2035, v359, 0), v2035, v359, 90); - svfloat32_t zero371; - asm volatile("mov %0.s, #0" : "=w"(zero371)); + svfloat32_t zero371 = svdup_n_f32(0); svfloat32_t v371 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero371, v2037, v370, 0), v2037, v370, 90); - svfloat32_t zero382; - asm volatile("mov %0.s, #0" : "=w"(zero382)); + svfloat32_t zero382 = svdup_n_f32(0); svfloat32_t v382 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero382, v2039, v381, 0), v2039, v381, 90); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); + svfloat32_t zero408 = svdup_n_f32(0); svfloat32_t v408 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero408, v2041, v407, 0), v2041, v407, 90); - svfloat32_t zero412; - asm volatile("mov %0.s, #0" : "=w"(zero412)); + svfloat32_t zero412 = svdup_n_f32(0); svfloat32_t v412 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero412, v2043, v411, 0), v2043, v411, 90); - svfloat32_t zero438; - asm volatile("mov %0.s, #0" : "=w"(zero438)); + svfloat32_t zero438 = svdup_n_f32(0); svfloat32_t v438 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero438, v2045, v437, 0), v2045, v437, 90); - svfloat32_t zero442; - asm volatile("mov %0.s, #0" : "=w"(zero442)); + svfloat32_t zero442 = svdup_n_f32(0); svfloat32_t v442 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero442, v2047, v441, 0), v2047, v441, 90); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : 
"w"(v2049), "w"(v32)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v2049), "w"(v32)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v43), "w"(v54)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v43), "w"(v54)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v80), "w"(v84)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v80), "w"(v84)); - svfloat32_t v467; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v110), "w"(v114)); - svfloat32_t v468; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v110), "w"(v114)); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v140), "w"(v144)); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v140), "w"(v144)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v155), "w"(v166)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v155), "w"(v166)); - svfloat32_t v536; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v192), "w"(v196)); - svfloat32_t v537; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v192), "w"(v196)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v207), "w"(v218)); - svfloat32_t v539; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v207), "w"(v218)); - svfloat32_t v681; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v244), "w"(v248)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v244), "w"(v248)); - svfloat32_t v683; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v259), "w"(v270)); - svfloat32_t v684; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v684) : "w"(v259), "w"(v270)); - svfloat32_t v696; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v296), "w"(v300)); - svfloat32_t v697; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v697) : "w"(v296), "w"(v300)); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v326), "w"(v330)); - svfloat32_t v699; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v699) : "w"(v326), "w"(v330)); - svfloat32_t v752; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v752) : "w"(v356), "w"(v360)); - svfloat32_t v753; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v753) : "w"(v356), "w"(v360)); - svfloat32_t v754; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v754) : "w"(v371), "w"(v382)); - svfloat32_t v755; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v371), "w"(v382)); - svfloat32_t v767; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v408), "w"(v412)); - svfloat32_t v768; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v408), "w"(v412)); - svfloat32_t v769; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v769) : "w"(v438), "w"(v442)); - svfloat32_t v770; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v770) : "w"(v438), "w"(v442)); - svfloat32_t zero460; - asm volatile("mov %0.s, #0" : "=w"(zero460)); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v2049, v32); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v2049, v32); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v43, v54); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v43, v54); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v80, v84); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v80, v84); + svfloat32_t v467 = svadd_f32_x(svptrue_b32(), v110, v114); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v110, v114); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v140, v144); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v140, v144); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v155, v166); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v155, v166); + svfloat32_t v536 = svadd_f32_x(svptrue_b32(), v192, v196); + svfloat32_t v537 = svsub_f32_x(svptrue_b32(), v192, v196); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v207, v218); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v207, v218); + svfloat32_t v681 = svadd_f32_x(svptrue_b32(), v244, v248); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v244, v248); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v259, v270); + svfloat32_t v684 = 
svsub_f32_x(svptrue_b32(), v259, v270); + svfloat32_t v696 = svadd_f32_x(svptrue_b32(), v296, v300); + svfloat32_t v697 = svsub_f32_x(svptrue_b32(), v296, v300); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v326, v330); + svfloat32_t v699 = svsub_f32_x(svptrue_b32(), v326, v330); + svfloat32_t v752 = svadd_f32_x(svptrue_b32(), v356, v360); + svfloat32_t v753 = svsub_f32_x(svptrue_b32(), v356, v360); + svfloat32_t v754 = svadd_f32_x(svptrue_b32(), v371, v382); + svfloat32_t v755 = svsub_f32_x(svptrue_b32(), v371, v382); + svfloat32_t v767 = svadd_f32_x(svptrue_b32(), v408, v412); + svfloat32_t v768 = svsub_f32_x(svptrue_b32(), v408, v412); + svfloat32_t v769 = svadd_f32_x(svptrue_b32(), v438, v442); + svfloat32_t v770 = svsub_f32_x(svptrue_b32(), v438, v442); + svfloat32_t zero460 = svdup_n_f32(0); svfloat32_t v460 = svcmla_f32_x(pred_full, zero460, v1825, v453, 90); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v450), "w"(v452)); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v450), "w"(v452)); - svfloat32_t v469; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v465), "w"(v467)); - svfloat32_t v470; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v465), "w"(v467)); - svfloat32_t v486; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v466), "w"(v1822)); - svfloat32_t v498; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v468), "w"(v1824)); - svfloat32_t zero531; - asm volatile("mov %0.s, #0" : "=w"(zero531)); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v450, v452); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v450, v452); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v465, v467); + svfloat32_t v470 = svsub_f32_x(svptrue_b32(), v465, v467); + svfloat32_t v486 = svmul_f32_x(svptrue_b32(), v466, v1822); + svfloat32_t v498 = svmul_f32_x(svptrue_b32(), v468, v1824); + svfloat32_t zero531 = svdup_n_f32(0); svfloat32_t v531 = svcmla_f32_x(pred_full, zero531, v1825, v524, 90); - svfloat32_t v532; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v532) : "w"(v521), "w"(v523)); - svfloat32_t v533; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v533) : "w"(v521), "w"(v523)); - svfloat32_t zero546; - asm volatile("mov %0.s, #0" : "=w"(zero546)); + svfloat32_t v532 = svadd_f32_x(svptrue_b32(), v521, v523); + svfloat32_t v533 = svsub_f32_x(svptrue_b32(), v521, v523); + svfloat32_t zero546 = svdup_n_f32(0); svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v1825, v539, 90); - svfloat32_t v547; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v536), "w"(v538)); - svfloat32_t v548; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v536), "w"(v538)); - svfloat32_t zero691; - asm volatile("mov %0.s, #0" : "=w"(zero691)); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v536, v538); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v536, v538); + svfloat32_t zero691 = svdup_n_f32(0); svfloat32_t v691 = svcmla_f32_x(pred_full, zero691, v1825, v684, 90); - svfloat32_t v692; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v681), "w"(v683)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v681), "w"(v683)); - svfloat32_t v700; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v696), "w"(v698)); - svfloat32_t v701; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v696), "w"(v698)); - svfloat32_t v717; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v697), "w"(v1822)); - svfloat32_t v729; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v699), "w"(v1824)); - svfloat32_t zero762; - asm volatile("mov %0.s, #0" : "=w"(zero762)); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v696, v698); + svfloat32_t v701 = svsub_f32_x(svptrue_b32(), v696, v698); + svfloat32_t v717 = svmul_f32_x(svptrue_b32(), v697, v1822); + svfloat32_t v729 = svmul_f32_x(svptrue_b32(), v699, v1824); + svfloat32_t zero762 = svdup_n_f32(0); svfloat32_t v762 = svcmla_f32_x(pred_full, zero762, v1825, v755, 90); 
- svfloat32_t v763; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v752), "w"(v754)); - svfloat32_t v764; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v764) : "w"(v752), "w"(v754)); - svfloat32_t v771; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v767), "w"(v769)); - svfloat32_t v772; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v772) : "w"(v767), "w"(v769)); - svfloat32_t v788; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v788) : "w"(v768), "w"(v1822)); - svfloat32_t v800; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v800) : "w"(v770), "w"(v1824)); - svfloat32_t v463; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v451), "w"(v460)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v451), "w"(v460)); - svfloat32_t zero477; - asm volatile("mov %0.s, #0" : "=w"(zero477)); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v752, v754); + svfloat32_t v764 = svsub_f32_x(svptrue_b32(), v752, v754); + svfloat32_t v771 = svadd_f32_x(svptrue_b32(), v767, v769); + svfloat32_t v772 = svsub_f32_x(svptrue_b32(), v767, v769); + svfloat32_t v788 = svmul_f32_x(svptrue_b32(), v768, v1822); + svfloat32_t v800 = svmul_f32_x(svptrue_b32(), v770, v1824); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v451, v460); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v451, v460); + svfloat32_t zero477 = svdup_n_f32(0); svfloat32_t v477 = svcmla_f32_x(pred_full, zero477, v1825, v470, 90); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v461), "w"(v469)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v461), "w"(v469)); - svfloat32_t v534; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v534) : "w"(v522), "w"(v531)); - svfloat32_t v535; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v522), "w"(v531)); - svfloat32_t v549; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v537), "w"(v546)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v537), "w"(v546)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v532), "w"(v547)); - 
svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v532), "w"(v547)); - svfloat32_t v607; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v533), "w"(v1822)); - svfloat32_t v619; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v548), "w"(v1824)); - svfloat32_t v694; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v682), "w"(v691)); - svfloat32_t v695; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v682), "w"(v691)); - svfloat32_t zero708; - asm volatile("mov %0.s, #0" : "=w"(zero708)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v461, v469); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v461, v469); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v522, v531); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v522, v531); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v537, v546); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v537, v546); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v532, v547); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v532, v547); + svfloat32_t v607 = svmul_f32_x(svptrue_b32(), v533, v1822); + svfloat32_t v619 = svmul_f32_x(svptrue_b32(), v548, v1824); + svfloat32_t v694 = svsub_f32_x(svptrue_b32(), v682, v691); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v682, v691); + svfloat32_t zero708 = svdup_n_f32(0); svfloat32_t v708 = svcmla_f32_x(pred_full, zero708, v1825, v701, 90); - svfloat32_t v709; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v692), "w"(v700)); - svfloat32_t v710; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v710) : "w"(v692), "w"(v700)); - svfloat32_t v765; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v753), "w"(v762)); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v753), "w"(v762)); - svfloat32_t zero779; - asm volatile("mov %0.s, #0" : "=w"(zero779)); + svfloat32_t v709 = svadd_f32_x(svptrue_b32(), v692, v700); + svfloat32_t v710 = svsub_f32_x(svptrue_b32(), v692, v700); + svfloat32_t v765 = svsub_f32_x(svptrue_b32(), v753, v762); + svfloat32_t v766 = 
svadd_f32_x(svptrue_b32(), v753, v762); + svfloat32_t zero779 = svdup_n_f32(0); svfloat32_t v779 = svcmla_f32_x(pred_full, zero779, v1825, v772, 90); - svfloat32_t v780; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v780) : "w"(v763), "w"(v771)); - svfloat32_t v781; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v763), "w"(v771)); - svfloat32_t v480; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v462), "w"(v477)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v462), "w"(v477)); + svfloat32_t v780 = svadd_f32_x(svptrue_b32(), v763, v771); + svfloat32_t v781 = svsub_f32_x(svptrue_b32(), v763, v771); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v462, v477); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v462, v477); svfloat32_t v506 = svcmla_f32_x(pred_full, v486, v1949, v486, 90); svfloat32_t v507 = svcmla_f32_x(pred_full, v498, v1825, v498, 90); - svfloat32_t zero559; - asm volatile("mov %0.s, #0" : "=w"(zero559)); + svfloat32_t zero559 = svdup_n_f32(0); svfloat32_t v559 = svcmla_f32_x(pred_full, zero559, v1825, v552, 90); - svfloat32_t v560; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v478), "w"(v551)); - svfloat32_t v561; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v478), "w"(v551)); - svfloat32_t v568; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v534), "w"(v1740)); - svfloat32_t v580; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v549), "w"(v1904)); - svfloat32_t v646; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v646) : "w"(v535), "w"(v1904)); - svfloat32_t v658; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v550), "w"(v1906)); - svfloat32_t v711; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v693), "w"(v708)); - svfloat32_t v712; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v712) : "w"(v693), "w"(v708)); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v478, v551); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v478, v551); + svfloat32_t v568 = svmul_f32_x(svptrue_b32(), v534, v1740); + svfloat32_t v580 = 
svmul_f32_x(svptrue_b32(), v549, v1904); + svfloat32_t v646 = svmul_f32_x(svptrue_b32(), v535, v1904); + svfloat32_t v658 = svmul_f32_x(svptrue_b32(), v550, v1906); + svfloat32_t v711 = svsub_f32_x(svptrue_b32(), v693, v708); + svfloat32_t v712 = svadd_f32_x(svptrue_b32(), v693, v708); svfloat32_t v737 = svcmla_f32_x(pred_full, v717, v1949, v717, 90); svfloat32_t v738 = svcmla_f32_x(pred_full, v729, v1825, v729, 90); - svfloat32_t v782; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v782) : "w"(v764), "w"(v779)); - svfloat32_t v783; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v783) : "w"(v764), "w"(v779)); + svfloat32_t v782 = svsub_f32_x(svptrue_b32(), v764, v779); + svfloat32_t v783 = svadd_f32_x(svptrue_b32(), v764, v779); svfloat32_t v808 = svcmla_f32_x(pred_full, v788, v1949, v788, 90); svfloat32_t v809 = svcmla_f32_x(pred_full, v800, v1825, v800, 90); - svfloat32_t v823; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v823) : "w"(v709), "w"(v780)); - svfloat32_t v824; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v709), "w"(v780)); - svfloat32_t v1069; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1069) : "w"(v710), "w"(v1822)); - svfloat32_t v1081; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1081) : "w"(v781), "w"(v1824)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v506), "w"(v507)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v507), "w"(v506)); - svfloat32_t v562; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v479), "w"(v559)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v479), "w"(v559)); + svfloat32_t v823 = svadd_f32_x(svptrue_b32(), v709, v780); + svfloat32_t v824 = svsub_f32_x(svptrue_b32(), v709, v780); + svfloat32_t v1069 = svmul_f32_x(svptrue_b32(), v710, v1822); + svfloat32_t v1081 = svmul_f32_x(svptrue_b32(), v781, v1824); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v506, v507); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v507, v506); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v479, v559); + 
svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v479, v559); svfloat32_t v588 = svcmla_f32_x(pred_full, v568, v1741, v534, 90); svfloat32_t v589 = svcmla_f32_x(pred_full, v580, v1905, v549, 90); svfloat32_t v627 = svcmla_f32_x(pred_full, v607, v1949, v607, 90); svfloat32_t v628 = svcmla_f32_x(pred_full, v619, v1825, v619, 90); svfloat32_t v666 = svcmla_f32_x(pred_full, v646, v1905, v535, 90); svfloat32_t v667 = svcmla_f32_x(pred_full, v658, v1907, v550, 90); - svfloat32_t v739; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v737), "w"(v738)); - svfloat32_t v740; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v740) : "w"(v738), "w"(v737)); - svfloat32_t v810; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v810) : "w"(v808), "w"(v809)); - svfloat32_t v811; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v811) : "w"(v809), "w"(v808)); - svfloat32_t zero831; - asm volatile("mov %0.s, #0" : "=w"(zero831)); + svfloat32_t v739 = svadd_f32_x(svptrue_b32(), v737, v738); + svfloat32_t v740 = svsub_f32_x(svptrue_b32(), v738, v737); + svfloat32_t v810 = svadd_f32_x(svptrue_b32(), v808, v809); + svfloat32_t v811 = svsub_f32_x(svptrue_b32(), v809, v808); + svfloat32_t zero831 = svdup_n_f32(0); svfloat32_t v831 = svcmla_f32_x(pred_full, zero831, v1825, v824, 90); - svfloat32_t v832; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v832) : "w"(v560), "w"(v823)); - svfloat32_t v833; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v833) : "w"(v560), "w"(v823)); - svfloat32_t v935; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v935) : "w"(v711), "w"(v1740)); - svfloat32_t v947; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v947) : "w"(v782), "w"(v1904)); - svfloat32_t v1203; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1203) : "w"(v712), "w"(v1904)); - svfloat32_t v1215; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1215) : "w"(v783), "w"(v1906)); - svfloat32_t zero516; - asm volatile("mov %0.s, #0" : "=w"(zero516)); + svfloat32_t v832 = svadd_f32_x(svptrue_b32(), v560, v823); + svfloat32_t v833 = svsub_f32_x(svptrue_b32(), v560, v823); + svfloat32_t v935 = 
svmul_f32_x(svptrue_b32(), v711, v1740); + svfloat32_t v947 = svmul_f32_x(svptrue_b32(), v782, v1904); + svfloat32_t v1203 = svmul_f32_x(svptrue_b32(), v712, v1904); + svfloat32_t v1215 = svmul_f32_x(svptrue_b32(), v783, v1906); + svfloat32_t zero516 = svdup_n_f32(0); svfloat32_t v516 = svcmla_f32_x(pred_full, zero516, v1949, v509, 90); - svfloat32_t v517; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v463), "w"(v508)); - svfloat32_t v518; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v463), "w"(v508)); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v588), "w"(v589)); - svfloat32_t v591; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v589), "w"(v588)); - svfloat32_t v629; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v629) : "w"(v627), "w"(v628)); - svfloat32_t v630; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v630) : "w"(v628), "w"(v627)); - svfloat32_t v668; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v666), "w"(v667)); - svfloat32_t v669; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v667), "w"(v666)); - svfloat32_t zero747; - asm volatile("mov %0.s, #0" : "=w"(zero747)); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v463, v508); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v463, v508); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v588, v589); + svfloat32_t v591 = svsub_f32_x(svptrue_b32(), v589, v588); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v627, v628); + svfloat32_t v630 = svsub_f32_x(svptrue_b32(), v628, v627); + svfloat32_t v668 = svadd_f32_x(svptrue_b32(), v666, v667); + svfloat32_t v669 = svsub_f32_x(svptrue_b32(), v667, v666); + svfloat32_t zero747 = svdup_n_f32(0); svfloat32_t v747 = svcmla_f32_x(pred_full, zero747, v1949, v740, 90); - svfloat32_t v748; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v748) : "w"(v694), "w"(v739)); - svfloat32_t v749; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v749) : "w"(v694), "w"(v739)); - svfloat32_t zero818; - asm volatile("mov %0.s, #0" : "=w"(zero818)); + svfloat32_t v748 = 
svadd_f32_x(svptrue_b32(), v694, v739); + svfloat32_t v749 = svsub_f32_x(svptrue_b32(), v694, v739); + svfloat32_t zero818 = svdup_n_f32(0); svfloat32_t v818 = svcmla_f32_x(pred_full, zero818, v1949, v811, 90); - svfloat32_t v819; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v819) : "w"(v765), "w"(v810)); - svfloat32_t v820; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v820) : "w"(v765), "w"(v810)); - svfloat32_t v834; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v834) : "w"(v561), "w"(v831)); - svfloat32_t v835; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v561), "w"(v831)); + svfloat32_t v819 = svadd_f32_x(svptrue_b32(), v765, v810); + svfloat32_t v820 = svsub_f32_x(svptrue_b32(), v765, v810); + svfloat32_t v834 = svsub_f32_x(svptrue_b32(), v561, v831); + svfloat32_t v835 = svadd_f32_x(svptrue_b32(), v561, v831); svfloat32_t v955 = svcmla_f32_x(pred_full, v935, v1741, v711, 90); svfloat32_t v956 = svcmla_f32_x(pred_full, v947, v1905, v782, 90); svfloat32_t v1089 = svcmla_f32_x(pred_full, v1069, v1949, v1069, 90); @@ -25159,156 +19827,92 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1224 = svcmla_f32_x(pred_full, v1215, v1907, v783, 90); svst1_f64(pred_full, (double *)(v1669), svreinterpret_f64_f32(v832)); svst1_f64(pred_full, (double *)(v1687), svreinterpret_f64_f32(v833)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v464), "w"(v516)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v464), "w"(v516)); - svfloat32_t zero598; - asm volatile("mov %0.s, #0" : "=w"(zero598)); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v464, v516); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v464, v516); + svfloat32_t zero598 = svdup_n_f32(0); svfloat32_t v598 = svcmla_f32_x(pred_full, zero598, v1949, v591, 90); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v517), "w"(v590)); - svfloat32_t v600; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v517), "w"(v590)); - svfloat32_t 
zero637; - asm volatile("mov %0.s, #0" : "=w"(zero637)); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v517, v590); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v517, v590); + svfloat32_t zero637 = svdup_n_f32(0); svfloat32_t v637 = svcmla_f32_x(pred_full, zero637, v1949, v630, 90); - svfloat32_t v638; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v638) : "w"(v480), "w"(v629)); - svfloat32_t v639; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v480), "w"(v629)); - svfloat32_t zero676; - asm volatile("mov %0.s, #0" : "=w"(zero676)); + svfloat32_t v638 = svadd_f32_x(svptrue_b32(), v480, v629); + svfloat32_t v639 = svsub_f32_x(svptrue_b32(), v480, v629); + svfloat32_t zero676 = svdup_n_f32(0); svfloat32_t v676 = svcmla_f32_x(pred_full, zero676, v1949, v669, 90); - svfloat32_t v750; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v750) : "w"(v695), "w"(v747)); - svfloat32_t v751; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v751) : "w"(v695), "w"(v747)); - svfloat32_t v821; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v821) : "w"(v766), "w"(v818)); - svfloat32_t v822; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v822) : "w"(v766), "w"(v818)); - svfloat32_t v868; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v868) : "w"(v748), "w"(v1699)); - svfloat32_t v880; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v880) : "w"(v819), "w"(v1781)); - svfloat32_t v957; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v957) : "w"(v955), "w"(v956)); - svfloat32_t v958; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v958) : "w"(v956), "w"(v955)); - svfloat32_t v1091; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1091) : "w"(v1089), "w"(v1090)); - svfloat32_t v1092; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1092) : "w"(v1090), "w"(v1089)); - svfloat32_t v1136; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1136) : "w"(v749), "w"(v1863)); - svfloat32_t v1148; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1148) : "w"(v820), "w"(v1865)); - svfloat32_t v1225; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1225) : "w"(v1223), "w"(v1224)); - svfloat32_t v1226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1226) : 
"w"(v1224), "w"(v1223)); + svfloat32_t v750 = svsub_f32_x(svptrue_b32(), v695, v747); + svfloat32_t v751 = svadd_f32_x(svptrue_b32(), v695, v747); + svfloat32_t v821 = svsub_f32_x(svptrue_b32(), v766, v818); + svfloat32_t v822 = svadd_f32_x(svptrue_b32(), v766, v818); + svfloat32_t v868 = svmul_f32_x(svptrue_b32(), v748, v1699); + svfloat32_t v880 = svmul_f32_x(svptrue_b32(), v819, v1781); + svfloat32_t v957 = svadd_f32_x(svptrue_b32(), v955, v956); + svfloat32_t v958 = svsub_f32_x(svptrue_b32(), v956, v955); + svfloat32_t v1091 = svadd_f32_x(svptrue_b32(), v1089, v1090); + svfloat32_t v1092 = svsub_f32_x(svptrue_b32(), v1090, v1089); + svfloat32_t v1136 = svmul_f32_x(svptrue_b32(), v749, v1863); + svfloat32_t v1148 = svmul_f32_x(svptrue_b32(), v820, v1865); + svfloat32_t v1225 = svadd_f32_x(svptrue_b32(), v1223, v1224); + svfloat32_t v1226 = svsub_f32_x(svptrue_b32(), v1224, v1223); svst1_f64(pred_full, (double *)(v1678), svreinterpret_f64_f32(v834)); svst1_f64(pred_full, (double *)(v1696), svreinterpret_f64_f32(v835)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v518), "w"(v598)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v518), "w"(v598)); - svfloat32_t v640; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v640) : "w"(v481), "w"(v637)); - svfloat32_t v641; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v641) : "w"(v481), "w"(v637)); - svfloat32_t v677; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v519), "w"(v668)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v519), "w"(v668)); - svfloat32_t v679; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v520), "w"(v676)); - svfloat32_t v680; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v520), "w"(v676)); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v518, v598); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v518, v598); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v481, v637); + svfloat32_t v641 = svadd_f32_x(svptrue_b32(), v481, v637); + svfloat32_t 
v677 = svadd_f32_x(svptrue_b32(), v519, v668); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v519, v668); + svfloat32_t v679 = svsub_f32_x(svptrue_b32(), v520, v676); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v520, v676); svfloat32_t v888 = svcmla_f32_x(pred_full, v868, v1866, v748, 90); svfloat32_t v889 = svcmla_f32_x(pred_full, v880, v1782, v819, 90); - svfloat32_t zero965; - asm volatile("mov %0.s, #0" : "=w"(zero965)); + svfloat32_t zero965 = svdup_n_f32(0); svfloat32_t v965 = svcmla_f32_x(pred_full, zero965, v1949, v958, 90); - svfloat32_t v966; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v966) : "w"(v638), "w"(v957)); - svfloat32_t v967; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v638), "w"(v957)); - svfloat32_t v1002; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1002) : "w"(v750), "w"(v1781)); - svfloat32_t v1014; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1014) : "w"(v821), "w"(v1783)); - svfloat32_t zero1099; - asm volatile("mov %0.s, #0" : "=w"(zero1099)); + svfloat32_t v966 = svadd_f32_x(svptrue_b32(), v638, v957); + svfloat32_t v967 = svsub_f32_x(svptrue_b32(), v638, v957); + svfloat32_t v1002 = svmul_f32_x(svptrue_b32(), v750, v1781); + svfloat32_t v1014 = svmul_f32_x(svptrue_b32(), v821, v1783); + svfloat32_t zero1099 = svdup_n_f32(0); svfloat32_t v1099 = svcmla_f32_x(pred_full, zero1099, v1949, v1092, 90); - svfloat32_t v1100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1100) : "w"(v562), "w"(v1091)); - svfloat32_t v1101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1101) : "w"(v562), "w"(v1091)); + svfloat32_t v1100 = svadd_f32_x(svptrue_b32(), v562, v1091); + svfloat32_t v1101 = svsub_f32_x(svptrue_b32(), v562, v1091); svfloat32_t v1156 = svcmla_f32_x(pred_full, v1136, v1864, v749, 90); svfloat32_t v1157 = svcmla_f32_x(pred_full, v1148, v1866, v820, 90); - svfloat32_t zero1233; - asm volatile("mov %0.s, #0" : "=w"(zero1233)); + svfloat32_t zero1233 = svdup_n_f32(0); svfloat32_t v1233 = svcmla_f32_x(pred_full, zero1233, v1949, v1226, 90); - svfloat32_t v1270; - 
asm("fmul %0.s, %1.s, %2.s" : "=w"(v1270) : "w"(v751), "w"(v1945)); - svfloat32_t v1282; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1282) : "w"(v822), "w"(v1947)); - svfloat32_t v890; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v890) : "w"(v888), "w"(v889)); - svfloat32_t v891; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v891) : "w"(v889), "w"(v888)); - svfloat32_t v968; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v968) : "w"(v639), "w"(v965)); - svfloat32_t v969; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v969) : "w"(v639), "w"(v965)); + svfloat32_t v1270 = svmul_f32_x(svptrue_b32(), v751, v1945); + svfloat32_t v1282 = svmul_f32_x(svptrue_b32(), v822, v1947); + svfloat32_t v890 = svadd_f32_x(svptrue_b32(), v888, v889); + svfloat32_t v891 = svsub_f32_x(svptrue_b32(), v889, v888); + svfloat32_t v968 = svsub_f32_x(svptrue_b32(), v639, v965); + svfloat32_t v969 = svadd_f32_x(svptrue_b32(), v639, v965); svfloat32_t v1022 = svcmla_f32_x(pred_full, v1002, v1782, v750, 90); svfloat32_t v1023 = svcmla_f32_x(pred_full, v1014, v1946, v821, 90); - svfloat32_t v1102; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1102) : "w"(v563), "w"(v1099)); - svfloat32_t v1103; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1103) : "w"(v563), "w"(v1099)); - svfloat32_t v1158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1158) : "w"(v1156), "w"(v1157)); - svfloat32_t v1159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1159) : "w"(v1157), "w"(v1156)); - svfloat32_t v1234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1234) : "w"(v640), "w"(v1225)); - svfloat32_t v1235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1235) : "w"(v640), "w"(v1225)); - svfloat32_t v1236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1236) : "w"(v641), "w"(v1233)); - svfloat32_t v1237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1237) : "w"(v641), "w"(v1233)); + svfloat32_t v1102 = svsub_f32_x(svptrue_b32(), v563, v1099); + svfloat32_t v1103 = svadd_f32_x(svptrue_b32(), v563, v1099); + svfloat32_t v1158 = svadd_f32_x(svptrue_b32(), v1156, v1157); + svfloat32_t v1159 = svsub_f32_x(svptrue_b32(), v1157, 
v1156); + svfloat32_t v1234 = svadd_f32_x(svptrue_b32(), v640, v1225); + svfloat32_t v1235 = svsub_f32_x(svptrue_b32(), v640, v1225); + svfloat32_t v1236 = svsub_f32_x(svptrue_b32(), v641, v1233); + svfloat32_t v1237 = svadd_f32_x(svptrue_b32(), v641, v1233); svfloat32_t v1290 = svcmla_f32_x(pred_full, v1270, v1946, v751, 90); svfloat32_t v1291 = svcmla_f32_x(pred_full, v1282, v1948, v822, 90); svst1_f64(pred_full, (double *)(v1751), svreinterpret_f64_f32(v966)); svst1_f64(pred_full, (double *)(v1769), svreinterpret_f64_f32(v967)); svst1_f64(pred_full, (double *)(v1833), svreinterpret_f64_f32(v1100)); svst1_f64(pred_full, (double *)(v1851), svreinterpret_f64_f32(v1101)); - svfloat32_t zero898; - asm volatile("mov %0.s, #0" : "=w"(zero898)); + svfloat32_t zero898 = svdup_n_f32(0); svfloat32_t v898 = svcmla_f32_x(pred_full, zero898, v1949, v891, 90); - svfloat32_t v899; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v899) : "w"(v599), "w"(v890)); - svfloat32_t v900; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v900) : "w"(v599), "w"(v890)); - svfloat32_t v1024; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1024) : "w"(v1022), "w"(v1023)); - svfloat32_t v1025; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1025) : "w"(v1023), "w"(v1022)); - svfloat32_t zero1166; - asm volatile("mov %0.s, #0" : "=w"(zero1166)); + svfloat32_t v899 = svadd_f32_x(svptrue_b32(), v599, v890); + svfloat32_t v900 = svsub_f32_x(svptrue_b32(), v599, v890); + svfloat32_t v1024 = svadd_f32_x(svptrue_b32(), v1022, v1023); + svfloat32_t v1025 = svsub_f32_x(svptrue_b32(), v1023, v1022); + svfloat32_t zero1166 = svdup_n_f32(0); svfloat32_t v1166 = svcmla_f32_x(pred_full, zero1166, v1949, v1159, 90); - svfloat32_t v1167; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1167) : "w"(v601), "w"(v1158)); - svfloat32_t v1168; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1168) : "w"(v601), "w"(v1158)); - svfloat32_t v1292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1292) : "w"(v1290), "w"(v1291)); - svfloat32_t v1293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1293) : 
"w"(v1291), "w"(v1290)); + svfloat32_t v1167 = svadd_f32_x(svptrue_b32(), v601, v1158); + svfloat32_t v1168 = svsub_f32_x(svptrue_b32(), v601, v1158); + svfloat32_t v1292 = svadd_f32_x(svptrue_b32(), v1290, v1291); + svfloat32_t v1293 = svsub_f32_x(svptrue_b32(), v1291, v1290); svst1_f64(pred_full, (double *)(v1760), svreinterpret_f64_f32(v968)); svst1_f64(pred_full, (double *)(v1778), svreinterpret_f64_f32(v969)); svst1_f64(pred_full, (double *)(v1842), svreinterpret_f64_f32(v1102)); @@ -25317,40 +19921,26 @@ void armral_fft_cf32_cf32_cf32_ac_t_uu32(const armral_cmplx_f32_t *restrict x, svst1_f64(pred_full, (double *)(v1924), svreinterpret_f64_f32(v1236)); svst1_f64(pred_full, (double *)(v1933), svreinterpret_f64_f32(v1235)); svst1_f64(pred_full, (double *)(v1942), svreinterpret_f64_f32(v1237)); - svfloat32_t v901; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v901) : "w"(v600), "w"(v898)); - svfloat32_t v902; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v902) : "w"(v600), "w"(v898)); - svfloat32_t zero1032; - asm volatile("mov %0.s, #0" : "=w"(zero1032)); + svfloat32_t v901 = svsub_f32_x(svptrue_b32(), v600, v898); + svfloat32_t v902 = svadd_f32_x(svptrue_b32(), v600, v898); + svfloat32_t zero1032 = svdup_n_f32(0); svfloat32_t v1032 = svcmla_f32_x(pred_full, zero1032, v1949, v1025, 90); - svfloat32_t v1033; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1033) : "w"(v677), "w"(v1024)); - svfloat32_t v1034; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1034) : "w"(v677), "w"(v1024)); - svfloat32_t v1169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1169) : "w"(v602), "w"(v1166)); - svfloat32_t v1170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1170) : "w"(v602), "w"(v1166)); - svfloat32_t zero1300; - asm volatile("mov %0.s, #0" : "=w"(zero1300)); + svfloat32_t v1033 = svadd_f32_x(svptrue_b32(), v677, v1024); + svfloat32_t v1034 = svsub_f32_x(svptrue_b32(), v677, v1024); + svfloat32_t v1169 = svsub_f32_x(svptrue_b32(), v602, v1166); + svfloat32_t v1170 = svadd_f32_x(svptrue_b32(), v602, v1166); + svfloat32_t 
zero1300 = svdup_n_f32(0); svfloat32_t v1300 = svcmla_f32_x(pred_full, zero1300, v1949, v1293, 90); - svfloat32_t v1301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1301) : "w"(v679), "w"(v1292)); - svfloat32_t v1302; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1302) : "w"(v679), "w"(v1292)); + svfloat32_t v1301 = svadd_f32_x(svptrue_b32(), v679, v1292); + svfloat32_t v1302 = svsub_f32_x(svptrue_b32(), v679, v1292); svst1_f64(pred_full, (double *)(v1710), svreinterpret_f64_f32(v899)); svst1_f64(pred_full, (double *)(v1728), svreinterpret_f64_f32(v900)); svst1_f64(pred_full, (double *)(v1874), svreinterpret_f64_f32(v1167)); svst1_f64(pred_full, (double *)(v1892), svreinterpret_f64_f32(v1168)); - svfloat32_t v1035; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1035) : "w"(v678), "w"(v1032)); - svfloat32_t v1036; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1036) : "w"(v678), "w"(v1032)); - svfloat32_t v1303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1303) : "w"(v680), "w"(v1300)); - svfloat32_t v1304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1304) : "w"(v680), "w"(v1300)); + svfloat32_t v1035 = svsub_f32_x(svptrue_b32(), v678, v1032); + svfloat32_t v1036 = svadd_f32_x(svptrue_b32(), v678, v1032); + svfloat32_t v1303 = svsub_f32_x(svptrue_b32(), v680, v1300); + svfloat32_t v1304 = svadd_f32_x(svptrue_b32(), v680, v1300); svst1_f64(pred_full, (double *)(v1719), svreinterpret_f64_f32(v901)); svst1_f64(pred_full, (double *)(v1737), svreinterpret_f64_f32(v902)); svst1_f64(pred_full, (double *)(v1792), svreinterpret_f64_f32(v1033)); diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h index 88f8678fc205d4542de74743f27abf9dd1dd0cb3..8211d4c99a0f2df98aebb028d1fed636fd5ca3c9 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 
Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -17,17 +19,9 @@ typedef void(cf32_cf32_cf32_ac_t_uu_fft_t)(const armral_cmplx_f32_t *x, const armral_cmplx_f32_t *w, int howmany, float dir); -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu2; -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu3; -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu4; -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu5; -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu6; cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu7; -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu8; cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu9; -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu10; cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu11; -cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu12; cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu13; cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu14; cf32_cf32_cf32_ac_t_uu_fft_t armral_fft_cf32_cf32_cf32_ac_t_uu15; diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c index d46ed9d262a485367fa3617c355ccca55d4eb6fd..e42831592d950ef7950fadf937e1a2266ec1be48 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_cf32_cs16_ab_t_gu.h" @@ -72,14 +74,11 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu2(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v80), v93)); svfloat32_t v94 = svreinterpret_f32_f64( 
svld1_gather_s64index_f64(pred_full, (const double *)(v92), v93)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero38, v82, v37, 0), v82, v37, 90); - svfloat32_t v46; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v46) : "w"(v94), "w"(v38)); - svfloat32_t v47; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v94), "w"(v38)); + svfloat32_t v46 = svadd_f32_x(svptrue_b32(), v94, v38); + svfloat32_t v47 = svsub_f32_x(svptrue_b32(), v94, v38); svint16_t v60 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v46, (float)(1ULL << 31ULL)))), @@ -200,8 +199,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu3(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v141), v142)); svfloat32_t v146 = svdup_n_f32(v84); int32_t *v172 = &v6[v107]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v122, v51, 0), v122, v51, 90); @@ -209,19 +207,14 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu3(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); svfloat32_t v132 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v130), v142)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v132, v58, 0), v132, v58, 90); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v52), "w"(v59)); - svfloat32_t v61; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v59)); - svfloat32_t v69; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v69) : "w"(v60), "w"(v143)); - svfloat32_t zero86; - asm volatile("mov %0.s, #0" : "=w"(zero86)); + svfloat32_t v60 = 
svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v69 = svadd_f32_x(svptrue_b32(), v60, v143); + svfloat32_t zero86 = svdup_n_f32(0); svfloat32_t v86 = svcmla_f32_x(pred_full, zero86, v146, v61, 90); svfloat32_t v87 = svmla_f32_x(pred_full, v69, v60, v145); svint16_t v92 = svtbl_s16( @@ -229,10 +222,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu3(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v69, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v87), "w"(v86)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v87), "w"(v86)); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v87, v86); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v87, v86); svst1w_u64(pred_full, (unsigned *)(v154), svreinterpret_u64_s16(v92)); svint16_t v100 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -371,8 +362,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu4(const armral_cmplx_f32_t *restrict x, int32_t *v227 = &v6[v143]; svfloat32_t v37 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v36])); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v167, v72, 0), v167, v72, 90); @@ -382,35 +372,24 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu4(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v156), v187)); svfloat32_t v177 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v175), v187)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v158, v37, 0), v158, v37, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : 
"=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v177, v79, 0), v177, v79, 90); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v188), "w"(v38)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v188), "w"(v38)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v73), "w"(v80)); - svfloat32_t v91; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v73), "w"(v80)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v88), "w"(v90)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v88), "w"(v90)); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v188, v38); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v188, v38); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v88, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v88, v90); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v192, v91, 90); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v89), "w"(v115)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v89), "w"(v115)); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v89, v115); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v89, v115); svint16_t v120 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v92, (float)(1ULL << 31ULL)))), @@ -613,8 +592,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu5(const armral_cmplx_f32_t *restrict x, int32_t *v281 = &v6[v179]; int32_t *v290 = &v6[v187]; int32_t *v299 = &v6[v195]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v210, v51, 0), 
v210, v51, 90); @@ -630,46 +608,32 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu5(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v228), v248)); svfloat32_t v239 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v237), v248)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v220, v58, 0), v220, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v230, v93, 0), v230, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v239, v100, 0), v239, v100, 90); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v52), "w"(v59)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v52), "w"(v59)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v94), "w"(v101)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v94), "w"(v101)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v102), "w"(v104)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v102), "w"(v104)); - svfloat32_t v108; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v103), "w"(v105)); - svfloat32_t zero138; - asm volatile("mov %0.s, #0" : "=w"(zero138)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v102, v104); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v102, v104); + svfloat32_t 
v108 = svadd_f32_x(svptrue_b32(), v103, v105); + svfloat32_t zero138 = svdup_n_f32(0); svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v253, v103, 90); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v249)); - svfloat32_t zero145; - asm volatile("mov %0.s, #0" : "=w"(zero145)); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v106, v249); + svfloat32_t zero145 = svdup_n_f32(0); svfloat32_t v145 = svcmla_f32_x(pred_full, zero145, v254, v108, 90); svfloat32_t v153 = svmla_f32_x(pred_full, v116, v106, v251); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v138), "w"(v145)); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v138, v145); svfloat32_t v157 = svcmla_f32_x(pred_full, v145, v255, v105, 90); svint16_t v164 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -679,14 +643,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu5(const armral_cmplx_f32_t *restrict x, svfloat32_t v154 = svmla_f32_x(pred_full, v153, v107, v252); svfloat32_t v155 = svmls_f32_x(pred_full, v153, v107, v252); svst1w_u64(pred_full, (unsigned *)(v263), svreinterpret_u64_s16(v164)); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v154), "w"(v156)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v155), "w"(v157)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v155), "w"(v157)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v155, v157); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v155, v157); svint16_t v172 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), @@ -896,8 +856,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu6(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, 
&((const double *)v7)[v78])); svfloat32_t v114 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v113])); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v274, v121, 0), v274, v121, 90); @@ -909,55 +868,37 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu6(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v254), v285)); svfloat32_t v265 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v263), v285)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v238, v37, 0), v238, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v247, v72, 0), v247, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v256, v79, 0), v256, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v265, v114, 0), v265, v114, 90); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v286), "w"(v38)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v286), "w"(v38)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v73), "w"(v80)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v73), "w"(v80)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v115), "w"(v122)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v115), 
"w"(v122)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v132), "w"(v134)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v132), "w"(v134)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v133), "w"(v135)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v133), "w"(v135)); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v136), "w"(v130)); - svfloat32_t zero155; - asm volatile("mov %0.s, #0" : "=w"(zero155)); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v286, v38); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v286, v38); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v136, v130); + svfloat32_t zero155 = svdup_n_f32(0); svfloat32_t v155 = svcmla_f32_x(pred_full, zero155, v292, v137, 90); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v159), "w"(v131)); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v159, v131); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v292, v160, 90); svfloat32_t v156 = svmla_f32_x(pred_full, v138, v136, v291); svfloat32_t v179 = svmla_f32_x(pred_full, v161, v159, v291); @@ -971,14 +912,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu6(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v161, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v156), "w"(v155)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v156), "w"(v155)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v179), "w"(v178)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v179), "w"(v178)); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v156, v155); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v179, v178); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v179, v178); svst1w_u64(pred_full, (unsigned *)(v300), svreinterpret_u64_s16(v184)); svst1w_u64(pred_full, (unsigned *)(v309), svreinterpret_u64_s16(v192)); svint16_t v200 = svtbl_s16( @@ -1257,8 +1194,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu7(const armral_cmplx_f32_t *restrict x, int32_t *v414 = &v6[v273]; int32_t *v423 = &v6[v281]; int32_t *v432 = &v6[v289]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v304, v51, 0), v304, v51, 90); @@ -1282,84 +1218,55 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu7(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v340), v360)); svfloat32_t v351 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v349), v360)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v314, v58, 0), v314, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v324, v93, 0), v324, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = 
svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v333, v100, 0), v333, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v342, v135, 0), v342, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v351, v142, 0), v351, v142, 90); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v52), "w"(v59)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v52), "w"(v59)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v94), "w"(v101)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v94), "w"(v101)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v136), "w"(v143)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v136), "w"(v143)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v144), "w"(v146)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v146), "w"(v148)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v148), "w"(v144)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v145), "w"(v147)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v145), "w"(v147)); - svfloat32_t v166; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v147), "w"(v149)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v149), "w"(v145)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v150), "w"(v148)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v163), "w"(v149)); - svfloat32_t zero206; - asm 
volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v148, v144); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v149, v145); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v150, v148); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v163, v149); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v368, v165, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v369, v166, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v370, v167, 90); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v151), "w"(v361)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v151, v361); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v367, v164, 90); svfloat32_t v221 = svmla_f32_x(pred_full, v159, v151, v363); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v199), "w"(v206)); - svfloat32_t v230; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v230) : "w"(v199), "w"(v206)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v199), "w"(v213)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v199, v213); svint16_t v242 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), @@ -1368,28 +1275,19 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu7(const armral_cmplx_f32_t *restrict x, svfloat32_t v222 = svmla_f32_x(pred_full, v221, v160, v364); svfloat32_t v224 = svmls_f32_x(pred_full, v221, v160, v364); svfloat32_t v226 = svmls_f32_x(pred_full, v221, v161, v365); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v228), "w"(v213)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v230), "w"(v220)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v232), "w"(v220)); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v228, v213); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v230, v220); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v232, v220); svst1w_u64(pred_full, (unsigned *)(v378), svreinterpret_u64_s16(v242)); svfloat32_t v223 = svmla_f32_x(pred_full, v222, v161, v365); svfloat32_t v225 = svmls_f32_x(pred_full, v224, v162, v366); svfloat32_t v227 = svmla_f32_x(pred_full, v226, v162, v366); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v223), "w"(v229)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v223), "w"(v229)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v225), "w"(v231)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v225), "w"(v231)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v227), "w"(v233)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v227), "w"(v233)); + svfloat32_t v234 = 
svadd_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v223, v229); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v225, v231); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v227, v233); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v227, v233); svint16_t v250 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v235, (float)(1ULL << 31ULL)))), @@ -1660,8 +1558,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu8(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); svfloat32_t v79 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v78])); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v343, v114, 0), v343, v114, 90); @@ -1683,87 +1580,58 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu8(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v361), v381)); svfloat32_t v372 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v370), v381)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v316, v37, 0), v316, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v325, v72, 0), v325, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v334, v79, 0), v334, v79, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); 
svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v353, v121, 0), v353, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v363, v156, 0), v363, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v372, v163, 0), v372, v163, 90); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v382), "w"(v38)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v382), "w"(v38)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v73), "w"(v80)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v73), "w"(v80)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v115), "w"(v122)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v115), "w"(v122)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v157), "w"(v164)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v157), "w"(v164)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v172), "w"(v174)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v172), "w"(v174)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v176), "w"(v178)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v176), "w"(v178)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v177), "w"(v179)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v177), "w"(v179)); - svfloat32_t zero221; - asm volatile("mov %0.s, #0" : "=w"(zero221)); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v382, v38); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v382, v38); + svfloat32_t v174 = 
svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v172, v174); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v179); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v179); + svfloat32_t zero221 = svdup_n_f32(0); svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v388, v175, 90); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v180), "w"(v182)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v180), "w"(v182)); - svfloat32_t zero209; - asm volatile("mov %0.s, #0" : "=w"(zero209)); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t zero209 = svdup_n_f32(0); svfloat32_t v209 = svcmla_f32_x(pred_full, zero209, v388, v183, 90); - svfloat32_t zero228; - asm volatile("mov %0.s, #0" : "=w"(zero228)); + svfloat32_t zero228 = svdup_n_f32(0); svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v389, v186, 90); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v181), "w"(v209)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v181), "w"(v209)); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v181, v209); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v181, v209); svfloat32_t v236 = svmla_f32_x(pred_full, v173, v187, v390); svfloat32_t v237 = svmls_f32_x(pred_full, v173, v187, v390); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v221), "w"(v228)); - svfloat32_t 
v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v221), "w"(v228)); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v221, v228); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v221, v228); svint16_t v246 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v184, (float)(1ULL << 31ULL)))), @@ -1774,14 +1642,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu8(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v185, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v236), "w"(v238)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v236), "w"(v238)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v237), "w"(v239)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v237), "w"(v239)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v237, v239); svint16_t v262 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v235, (float)(1ULL << 31ULL)))), @@ -2124,8 +1988,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu9(const armral_cmplx_f32_t *restrict x, int32_t *v531 = &v6[v352]; int32_t *v540 = &v6[v360]; int32_t *v549 = &v6[v368]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v383, v51, 0), v383, v51, 90); @@ -2157,112 +2020,72 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu9(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v437), v457)); svfloat32_t v448 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v446), v457)); - 
svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v393, v58, 0), v393, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v403, v93, 0), v403, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v412, v100, 0), v412, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero136, v421, v135, 0), v421, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v430, v142, 0), v430, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v439, v177, 0), v439, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v448, v184, 0), v448, v184, 90); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v52), "w"(v59)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v52), "w"(v59)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v94), "w"(v101)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v94), "w"(v101)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v136), "w"(v143)); - svfloat32_t v191; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v191) : "w"(v136), "w"(v143)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v178), "w"(v185)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v178), "w"(v185)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v186), "w"(v188)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v187), "w"(v189)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v186), "w"(v188)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v188), "w"(v192)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v192), "w"(v186)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v187), "w"(v189)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v189), "w"(v193)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v193), "w"(v187)); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v188, v192); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v192, v186); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v189, v193); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v193, v187); + svfloat32_t zero241 = 
svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v463, v191, 90); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v192)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v205), "w"(v193)); - svfloat32_t zero263; - asm volatile("mov %0.s, #0" : "=w"(zero263)); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v192); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v205, v193); + svfloat32_t zero263 = svdup_n_f32(0); svfloat32_t v263 = svcmla_f32_x(pred_full, zero263, v467, v210, 90); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v468, v211, 90); - svfloat32_t zero277; - asm volatile("mov %0.s, #0" : "=w"(zero277)); + svfloat32_t zero277 = svdup_n_f32(0); svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v469, v212, 90); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v195), "w"(v190)); - svfloat32_t v222; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v195), "w"(v460)); - svfloat32_t zero229; - asm volatile("mov %0.s, #0" : "=w"(zero229)); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v195, v190); + svfloat32_t v222 = svmul_f32_x(svptrue_b32(), v195, v460); + svfloat32_t zero229 = svdup_n_f32(0); svfloat32_t v229 = svcmla_f32_x(pred_full, zero229, v463, v206, 90); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v241), "w"(v263)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v241), "w"(v270)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v241), "w"(v263)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v196), "w"(v458)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v222), "w"(v222)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v270)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : 
"w"(v293), "w"(v277)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v295), "w"(v277)); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v241, v270); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v241, v263); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v196, v458); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v222, v222); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, v270); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v277); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v277); svfloat32_t v279 = svmla_f32_x(pred_full, v278, v195, v460); svfloat32_t v283 = svmla_f32_x(pred_full, v204, v190, v462); svint16_t v305 = svtbl_s16( @@ -2270,15 +2093,11 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu9(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v204, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v204), "w"(v279)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v283), "w"(v278)); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v204, v279); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v278); svst1w_u64(pred_full, (unsigned *)(v477), svreinterpret_u64_s16(v305)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v229)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v280), "w"(v229)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v229); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v280, v229); svfloat32_t v285 = svmla_f32_x(pred_full, v284, v207, v464); svfloat32_t v287 = svmls_f32_x(pred_full, v284, v208, v465); svfloat32_t v289 = svmls_f32_x(pred_full, v284, v207, v464); @@ -2295,18 +2114,12 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu9(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v281, 
(float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v286), "w"(v292)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v286), "w"(v292)); - svfloat32_t v299; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v288), "w"(v294)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v288), "w"(v294)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v290), "w"(v296)); - svfloat32_t v302; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v290), "w"(v296)); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v288, v294); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v290, v296); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v290, v296); svst1w_u64(pred_full, (unsigned *)(v504), svreinterpret_u64_s16(v329)); svst1w_u64(pred_full, (unsigned *)(v531), svreinterpret_u64_s16(v353)); svint16_t v313 = svtbl_s16( @@ -2660,8 +2473,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu10(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v120])); svfloat32_t v156 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v155])); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v472, v163, 0), v472, v163, 90); @@ -2685,117 +2497,77 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu10(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v481), v501)); svfloat32_t v492 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v490), v501)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + 
svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v418, v37, 0), v418, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v427, v72, 0), v427, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v436, v79, 0), v436, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v445, v114, 0), v445, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v454, v121, 0), v454, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v463, v156, 0), v463, v156, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v483, v198, 0), v483, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v492, v205, 0), v492, v205, 90); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v502), "w"(v38)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v502), "w"(v38)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v73), "w"(v80)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : 
"w"(v73), "w"(v80)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v115), "w"(v122)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v115), "w"(v122)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v157), "w"(v164)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v157), "w"(v164)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v199), "w"(v206)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v199), "w"(v206)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v216), "w"(v222)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v216), "w"(v222)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v220), "w"(v218)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v220), "w"(v218)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v217), "w"(v223)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v217), "w"(v223)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v221), "w"(v219)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v221), "w"(v219)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v224), "w"(v226)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v224), "w"(v226)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v225), "w"(v227)); - svfloat32_t zero253; - asm volatile("mov %0.s, #0" : "=w"(zero253)); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v502, v38); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v502, v38); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), 
v157, v164); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v217, v223); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v221, v219); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v225, v227); + svfloat32_t zero253 = svdup_n_f32(0); svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v512, v225, 90); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v277), "w"(v279)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v277), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v278), "w"(v280)); - svfloat32_t zero306; - asm volatile("mov %0.s, #0" : "=w"(zero306)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v278, v280); + svfloat32_t zero306 = svdup_n_f32(0); svfloat32_t v306 = svcmla_f32_x(pred_full, zero306, v512, v278, 90); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v228), "w"(v214)); - svfloat32_t zero260; - asm volatile("mov %0.s, #0" : "=w"(zero260)); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v228, v214); + svfloat32_t zero260 = svdup_n_f32(0); svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v513, v230, 90); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : 
"w"(v281), "w"(v215)); - svfloat32_t zero313; - asm volatile("mov %0.s, #0" : "=w"(zero313)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v281, v215); + svfloat32_t zero313 = svdup_n_f32(0); svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v513, v283, 90); svfloat32_t v268 = svmla_f32_x(pred_full, v231, v228, v510); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v253), "w"(v260)); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v253, v260); svfloat32_t v272 = svcmla_f32_x(pred_full, v260, v514, v227, 90); svfloat32_t v321 = svmla_f32_x(pred_full, v284, v281, v510); - svfloat32_t v324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v306), "w"(v313)); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v306, v313); svfloat32_t v325 = svcmla_f32_x(pred_full, v313, v514, v280, 90); svint16_t v332 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -2813,22 +2585,14 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu10(const armral_cmplx_f32_t *restrict x, svfloat32_t v323 = svmls_f32_x(pred_full, v321, v282, v511); svst1w_u64(pred_full, (unsigned *)(v522), svreinterpret_u64_s16(v332)); svst1w_u64(pred_full, (unsigned *)(v531), svreinterpret_u64_s16(v340)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v269), "w"(v271)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v269), "w"(v271)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v270), "w"(v272)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v270), "w"(v272)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v322), "w"(v324)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v322), "w"(v324)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v323), "w"(v325)); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v323), "w"(v325)); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v274 = 
svsub_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v322, v324); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v323, v325); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v323, v325); svint16_t v348 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v274, (float)(1ULL << 31ULL)))), @@ -3333,8 +3097,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu11(const armral_cmplx_f32_t *restrict x, int32_t *v725 = &v6[v501]; int32_t *v734 = &v6[v509]; int32_t *v743 = &v6[v517]; - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v540, v163, 0), v540, v163, 90); @@ -3374,150 +3137,94 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu11(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v611), v632)); svfloat32_t v622 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v620), v632)); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero171, v550, v170, 0), v550, v170, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v559, v177, 0), v559, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v568, v184, 0), v568, v184, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); 
svfloat32_t v192 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v577, v191, 0), v577, v191, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v586, v198, 0), v586, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v595, v205, 0), v595, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v604, v212, 0), v604, v212, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v613, v219, 0), v613, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v622, v226, 0), v622, v226, 90); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v164), "w"(v171)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v178), "w"(v185)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v192), "w"(v199)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v206), "w"(v213)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v220), "w"(v227)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v164), "w"(v171)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v178), "w"(v185)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v192), "w"(v199)); - svfloat32_t v236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v206), 
"w"(v213)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v220), "w"(v227)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v228), "w"(v229)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v230), "w"(v232)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v234), "w"(v235)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v233), "w"(v237)); - svfloat32_t v254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v229), "w"(v231)); - svfloat32_t v255; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v228), "w"(v231)); - svfloat32_t v256; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v229), "w"(v228)); - svfloat32_t v257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v232), "w"(v231)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v230), "w"(v231)); - svfloat32_t v259; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v232), "w"(v230)); - svfloat32_t v260; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v229), "w"(v232)); - svfloat32_t v261; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v228), "w"(v230)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v234), "w"(v236)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v233), "w"(v236)); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v233), "w"(v234)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v236), "w"(v237)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v235), "w"(v236)); - svfloat32_t v268; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v235), "w"(v237)); - svfloat32_t v269; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v234), "w"(v237)); - svfloat32_t v270; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v233), "w"(v235)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v231), "w"(v238)); - svfloat32_t v252; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v252) : "w"(v241), "w"(v242)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v239), "w"(v238)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v241), "w"(v242)); - svfloat32_t v298; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v255), "w"(v638)); - svfloat32_t v303; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v256), "w"(v639)); - svfloat32_t v313; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v258), "w"(v641)); - svfloat32_t v318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v259), "w"(v642)); - svfloat32_t zero340; - asm volatile("mov %0.s, #0" : "=w"(zero340)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v164, v171); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v228, v229); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v234, v235); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v233, v237); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v228, v231); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v229, v228); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v232, v231); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v230, v231); + svfloat32_t v259 = svsub_f32_x(svptrue_b32(), v232, v230); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v229, v232); + svfloat32_t v261 = svsub_f32_x(svptrue_b32(), v228, v230); + svfloat32_t v263 = 
svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v233, v236); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v233, v234); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v236, v237); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v235, v236); + svfloat32_t v268 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v234, v237); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v239, v238); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v241, v242); + svfloat32_t v298 = svmul_f32_x(svptrue_b32(), v255, v638); + svfloat32_t v303 = svmul_f32_x(svptrue_b32(), v256, v639); + svfloat32_t v313 = svmul_f32_x(svptrue_b32(), v258, v641); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v259, v642); + svfloat32_t zero340 = svdup_n_f32(0); svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v646, v263, 90); - svfloat32_t zero354; - asm volatile("mov %0.s, #0" : "=w"(zero354)); + svfloat32_t zero354 = svdup_n_f32(0); svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v648, v265, 90); - svfloat32_t zero361; - asm volatile("mov %0.s, #0" : "=w"(zero361)); + svfloat32_t zero361 = svdup_n_f32(0); svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v649, v266, 90); - svfloat32_t zero375; - asm volatile("mov %0.s, #0" : "=w"(zero375)); + svfloat32_t zero375 = svdup_n_f32(0); svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v651, v268, 90); - svfloat32_t zero382; - asm volatile("mov %0.s, #0" : "=w"(zero382)); + svfloat32_t zero382 = svdup_n_f32(0); svfloat32_t v382 = svcmla_f32_x(pred_full, zero382, v652, v269, 90); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v240), "w"(v239)); - svfloat32_t v253; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v252), "w"(v236)); - svfloat32_t v333; - asm("fmul %0.s, 
%1.s, %2.s" : "=w"(v333) : "w"(v262), "w"(v645)); - svfloat32_t zero396; - asm volatile("mov %0.s, #0" : "=w"(zero396)); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v240, v239); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v252, v236); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v262, v645); + svfloat32_t zero396 = svdup_n_f32(0); svfloat32_t v396 = svcmla_f32_x(pred_full, zero396, v654, v271, 90); svfloat32_t v398 = svmla_f32_x(pred_full, v298, v254, v637); svfloat32_t v399 = svmla_f32_x(pred_full, v303, v255, v638); @@ -3526,92 +3233,55 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu11(const armral_cmplx_f32_t *restrict x, svfloat32_t v402 = svmla_f32_x(pred_full, v318, v258, v641); svfloat32_t v403 = svnmls_f32_x(pred_full, v318, v257, v640); svfloat32_t v406 = svcmla_f32_x(pred_full, v354, v647, v264, 90); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v340), "w"(v354)); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v340, v354); svfloat32_t v408 = svcmla_f32_x(pred_full, v375, v650, v267, 90); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v361), "w"(v375)); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v633), "w"(v243)); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v361, v375); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v633, v243); + svfloat32_t zero288 = svdup_n_f32(0); svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v636, v253, 90); svfloat32_t v404 = svmla_f32_x(pred_full, v333, v261, v644); svfloat32_t v405 = svmla_f32_x(pred_full, v333, v260, v643); svfloat32_t v410 = svcmla_f32_x(pred_full, v396, v653, v270, 90); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v382), "w"(v396)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v406), "w"(v407)); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v382, v396); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v406, 
v407); svfloat32_t v397 = svmls_f32_x(pred_full, v251, v243, v635); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v402), "w"(v404)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v288), "w"(v408)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v410), "w"(v406)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v288), "w"(v411)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v411), "w"(v407)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v408)); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v288, v408); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v410, v406); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v288, v411); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v411, v407); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v408); svint16_t v446 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v251, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v397)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v397), "w"(v399)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v397), "w"(v403)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v397), "w"(v400)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v397), "w"(v398)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v422), "w"(v410)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v424), "w"(v288)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v426), "w"(v409)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v428), "w"(v288)); - svfloat32_t v432; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v431), "w"(v409)); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v397); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v397, v399); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v397, v400); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v397, v398); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v424, v288); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v426, v409); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v428, v288); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v409); svst1w_u64(pred_full, (unsigned *)(v662), svreinterpret_u64_s16(v446)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v414), "w"(v404)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v416), "w"(v405)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v418), "w"(v405)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v401)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v432), "w"(v288)); - svfloat32_t v435; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v413), "w"(v423)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v413), "w"(v423)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v421), "w"(v433)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v415), "w"(v425)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v417), "w"(v427)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v419), "w"(v429)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v419), "w"(v429)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v417), "w"(v427)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v415), "w"(v425)); - svfloat32_t v443; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v421), "w"(v433)); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v414, v404); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v405); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v418, v405); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v420, v401); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v432, v288); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v413, v423); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v421, v433); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v419, v429); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v417, v427); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v415, v425); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v421, v433); svint16_t v462 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v435, (float)(1ULL << 31ULL)))), @@ -4026,8 +3696,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu12(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v155])); svfloat32_t v170 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v169])); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v529, v205, 0), v529, v205, 90); @@ -4055,59 +3724,43 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu12(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v537), v558)); svfloat32_t v549 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v547), v558)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v457, v51, 0), v457, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v466, v58, 0), v466, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v475, v93, 0), v475, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v484, v100, 0), v484, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v502, v149, 0), v502, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v511, v156, 0), v511, v156, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v539, v212, 0), v539, v212, 90); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v52), "w"(v59)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v52), "w"(v59)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v94), "w"(v101)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v94), "w"(v101)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v150), "w"(v157)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v150), "w"(v157)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v206), 
"w"(v213)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v206), "w"(v213)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v228), "w"(v559)); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v228, v559); svfloat32_t v240 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v238, v493, v114, 0), v493, v114, 90); @@ -4117,65 +3770,40 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu12(const armral_cmplx_f32_t *restrict x, svfloat32_t v246 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v244, v549, v226, 0), v549, v226, 90); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v228), "w"(v241)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v228), "w"(v241)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v238), "w"(v244)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v238), "w"(v244)); - svfloat32_t v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v229), "w"(v242)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v229), "w"(v242)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v239), "w"(v245)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v239), "w"(v245)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v237), "w"(v243)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v237), "w"(v243)); - svfloat32_t v249; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v249) : 
"w"(v240), "w"(v246)); - svfloat32_t v250; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v240), "w"(v246)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v277), "w"(v279)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v277), "w"(v279)); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v228, v241); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v238, v244); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v229, v242); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v239, v245); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v237, v243); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v240, v246); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x(pred_full, zero304, v567, v280, 90); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v307), "w"(v309)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t zero333; - asm volatile("mov %0.s, #0" : "=w"(zero333)); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero333 = svdup_n_f32(0); svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v570, v308, 90); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v247), "w"(v249)); - svfloat32_t v252; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v247), "w"(v249)); - svfloat32_t zero274; - 
asm volatile("mov %0.s, #0" : "=w"(zero274)); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v247, v249); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v247, v249); + svfloat32_t zero274 = svdup_n_f32(0); svfloat32_t v274 = svcmla_f32_x(pred_full, zero274, v563, v250, 90); svfloat32_t v305 = svmla_f32_x(pred_full, v304, v278, v566); svfloat32_t v306 = svnmls_f32_x(pred_full, v304, v278, v566); - svfloat32_t zero319; - asm volatile("mov %0.s, #0" : "=w"(zero319)); + svfloat32_t zero319 = svdup_n_f32(0); svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v570, v311, 90); - svfloat32_t zero326; - asm volatile("mov %0.s, #0" : "=w"(zero326)); + svfloat32_t zero326 = svdup_n_f32(0); svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v570, v312, 90); svfloat32_t v339 = svmla_f32_x(pred_full, v333, v310, v571); svfloat32_t v340 = svmls_f32_x(pred_full, v333, v310, v571); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v248), "w"(v274)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v248), "w"(v274)); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v248, v274); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v248, v274); svfloat32_t v341 = svmla_f32_x(pred_full, v251, v281, v566); svint16_t v346 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -4188,23 +3816,17 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu12(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v252, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v341), "w"(v319)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v341), "w"(v319)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v276), "w"(v306)); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v341, v319); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), 
v276, v306); svint16_t v373 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v276, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v395), "w"(v326)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v395), "w"(v326)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v275), "w"(v305)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v395, v326); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v395, v326); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v275, v305); svint16_t v427 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v275, (float)(1ULL << 31ULL)))), @@ -4222,10 +3844,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu12(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v342, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v369; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v368), "w"(v340)); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v368), "w"(v340)); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v368, v340); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v368, v340); svint16_t v408 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v397, (float)(1ULL << 31ULL)))), @@ -4236,10 +3856,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu12(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v396, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v422), "w"(v339)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v422), "w"(v339)); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v339); + 
svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v422, v339); svst1w_u64(pred_full, (unsigned *)(v606), svreinterpret_u64_s16(v373)); svst1w_u64(pred_full, (unsigned *)(v660), svreinterpret_u64_s16(v427)); svint16_t v381 = svtbl_s16( @@ -4780,8 +4398,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu13(const armral_cmplx_f32_t *restrict x, int32_t *v833 = &v6[v573]; int32_t *v842 = &v6[v581]; int32_t *v851 = &v6[v589]; - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero192, v612, v191, 0), v612, v191, 90); @@ -4829,210 +4446,133 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu13(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v701), v722)); svfloat32_t v712 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v710), v722)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v622, v198, 0), v622, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v631, v205, 0), v631, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v640, v212, 0), v640, v212, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v649, v219, 0), v649, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v658, v226, 
0), v658, v226, 90); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero234, v667, v233, 0), v667, v233, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v676, v240, 0), v676, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v685, v247, 0), v685, v247, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero255, v694, v254, 0), v694, v254, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v703, v261, 0), v703, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v712, v268, 0), v712, v268, 90); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v192), "w"(v199)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v206), "w"(v213)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v220), "w"(v227)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v234), "w"(v241)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v248), "w"(v255)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v262), "w"(v269)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v192), "w"(v199)); - svfloat32_t v277; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v277) : "w"(v206), "w"(v213)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v220), "w"(v227)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v234), "w"(v241)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v248), "w"(v255)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v262), "w"(v269)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v271), "w"(v274)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v270), "w"(v272)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v277), "w"(v280)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v276), "w"(v278)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v271), "w"(v275)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v272), "w"(v273)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v270), "w"(v273)); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v274), "w"(v275)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v277), "w"(v281)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v276), "w"(v278)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v277), "w"(v280)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v276), "w"(v279)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v280), "w"(v281)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v278), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v275)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v273)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v281)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v289), "w"(v279)); - svfloat32_t 
v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v291), "w"(v292)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v293), "w"(v294)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v291), "w"(v292)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v293), "w"(v294)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v299), "w"(v300)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v301), "w"(v302)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v304)); - svfloat32_t zero392; - asm volatile("mov %0.s, #0" : "=w"(zero392)); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v248, v255); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v271, v274); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v270, v272); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v271, v275); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v272, v273); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v270, v273); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v274, v275); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v277, v281); + svfloat32_t v300 = 
svsub_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v277, v280); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v276, v279); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v275); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v273); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v281); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v279); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v291, v292); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v293, v294); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v299, v300); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v301, v302); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v304); + svfloat32_t zero392 = svdup_n_f32(0); svfloat32_t v392 = svcmla_f32_x(pred_full, zero392, v736, v299, 90); - svfloat32_t zero399; - asm volatile("mov %0.s, #0" : "=w"(zero399)); + svfloat32_t zero399 = svdup_n_f32(0); svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v737, v300, 90); - svfloat32_t zero413; - asm volatile("mov %0.s, #0" : "=w"(zero413)); + svfloat32_t zero413 = svdup_n_f32(0); svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v739, v301, 90); - svfloat32_t zero420; - asm volatile("mov %0.s, #0" : "=w"(zero420)); + svfloat32_t zero420 = svdup_n_f32(0); svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v740, v302, 90); - svfloat32_t zero434; - asm volatile("mov %0.s, #0" : "=w"(zero434)); + svfloat32_t zero434 = svdup_n_f32(0); svfloat32_t v434 = svcmla_f32_x(pred_full, zero434, v742, v303, 90); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v283), "w"(v285)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v285), "w"(v283)); - svfloat32_t v314; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v314) : "w"(v288), "w"(v290)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v295), "w"(v296)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v297), "w"(v298)); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v283, v285); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v285, v283); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v288, v290); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v295, v296); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v297, v298); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v727, v288, 90); - svfloat32_t zero348; - asm volatile("mov %0.s, #0" : "=w"(zero348)); + svfloat32_t zero348 = svdup_n_f32(0); svfloat32_t v348 = svcmla_f32_x(pred_full, zero348, v728, v290, 90); - svfloat32_t v360; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v295), "w"(v730)); - svfloat32_t zero406; - asm volatile("mov %0.s, #0" : "=w"(zero406)); + svfloat32_t v360 = svmul_f32_x(svptrue_b32(), v295, v730); + svfloat32_t zero406 = svdup_n_f32(0); svfloat32_t v406 = svcmla_f32_x(pred_full, zero406, v738, v317, 90); - svfloat32_t zero427; - asm volatile("mov %0.s, #0" : "=w"(zero427)); + svfloat32_t zero427 = svdup_n_f32(0); svfloat32_t v427 = svcmla_f32_x(pred_full, zero427, v741, v318, 90); - svfloat32_t zero448; - asm volatile("mov %0.s, #0" : "=w"(zero448)); + svfloat32_t zero448 = svdup_n_f32(0); svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v744, v319, 90); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v723), "w"(v286)); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v723, v286); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v729, v314, 90); - svfloat32_t v370; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v315), "w"(v732)); + 
svfloat32_t v370 = svmul_f32_x(svptrue_b32(), v315, v732); svfloat32_t v450 = svmla_f32_x(pred_full, v360, v296, v731); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v392), "w"(v406)); - svfloat32_t v463; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v399), "w"(v406)); - svfloat32_t v464; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v413), "w"(v427)); - svfloat32_t v465; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v420), "w"(v427)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v434), "w"(v448)); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v392, v406); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v399, v406); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v413, v427); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v420, v427); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v434, v448); svfloat32_t v467 = svcmla_f32_x(pred_full, v448, v743, v304, 90); svfloat32_t v449 = svmls_f32_x(pred_full, v312, v286, v725); svfloat32_t v451 = svmls_f32_x(pred_full, v450, v313, v726); svfloat32_t v452 = svmla_f32_x(pred_full, v370, v296, v731); svfloat32_t v454 = svnmls_f32_x(pred_full, v360, v315, v732); - svfloat32_t v468; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v341), "w"(v355)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v348), "w"(v355)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v462), "w"(v466)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v464), "w"(v466)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v463), "w"(v467)); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v348, v355); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v462, v466); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v464, v466); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v467); svint16_t v502 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, 
svmul_n_f32_x(pred_full, v312, (float)(1ULL << 31ULL)))), @@ -5043,64 +4583,37 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu13(const armral_cmplx_f32_t *restrict x, svfloat32_t v456 = svmla_f32_x(pred_full, v449, v297, v733); svfloat32_t v458 = svmls_f32_x(pred_full, v449, v298, v734); svfloat32_t v460 = svmls_f32_x(pred_full, v449, v297, v733); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v469), "w"(v462)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v467), "w"(v468)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v469)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v482), "w"(v469)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v484), "w"(v468)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v468), "w"(v463)); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v469, v462); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v467, v468); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v480, v469); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v482, v469); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v484, v468); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v468, v463); svst1w_u64(pred_full, (unsigned *)(v752), svreinterpret_u64_s16(v502)); svfloat32_t v457 = svmla_f32_x(pred_full, v456, v298, v734); svfloat32_t v459 = svmls_f32_x(pred_full, v458, v316, v735); svfloat32_t v461 = svmla_f32_x(pred_full, v460, v316, v735); - svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v476), "w"(v464)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v465)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v486), "w"(v465)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v451), "w"(v457)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v453), "w"(v459)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : 
"w"(v459), "w"(v453)); - svfloat32_t v473; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v455), "w"(v461)); - svfloat32_t v474; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v457), "w"(v451)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v461), "w"(v455)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v470), "w"(v477)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v471), "w"(v479)); - svfloat32_t v490; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v472), "w"(v481)); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v473), "w"(v483)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v474), "w"(v485)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v475), "w"(v487)); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v475), "w"(v487)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v474), "w"(v485)); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v473), "w"(v483)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v472), "w"(v481)); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v471), "w"(v479)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v470), "w"(v477)); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v476, v464); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v478, v465); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v486, v465); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v451, v457); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v453, v459); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v459, v453); + svfloat32_t v473 = svadd_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v457, v451); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v461, v455); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v470, v477); + svfloat32_t v489 = 
svadd_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v475, v487); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v474, v485); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v472, v481); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v471, v479); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v470, v477); svint16_t v510 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v488, (float)(1ULL << 31ULL)))), @@ -5629,8 +5142,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu14(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v162])); svfloat32_t v198 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v197])); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v682, v205, 0), v682, v205, 90); @@ -5666,200 +5178,126 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu14(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v709), v729)); svfloat32_t v720 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v718), v729)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v610, v37, 0), v610, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v619, v72, 0), v619, v72, 90); - svfloat32_t zero80; - asm 
volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v628, v79, 0), v628, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v637, v114, 0), v637, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v646, v121, 0), v646, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v655, v156, 0), v655, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v664, v163, 0), v664, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v673, v198, 0), v673, v198, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v693, v240, 0), v693, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v702, v247, 0), v702, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v711, v282, 0), v711, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : 
"=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v720, v289, 0), v720, v289, 90); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v730), "w"(v38)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v730), "w"(v38)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v73), "w"(v80)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v73), "w"(v80)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v115), "w"(v122)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v115), "w"(v122)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v157), "w"(v164)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v157), "w"(v164)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v199), "w"(v206)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v199), "w"(v206)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v241), "w"(v248)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v241), "w"(v248)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v283), "w"(v290)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v283), "w"(v290)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v300), "w"(v310)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v300), "w"(v310)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v306), "w"(v304)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v306), "w"(v304)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v302), "w"(v308)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v302), "w"(v308)); - svfloat32_t v401; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v301), 
"w"(v311)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v301), "w"(v311)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v307), "w"(v305)); - svfloat32_t v404; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v307), "w"(v305)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v303), "w"(v309)); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v303), "w"(v309)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v312), "w"(v314)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v312), "w"(v314)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v314), "w"(v316)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v316), "w"(v312)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v313), "w"(v315)); - svfloat32_t v326; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v313), "w"(v315)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v315), "w"(v317)); - svfloat32_t v328; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v317), "w"(v313)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v401), "w"(v403)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v401), "w"(v403)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v403), "w"(v405)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v405), "w"(v401)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v402), "w"(v404)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v402), "w"(v404)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v404), "w"(v406)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v406), "w"(v402)); - svfloat32_t v319; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v318), "w"(v316)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v325) : "w"(v324), "w"(v317)); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v730, v38); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v730, v38); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v300, v310); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v301, v311); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v404 = svsub_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v312, v314); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v312, v314); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v316, v312); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v313, v315); + 
svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v313, v315); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v315, v317); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v317, v313); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v403, v405); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v401); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v404); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v406, v402); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v318, v316); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v324, v317); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v746, v326, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v747, v327, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v748, v328, 90); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v407), "w"(v405)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v406)); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v405); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v413, v406); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v746, v415, 90); - svfloat32_t zero463; - asm volatile("mov %0.s, #0" : "=w"(zero463)); + svfloat32_t zero463 = svdup_n_f32(0); svfloat32_t v463 = svcmla_f32_x(pred_full, zero463, v747, v416, 90); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + 
svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v748, v417, 90); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v319), "w"(v298)); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v319, v298); + svfloat32_t zero360 = svdup_n_f32(0); svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v745, v325, 90); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v299)); - svfloat32_t zero449; - asm volatile("mov %0.s, #0" : "=w"(zero449)); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v299); + svfloat32_t zero449 = svdup_n_f32(0); svfloat32_t v449 = svcmla_f32_x(pred_full, zero449, v745, v414, 90); svfloat32_t v382 = svmla_f32_x(pred_full, v320, v319, v741); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v360), "w"(v367)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v360), "w"(v367)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v360), "w"(v374)); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v360, v367); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v360, v374); svfloat32_t v471 = svmla_f32_x(pred_full, v409, v408, v741); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v449), "w"(v456)); - svfloat32_t v480; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v449), "w"(v456)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v449), "w"(v463)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v449, v456); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v449, v456); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v449, v463); svint16_t v492 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v320, (float)(1ULL << 31ULL)))), @@ -5873,21 +5311,15 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu14(const 
armral_cmplx_f32_t *restrict x, svfloat32_t v383 = svmla_f32_x(pred_full, v382, v321, v742); svfloat32_t v385 = svmls_f32_x(pred_full, v382, v321, v742); svfloat32_t v387 = svmls_f32_x(pred_full, v382, v322, v743); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v374)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v381)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v381)); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v374); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, v381); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v381); svfloat32_t v472 = svmla_f32_x(pred_full, v471, v410, v742); svfloat32_t v474 = svmls_f32_x(pred_full, v471, v410, v742); svfloat32_t v476 = svmls_f32_x(pred_full, v471, v411, v743); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v463)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v470)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v482), "w"(v470)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v463); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v482, v470); svst1w_u64(pred_full, (unsigned *)(v756), svreinterpret_u64_s16(v492)); svst1w_u64(pred_full, (unsigned *)(v765), svreinterpret_u64_s16(v500)); svfloat32_t v384 = svmla_f32_x(pred_full, v383, v322, v743); @@ -5896,30 +5328,18 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu14(const armral_cmplx_f32_t *restrict x, svfloat32_t v473 = svmla_f32_x(pred_full, v472, v411, v743); svfloat32_t v475 = svmls_f32_x(pred_full, v474, v412, v744); svfloat32_t v477 = svmla_f32_x(pred_full, v476, v412, v744); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v384), "w"(v390)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v384), "w"(v390)); - svfloat32_t v397; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v397) : "w"(v386), "w"(v392)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v386), "w"(v392)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v388), "w"(v394)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v388), "w"(v394)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v473), "w"(v479)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v473), "w"(v479)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v475), "w"(v481)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v475), "w"(v481)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v477), "w"(v483)); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v477), "w"(v483)); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v384, v390); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v386, v392); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v473, v479); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v477, v483); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v477, v483); svint16_t v508 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v396, (float)(1ULL << 31ULL)))), @@ -6510,8 +5930,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v113])); svfloat32_t v149 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v148])); - svfloat32_t zero157; - asm volatile("mov 
%0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v662, v156, 0), v662, v156, 90); @@ -6555,73 +5974,53 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v716), v736)); svfloat32_t v727 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v725), v736)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v608, v51, 0), v608, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v617, v58, 0), v617, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v626, v93, 0), v626, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v635, v100, 0), v635, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v653, v149, 0), v653, v149, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v682, v205, 0), v682, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v691, v212, 0), v691, v212, 90); - svfloat32_t 
zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v709, v261, 0), v709, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v718, v268, 0), v718, v268, 90); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v52), "w"(v59)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v52), "w"(v59)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v94), "w"(v101)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v94), "w"(v101)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v150), "w"(v157)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v150), "w"(v157)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v206), "w"(v213)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v206), "w"(v213)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v262), "w"(v269)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v262), "w"(v269)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v284), "w"(v737)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v262, v269); + 
svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v284, v737); svfloat32_t v296 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v294, v644, v114, 0), v644, v114, 90); @@ -6634,86 +6033,51 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v305 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v303, v727, v282, 0), v727, v282, 90); - svfloat32_t v359; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v294), "w"(v303)); - svfloat32_t v360; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v294), "w"(v303)); - svfloat32_t v361; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v300), "w"(v297)); - svfloat32_t v362; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v300), "w"(v297)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v295), "w"(v304)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v295), "w"(v304)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v301), "w"(v298)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v301), "w"(v298)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v296), "w"(v305)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v296), "w"(v305)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v302), "w"(v299)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v302), "w"(v299)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v359), "w"(v361)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v359), "w"(v361)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v360), "w"(v362)); - svfloat32_t zero388; - asm volatile("mov %0.s, #0" : "=w"(zero388)); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v360 = svsub_f32_x(svptrue_b32(), v294, v303); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v300, v297); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), 
v300, v297); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v295, v304); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v301, v298); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v296, v305); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v302, v299); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t zero388 = svdup_n_f32(0); svfloat32_t v388 = svcmla_f32_x(pred_full, zero388, v747, v360, 90); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v412), "w"(v414)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v412), "w"(v414)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v413), "w"(v415)); - svfloat32_t v455; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v415), "w"(v755)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v306), "w"(v308)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v306), "w"(v308)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t zero335; - asm volatile("mov %0.s, #0" : "=w"(zero335)); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v455 = svmul_f32_x(svptrue_b32(), v415, v755); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero335 = svdup_n_f32(0); svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, 
v741, v307, 90); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v363), "w"(v284)); - svfloat32_t v376; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v363), "w"(v745)); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v363, v284); + svfloat32_t v376 = svmul_f32_x(svptrue_b32(), v363, v745); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v748, v365, 90); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v416), "w"(v285)); - svfloat32_t zero440; - asm volatile("mov %0.s, #0" : "=w"(zero440)); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v416, v285); + svfloat32_t zero440 = svdup_n_f32(0); svfloat32_t v440 = svcmla_f32_x(pred_full, zero440, v752, v417, 90); - svfloat32_t v450; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v418), "w"(v754)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v310), "w"(v293)); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); + svfloat32_t v450 = svmul_f32_x(svptrue_b32(), v418, v754); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v310, v293); + svfloat32_t zero342 = svdup_n_f32(0); svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v742, v312, 90); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v388), "w"(v395)); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v388, v395); svfloat32_t v407 = svcmla_f32_x(pred_full, v395, v749, v362, 90); - svfloat32_t zero426; - asm volatile("mov %0.s, #0" : "=w"(zero426)); + svfloat32_t zero426 = svdup_n_f32(0); svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v750, v419, 90); svfloat32_t v459 = svnmls_f32_x(pred_full, v450, v413, v753); svfloat32_t v460 = svmla_f32_x(pred_full, v455, v418, v754); svfloat32_t v350 = svmla_f32_x(pred_full, v313, v310, v739); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v335), "w"(v342)); + svfloat32_t v353 = 
svsub_f32_x(svptrue_b32(), v335, v342); svfloat32_t v354 = svcmla_f32_x(pred_full, v342, v743, v309, 90); svfloat32_t v403 = svmla_f32_x(pred_full, v376, v366, v744); svfloat32_t v456 = svcmla_f32_x(pred_full, v426, v751, v416, 90); @@ -6727,39 +6091,23 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v352 = svmls_f32_x(pred_full, v350, v311, v740); svfloat32_t v404 = svmla_f32_x(pred_full, v403, v364, v746); svfloat32_t v405 = svmls_f32_x(pred_full, v403, v364, v746); - svfloat32_t v457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v456), "w"(v440)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v456), "w"(v440)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v465), "w"(v426)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v465), "w"(v426)); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v456, v440); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v465, v426); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v465, v426); svst1w_u64(pred_full, (unsigned *)(v763), svreinterpret_u64_s16(v470)); - svfloat32_t v355; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v351), "w"(v353)); - svfloat32_t v356; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v351), "w"(v353)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v352), "w"(v354)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v352), "w"(v354)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v404), "w"(v406)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v404), "w"(v406)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v405), "w"(v407)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v405), "w"(v407)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v457), "w"(v459)); - svfloat32_t v462; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v457), "w"(v459)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v458), "w"(v460)); - svfloat32_t v464; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v458), "w"(v460)); + svfloat32_t v355 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v458, v460); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v458, v460); svint16_t v478 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v467, (float)(1ULL << 31ULL)))), @@ -6770,29 +6118,25 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu15(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v466, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v356), "w"(v409)); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v356, v409); svint16_t v497 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v356, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v358), "w"(v411)); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v358, v411); svint16_t v524 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v358, 
(float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v546; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v357), "w"(v410)); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v357, v410); svint16_t v551 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v357, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v573; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v573) : "w"(v355), "w"(v408)); + svfloat32_t v573 = svadd_f32_x(svptrue_b32(), v355, v408); svint16_t v578 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v355, (float)(1ULL << 31ULL)))), @@ -6800,22 +6144,14 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu15(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v772), svreinterpret_u64_s16(v478)); svst1w_u64(pred_full, (unsigned *)(v781), svreinterpret_u64_s16(v486)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v492), "w"(v462)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v492), "w"(v462)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v519), "w"(v464)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v519), "w"(v464)); - svfloat32_t v547; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v546), "w"(v463)); - svfloat32_t v548; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v546), "w"(v463)); - svfloat32_t v574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v573), "w"(v461)); - svfloat32_t v575; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v573), "w"(v461)); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v492, v462); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v492, v462); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v464); + svfloat32_t v521 = 
svsub_f32_x(svptrue_b32(), v519, v464); + svfloat32_t v547 = svadd_f32_x(svptrue_b32(), v546, v463); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v546, v463); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v573, v461); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v573, v461); svst1w_u64(pred_full, (unsigned *)(v790), svreinterpret_u64_s16(v497)); svst1w_u64(pred_full, (unsigned *)(v817), svreinterpret_u64_s16(v524)); svst1w_u64(pred_full, (unsigned *)(v844), svreinterpret_u64_s16(v551)); @@ -7355,8 +6691,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu16(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v155])); svfloat32_t v163 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v162])); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v729, v198, 0), v729, v198, 90); @@ -7402,210 +6737,137 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu16(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v783), v803)); svfloat32_t v794 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v792), v803)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v666, v37, 0), v666, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v675, v72, 0), v675, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v684, v79, 0), v684, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = 
svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v693, v114, 0), v693, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v702, v121, 0), v702, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v711, v156, 0), v711, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v720, v163, 0), v720, v163, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v739, v205, 0), v739, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v749, v240, 0), v749, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v758, v247, 0), v758, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v767, v282, 0), v767, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v776, v289, 0), v776, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); 
svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v785, v324, 0), v785, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v794, v331, 0), v794, v331, 90); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v804), "w"(v38)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v804), "w"(v38)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v73), "w"(v80)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v73), "w"(v80)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v115), "w"(v122)); - svfloat32_t v345; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v115), "w"(v122)); - svfloat32_t v346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v157), "w"(v164)); - svfloat32_t v347; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v157), "w"(v164)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v199), "w"(v206)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v199), "w"(v206)); - svfloat32_t v350; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v241), "w"(v248)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v241), "w"(v248)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v283), "w"(v290)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v283), "w"(v290)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v325), "w"(v332)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v325), "w"(v332)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v340), "w"(v342)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v340), "w"(v342)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v344), "w"(v346)); - 
svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v344), "w"(v346)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v348), "w"(v350)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v348), "w"(v350)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v352), "w"(v354)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v352), "w"(v354)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v345), "w"(v347)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v345), "w"(v347)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v349), "w"(v355)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v349), "w"(v355)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v351), "w"(v353)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v351), "w"(v353)); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v804, v38); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v804, v38); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v345 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v346 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v347 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v325, 
v332); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v344, v346); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v352, v354); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v345, v347); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v351, v353); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v351, v353); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x(pred_full, zero437, v814, v343, 90); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v356), "w"(v358)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v356), "w"(v358)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v360), "w"(v362)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v360), "w"(v362)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v361), "w"(v363)); - svfloat32_t v371; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v361), "w"(v363)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v374), "w"(v376)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v375), "w"(v377)); - svfloat32_t zero413; - asm volatile("mov %0.s, #0" : "=w"(zero413)); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v360, v362); + 
svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v375, v377); + svfloat32_t zero413 = svdup_n_f32(0); svfloat32_t v413 = svcmla_f32_x(pred_full, zero413, v814, v359, 90); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v815, v372, 90); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v819, v376, 90); - svfloat32_t v480; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v375), "w"(v821)); - svfloat32_t v485; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v377), "w"(v822)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v364), "w"(v366)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v364), "w"(v366)); - svfloat32_t zero401; - asm volatile("mov %0.s, #0" : "=w"(zero401)); + svfloat32_t v480 = svmul_f32_x(svptrue_b32(), v375, v821); + svfloat32_t v485 = svmul_f32_x(svptrue_b32(), v377, v822); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v364, v366); + svfloat32_t zero401 = svdup_n_f32(0); svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v814, v367, 90); - svfloat32_t zero420; - asm volatile("mov %0.s, #0" : "=w"(zero420)); + svfloat32_t zero420 = svdup_n_f32(0); svfloat32_t v420 = svcmla_f32_x(pred_full, zero420, v815, v370, 90); - svfloat32_t zero456; - asm volatile("mov %0.s, #0" : "=w"(zero456)); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v817, v378, 90); - svfloat32_t v475; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v379), "w"(v820)); + svfloat32_t v475 = svmul_f32_x(svptrue_b32(), v379, v820); 
svfloat32_t v496 = svmla_f32_x(pred_full, v341, v373, v816); svfloat32_t v497 = svmls_f32_x(pred_full, v341, v373, v816); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v437), "w"(v444)); - svfloat32_t v499; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v437), "w"(v444)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v365), "w"(v401)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v365), "w"(v401)); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v365, v401); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v365, v401); svfloat32_t v488 = svmla_f32_x(pred_full, v357, v371, v816); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v413), "w"(v420)); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v413, v420); svfloat32_t v490 = svmls_f32_x(pred_full, v357, v371, v816); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v420), "w"(v413)); + svfloat32_t v491 = svsub_f32_x(svptrue_b32(), v420, v413); svfloat32_t v500 = svcmla_f32_x(pred_full, v456, v818, v374, 90); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v456), "w"(v470)); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v456, v470); svfloat32_t v502 = svnmls_f32_x(pred_full, v475, v375, v821); svfloat32_t v503 = svnmls_f32_x(pred_full, v475, v377, v822); svfloat32_t v504 = svnmls_f32_x(pred_full, v480, v379, v820); svfloat32_t v505 = svnmls_f32_x(pred_full, v485, v379, v820); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v497), "w"(v499)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v497), "w"(v499)); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v497, v499); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v497, v499); svint16_t v532 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, 
svmul_n_f32_x(pred_full, v368, (float)(1ULL << 31ULL)))), @@ -7616,38 +6878,22 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu16(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v369, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v488), "w"(v489)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v490), "w"(v491)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v490), "w"(v491)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v488), "w"(v489)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v496), "w"(v502)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v496), "w"(v502)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v496), "w"(v504)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v496), "w"(v504)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v497), "w"(v505)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v497), "w"(v505)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v500), "w"(v498)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v500), "w"(v498)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v501), "w"(v503)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v501), "w"(v503)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v501), "w"(v499)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v501), "w"(v499)); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v506 = 
svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v496, v504); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v497, v505); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v500, v498); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v501, v503); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v501, v499); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v501, v499); svint16_t v564 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v487, (float)(1ULL << 31ULL)))), @@ -7660,22 +6906,14 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu16(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v830), svreinterpret_u64_s16(v532)); svst1w_u64(pred_full, (unsigned *)(v902), svreinterpret_u64_s16(v596)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v506), "w"(v516)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v507), "w"(v517)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v508), "w"(v517)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v509), "w"(v516)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v510), "w"(v518)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v511), "w"(v519)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v512), "w"(v521)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v513), "w"(v520)); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v506, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v507, v517); + svfloat32_t v524 
= svsub_f32_x(svptrue_b32(), v508, v517); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v509, v516); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v510, v518); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v511, v519); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v512, v521); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v513, v520); svint16_t v548 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v495, (float)(1ULL << 31ULL)))), @@ -8506,8 +7744,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu17(const armral_cmplx_f32_t *restrict x, int32_t *v1201 = &v6[v851]; int32_t *v1210 = &v6[v860]; int32_t *v1219 = &v6[v869]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v884, v51, 0), v884, v51, 90); @@ -8571,242 +7808,151 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu17(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1010), v1030)); svfloat32_t v1021 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1019), v1030)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v894, v58, 0), v894, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v904, v93, 0), v904, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v913, v100, 0), v913, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, 
zero136, v922, v135, 0), v922, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero143, v931, v142, 0), v931, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero178, v940, v177, 0), v940, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero185, v949, v184, 0), v949, v184, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero220, v958, v219, 0), v958, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero227, v967, v226, 0), v967, v226, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v976, v261, 0), v976, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v985, v268, 0), v985, v268, 90); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero304, v994, v303, 0), v994, v303, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero311, v1003, 
v310, 0), v1003, v310, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero346, v1012, v345, 0), v1012, v345, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v1021, v352, 0), v1021, v352, 90); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v52), "w"(v59)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v52), "w"(v59)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v94), "w"(v101)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v94), "w"(v101)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v136), "w"(v143)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v136), "w"(v143)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v178), "w"(v185)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v178), "w"(v185)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v220), "w"(v227)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v220), "w"(v227)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v262), "w"(v269)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v262), "w"(v269)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v304), "w"(v311)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v304), "w"(v311)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v346), "w"(v353)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v346), "w"(v353)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v354), "w"(v362)); - svfloat32_t v371; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v371) : "w"(v356), "w"(v364)); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v358), "w"(v366)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v360), "w"(v368)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v354), "w"(v362)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v356), "w"(v364)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v358), "w"(v366)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v360), "w"(v368)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v355), "w"(v359)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v357), "w"(v361)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v355), "w"(v359)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v369), "w"(v365)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v363), "w"(v367)); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v365), "w"(v369)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v363), "w"(v367)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v357), "w"(v361)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v355), "w"(v363)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v361), "w"(v369)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v370), "w"(v372)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v371), "w"(v373)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v370), "w"(v372)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v371), "w"(v373)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v377), "w"(v379)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v376), "w"(v378)); - 
svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v378), "w"(v379)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v376), "w"(v377)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v390), "w"(v391)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v394), "w"(v395)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v390), "w"(v391)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v394), "w"(v395)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v392), "w"(v393)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v396), "w"(v397)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v392), "w"(v393)); - svfloat32_t v408; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v396), "w"(v397)); - svfloat32_t v447; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v378), "w"(v1035)); - svfloat32_t zero614; - asm volatile("mov %0.s, #0" : "=w"(zero614)); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), 
v346, v353); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v354, v362); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v356, v364); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v358, v366); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v360, v368); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v355, v359); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v365, v369); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v363, v367); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v357, v361); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v355, v363); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v361, v369); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v377, v379); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v376, v378); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v378, v379); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v376, v377); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v390, v391); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v394, v395); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v407 = 
svsub_f32_x(svptrue_b32(), v392, v393); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v396, v397); + svfloat32_t v447 = svmul_f32_x(svptrue_b32(), v378, v1035); + svfloat32_t zero614 = svdup_n_f32(0); svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1062, v411, 90); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v374), "w"(v375)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v374), "w"(v375)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v385), "w"(v384)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v380), "w"(v381)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v398), "w"(v399)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v401), "w"(v402)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v404), "w"(v405)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v407), "w"(v408)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v405), "w"(v399)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v398), "w"(v404)); - svfloat32_t v457; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v380), "w"(v1037)); - svfloat32_t v462; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v381), "w"(v1038)); - svfloat32_t v492; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v387), "w"(v1044)); - svfloat32_t v497; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v388), "w"(v1045)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v355)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v415), "w"(v361)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v1031), "w"(v382)); - svfloat32_t v487; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v386), "w"(v1043)); - svfloat32_t zero523; - asm volatile("mov %0.s, #0" : "=w"(zero523)); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v374, v375); + 
svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v385, v384); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v380, v381); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v398, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v401, v402); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v404, v405); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v407, v408); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v405, v399); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v457 = svmul_f32_x(svptrue_b32(), v380, v1037); + svfloat32_t v462 = svmul_f32_x(svptrue_b32(), v381, v1038); + svfloat32_t v492 = svmul_f32_x(svptrue_b32(), v387, v1044); + svfloat32_t v497 = svmul_f32_x(svptrue_b32(), v388, v1045); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v355); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v415, v361); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v1031, v382); + svfloat32_t v487 = svmul_f32_x(svptrue_b32(), v386, v1043); + svfloat32_t zero523 = svdup_n_f32(0); svfloat32_t v523 = svcmla_f32_x(pred_full, zero523, v1049, v400, 90); - svfloat32_t zero544; - asm volatile("mov %0.s, #0" : "=w"(zero544)); + svfloat32_t zero544 = svdup_n_f32(0); svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1052, v403, 90); - svfloat32_t zero565; - asm volatile("mov %0.s, #0" : "=w"(zero565)); + svfloat32_t zero565 = svdup_n_f32(0); svfloat32_t v565 = svcmla_f32_x(pred_full, zero565, v1055, v406, 90); - svfloat32_t zero586; - asm volatile("mov %0.s, #0" : "=w"(zero586)); + svfloat32_t zero586 = svdup_n_f32(0); svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1058, v409, 90); svfloat32_t v652 = svmla_f32_x(pred_full, v492, v379, v1036); svfloat32_t v653 = svnmls_f32_x(pred_full, v447, v387, v1044); svfloat32_t v654 = svmla_f32_x(pred_full, v497, v377, v1034); svfloat32_t v655 = svnmls_f32_x(pred_full, v497, v376, v1033); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v414) : "w"(v413), "w"(v411)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v416), "w"(v363)); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v413, v411); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v363); svfloat32_t v650 = svmla_f32_x(pred_full, v487, v384, v1041); svfloat32_t v651 = svnmls_f32_x(pred_full, v487, v385, v1042); svfloat32_t v656 = svnmls_f32_x(pred_full, v462, v389, v1046); @@ -8825,113 +7971,65 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v427, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v417), "w"(v369)); - svfloat32_t zero635; - asm volatile("mov %0.s, #0" : "=w"(zero635)); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v417, v369); + svfloat32_t zero635 = svdup_n_f32(0); svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1065, v414, 90); svfloat32_t v659 = svmla_f32_x(pred_full, v658, v383, v1040); svfloat32_t v660 = svmls_f32_x(pred_full, v658, v383, v1040); - svfloat32_t v661; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v661) : "w"(v650), "w"(v652)); - svfloat32_t v663; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v651), "w"(v653)); - svfloat32_t v665; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v650), "w"(v654)); - svfloat32_t v667; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v651), "w"(v655)); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v677), "w"(v679)); - svfloat32_t v689; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v677), "w"(v679)); - svfloat32_t v690; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v680)); - svfloat32_t v691; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v678), "w"(v680)); - svfloat32_t v692; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v681), "w"(v683)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) 
: "w"(v683), "w"(v681)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v682), "w"(v684)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v684), "w"(v682)); + svfloat32_t v661 = svsub_f32_x(svptrue_b32(), v650, v652); + svfloat32_t v663 = svadd_f32_x(svptrue_b32(), v651, v653); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v650, v654); + svfloat32_t v667 = svadd_f32_x(svptrue_b32(), v651, v655); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v690 = svadd_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v691 = svsub_f32_x(svptrue_b32(), v678, v680); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v681, v683); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v683, v681); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v682, v684); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v684, v682); svst1w_u64(pred_full, (unsigned *)(v1075), svreinterpret_u64_s16(v726)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v414), "w"(v418)); - svfloat32_t zero642; - asm volatile("mov %0.s, #0" : "=w"(zero642)); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v414, v418); + svfloat32_t zero642 = svdup_n_f32(0); svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1066, v418, 90); - svfloat32_t v662; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v656), "w"(v659)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v657), "w"(v660)); - svfloat32_t v666; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v659), "w"(v656)); - svfloat32_t v668; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v660), "w"(v657)); - svfloat32_t v705; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v690), "w"(v694)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v689), "w"(v695)); - svfloat32_t v709; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v688), "w"(v692)); - svfloat32_t v711; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v711) : "w"(v695), "w"(v689)); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v688), "w"(v692)); - svfloat32_t v716; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v693), "w"(v691)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v694), "w"(v690)); - svfloat32_t v722; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v691), "w"(v693)); - svfloat32_t v669; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v661), "w"(v662)); - svfloat32_t v670; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v670) : "w"(v663), "w"(v664)); - svfloat32_t v671; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v665), "w"(v666)); - svfloat32_t v672; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v667), "w"(v668)); - svfloat32_t v673; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v662), "w"(v661)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v664), "w"(v663)); - svfloat32_t v675; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v666), "w"(v665)); - svfloat32_t v676; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v668), "w"(v667)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v635), "w"(v642)); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v656, v659); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v657, v660); + svfloat32_t v666 = svsub_f32_x(svptrue_b32(), v659, v656); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v660, v657); + svfloat32_t v705 = svadd_f32_x(svptrue_b32(), v690, v694); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v689, v695); + svfloat32_t v709 = svsub_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v711 = svsub_f32_x(svptrue_b32(), v695, v689); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v688, v692); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v693, v691); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v694, v690); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v691, v693); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v661, v662); + svfloat32_t v670 
= svadd_f32_x(svptrue_b32(), v663, v664); + svfloat32_t v671 = svadd_f32_x(svptrue_b32(), v665, v666); + svfloat32_t v672 = svadd_f32_x(svptrue_b32(), v667, v668); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v662, v661); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v664, v663); + svfloat32_t v675 = svsub_f32_x(svptrue_b32(), v666, v665); + svfloat32_t v676 = svsub_f32_x(svptrue_b32(), v668, v667); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v635, v642); svfloat32_t v685 = svcmla_f32_x(pred_full, v642, v1067, v419, 90); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v696), "w"(v696)); - svfloat32_t v723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v722), "w"(v696)); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v696, v696); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v722, v696); svfloat32_t v686 = svcmla_f32_x(pred_full, v685, v1059, v410, 90); - svfloat32_t v699; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v614), "w"(v698)); - svfloat32_t v702; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v685), "w"(v685)); - svfloat32_t v720; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v720) : "w"(v719), "w"(v698)); - svfloat32_t v768; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v676), "w"(v723)); - svfloat32_t v777; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v777) : "w"(v676), "w"(v723)); + svfloat32_t v699 = svsub_f32_x(svptrue_b32(), v614, v698); + svfloat32_t v702 = svadd_f32_x(svptrue_b32(), v685, v685); + svfloat32_t v720 = svadd_f32_x(svptrue_b32(), v719, v698); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v676, v723); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v676, v723); svfloat32_t v687 = svcmla_f32_x(pred_full, v686, v1060, v355, 90); svfloat32_t v697 = svcmla_f32_x(pred_full, v686, v1061, v363, 90); svfloat32_t v700 = svcmla_f32_x(pred_full, v699, v1063, v361, 90); svfloat32_t v701 = svcmla_f32_x(pred_full, v699, v1064, v369, 90); - svfloat32_t v703; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v702), 
"w"(v702)); - svfloat32_t v704; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v696), "w"(v702)); - svfloat32_t v710; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v710) : "w"(v709), "w"(v702)); - svfloat32_t v721; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v720), "w"(v702)); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v702, v702); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v696, v702); + svfloat32_t v710 = svadd_f32_x(svptrue_b32(), v709, v702); + svfloat32_t v721 = svadd_f32_x(svptrue_b32(), v720, v702); svint16_t v771 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v768, (float)(1ULL << 31ULL)))), @@ -8942,34 +8040,21 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v777, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v705), "w"(v697)); - svfloat32_t v708; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v707), "w"(v700)); - svfloat32_t v712; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v712) : "w"(v711), "w"(v704)); - svfloat32_t v714; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v713), "w"(v687)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v716), "w"(v701)); - svfloat32_t v750; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v750) : "w"(v671), "w"(v710)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v671), "w"(v710)); - svfloat32_t v858; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v858) : "w"(v675), "w"(v721)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v675), "w"(v721)); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v705, v697); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v707, v700); + svfloat32_t v712 = svsub_f32_x(svptrue_b32(), v711, v704); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v713, v687); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), 
v716, v701); + svfloat32_t v750 = svadd_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v671, v710); + svfloat32_t v858 = svadd_f32_x(svptrue_b32(), v675, v721); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v675, v721); svst1w_u64(pred_full, (unsigned *)(v1120), svreinterpret_u64_s16(v771)); svst1w_u64(pred_full, (unsigned *)(v1129), svreinterpret_u64_s16(v780)); - svfloat32_t v715; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v715) : "w"(v714), "w"(v696)); - svfloat32_t v718; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v717), "w"(v703)); - svfloat32_t v732; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v669), "w"(v706)); - svfloat32_t v741; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v741) : "w"(v669), "w"(v706)); + svfloat32_t v715 = svadd_f32_x(svptrue_b32(), v714, v696); + svfloat32_t v718 = svadd_f32_x(svptrue_b32(), v717, v703); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v669, v706); + svfloat32_t v741 = svsub_f32_x(svptrue_b32(), v669, v706); svint16_t v753 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v750, (float)(1ULL << 31ULL)))), @@ -8980,14 +8065,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v759, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v804; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v804) : "w"(v672), "w"(v712)); - svfloat32_t v813; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v813) : "w"(v672), "w"(v712)); - svfloat32_t v822; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v822) : "w"(v670), "w"(v708)); - svfloat32_t v831; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v831) : "w"(v670), "w"(v708)); + svfloat32_t v804 = svadd_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v813 = svsub_f32_x(svptrue_b32(), v672, v712); + svfloat32_t v822 = svadd_f32_x(svptrue_b32(), v670, v708); + svfloat32_t v831 = svsub_f32_x(svptrue_b32(), v670, v708); svint16_t 
v861 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v858, (float)(1ULL << 31ULL)))), @@ -9008,10 +8089,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v741, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v786; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v673), "w"(v715)); - svfloat32_t v795; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v673), "w"(v715)); + svfloat32_t v786 = svadd_f32_x(svptrue_b32(), v673, v715); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v673, v715); svint16_t v807 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v804, (float)(1ULL << 31ULL)))), @@ -9032,10 +8111,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v831, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v840; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v674), "w"(v718)); - svfloat32_t v849; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v849) : "w"(v674), "w"(v718)); + svfloat32_t v840 = svadd_f32_x(svptrue_b32(), v674, v718); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v674, v718); svst1w_u64(pred_full, (unsigned *)(v1102), svreinterpret_u64_s16(v753)); svst1w_u64(pred_full, (unsigned *)(v1111), svreinterpret_u64_s16(v762)); svst1w_u64(pred_full, (unsigned *)(v1210), svreinterpret_u64_s16(v861)); @@ -9634,8 +8711,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu18(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v204])); svfloat32_t v240 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v239])); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero248, v862, v247, 0), v862, v247, 90); @@ -9683,264 +8759,164 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu18(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v907), v927)); svfloat32_t v918 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v916), v927)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v772, v37, 0), v772, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v781, v72, 0), v781, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v790, v79, 0), v790, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v799, v114, 0), v799, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v808, v121, 0), v808, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v817, v156, 0), v817, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v826, v163, 0), v826, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = 
svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v835, v198, 0), v835, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v844, v205, 0), v844, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v853, v240, 0), v853, v240, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v873, v282, 0), v873, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v882, v289, 0), v882, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v891, v324, 0), v891, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero332, v900, v331, 0), v900, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v909, v366, 0), v909, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero374, v918, v373, 0), v918, v373, 90); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v928), "w"(v38)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) 
: "w"(v928), "w"(v38)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v73), "w"(v80)); - svfloat32_t v385; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v73), "w"(v80)); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v115), "w"(v122)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v115), "w"(v122)); - svfloat32_t v388; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v157), "w"(v164)); - svfloat32_t v389; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v157), "w"(v164)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v199), "w"(v206)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v199), "w"(v206)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v241), "w"(v248)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v241), "w"(v248)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v283), "w"(v290)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v283), "w"(v290)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v325), "w"(v332)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v325), "w"(v332)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v367), "w"(v374)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v367), "w"(v374)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v384), "w"(v398)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v384), "w"(v398)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v396), "w"(v386)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v396), "w"(v386)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v388), "w"(v394)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v388), "w"(v394)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v406) : "w"(v390), "w"(v392)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v390), "w"(v392)); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v385), "w"(v399)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v385), "w"(v399)); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v397), "w"(v387)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v397), "w"(v387)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v389), "w"(v395)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v389), "w"(v395)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v391), "w"(v393)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v391), "w"(v393)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v400), "w"(v402)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v401), "w"(v403)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v400), "w"(v402)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v402), "w"(v406)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v406), "w"(v400)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v401), "w"(v403)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v403), "w"(v407)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v407), "w"(v401)); - svfloat32_t zero448; - asm volatile("mov %0.s, #0" : "=w"(zero448)); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v928, v38); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v928, v38); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v388 = 
svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v384, v398); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v396, v386); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v388, v394); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v390, v392); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v385, v399); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v397, v387); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v389, v395); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v391, v393); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v400, v402); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v402, v406); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v406, v400); + 
svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v401, v403); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v403, v407); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v407, v401); + svfloat32_t zero448 = svdup_n_f32(0); svfloat32_t v448 = svcmla_f32_x(pred_full, zero448, v944, v405, 90); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v510), "w"(v512)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v511), "w"(v513)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v510), "w"(v512)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v512), "w"(v516)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v516), "w"(v510)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v511), "w"(v513)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v513), "w"(v517)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v517), "w"(v511)); - svfloat32_t zero558; - asm volatile("mov %0.s, #0" : "=w"(zero558)); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v510, v512); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v512, v516); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v516, v510); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v511, v513); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v513, v517); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v517, v511); + svfloat32_t zero558 = svdup_n_f32(0); svfloat32_t v558 = svcmla_f32_x(pred_full, zero558, v944, v515, 90); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v406)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v407)); - svfloat32_t zero470; - asm volatile("mov %0.s, #0" : "=w"(zero470)); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v406); + svfloat32_t v413 = 
svadd_f32_x(svptrue_b32(), v412, v407); + svfloat32_t zero470 = svdup_n_f32(0); svfloat32_t v470 = svcmla_f32_x(pred_full, zero470, v948, v417, 90); - svfloat32_t zero477; - asm volatile("mov %0.s, #0" : "=w"(zero477)); + svfloat32_t zero477 = svdup_n_f32(0); svfloat32_t v477 = svcmla_f32_x(pred_full, zero477, v949, v418, 90); - svfloat32_t zero484; - asm volatile("mov %0.s, #0" : "=w"(zero484)); + svfloat32_t zero484 = svdup_n_f32(0); svfloat32_t v484 = svcmla_f32_x(pred_full, zero484, v950, v419, 90); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v518), "w"(v516)); - svfloat32_t v523; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v522), "w"(v517)); - svfloat32_t zero580; - asm volatile("mov %0.s, #0" : "=w"(zero580)); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v518, v516); + svfloat32_t v523 = svadd_f32_x(svptrue_b32(), v522, v517); + svfloat32_t zero580 = svdup_n_f32(0); svfloat32_t v580 = svcmla_f32_x(pred_full, zero580, v948, v527, 90); - svfloat32_t zero587; - asm volatile("mov %0.s, #0" : "=w"(zero587)); + svfloat32_t zero587 = svdup_n_f32(0); svfloat32_t v587 = svcmla_f32_x(pred_full, zero587, v949, v528, 90); - svfloat32_t zero594; - asm volatile("mov %0.s, #0" : "=w"(zero594)); + svfloat32_t zero594 = svdup_n_f32(0); svfloat32_t v594 = svcmla_f32_x(pred_full, zero594, v950, v529, 90); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v409), "w"(v404)); - svfloat32_t v429; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v409), "w"(v941)); - svfloat32_t zero436; - asm volatile("mov %0.s, #0" : "=w"(zero436)); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v409, v404); + svfloat32_t v429 = svmul_f32_x(svptrue_b32(), v409, v941); + svfloat32_t zero436 = svdup_n_f32(0); svfloat32_t v436 = svcmla_f32_x(pred_full, zero436, v944, v413, 90); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v448), "w"(v470)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v448), 
"w"(v477)); - svfloat32_t v502; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v448), "w"(v470)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v519), "w"(v514)); - svfloat32_t v539; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v519), "w"(v941)); - svfloat32_t zero546; - asm volatile("mov %0.s, #0" : "=w"(zero546)); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v448, v477); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v448, v470); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v514); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v519, v941); + svfloat32_t zero546 = svdup_n_f32(0); svfloat32_t v546 = svcmla_f32_x(pred_full, zero546, v944, v523, 90); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v558), "w"(v580)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v558), "w"(v587)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v558), "w"(v580)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v410), "w"(v382)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v429), "w"(v429)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v477)); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v484)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v484)); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v383)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v539), "w"(v539)); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v587)); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v610), "w"(v594)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v612), "w"(v594)); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v610 = 
svsub_f32_x(svptrue_b32(), v558, v587); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v558, v580); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v410, v382); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v429, v429); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v477); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v484); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v502, v484); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v383); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v539, v539); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v587); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v610, v594); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v612, v594); svfloat32_t v486 = svmla_f32_x(pred_full, v485, v409, v941); svfloat32_t v490 = svmla_f32_x(pred_full, v411, v404, v943); svfloat32_t v596 = svmla_f32_x(pred_full, v595, v519, v941); @@ -9955,27 +8931,19 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu18(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v521, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v411), "w"(v486)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v490), "w"(v485)); - svfloat32_t v597; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v521), "w"(v596)); - svfloat32_t v601; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v600), "w"(v595)); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v411, v486); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v490, v485); + svfloat32_t v597 = svadd_f32_x(svptrue_b32(), v521, v596); + svfloat32_t v601 = svadd_f32_x(svptrue_b32(), v600, v595); svst1w_u64(pred_full, (unsigned *)(v958), svreinterpret_u64_s16(v622)); svst1w_u64(pred_full, (unsigned *)(v967), svreinterpret_u64_s16(v630)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v436)); - svfloat32_t v489; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v487), "w"(v436)); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v487, v436); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v487, v436); svfloat32_t v492 = svmla_f32_x(pred_full, v491, v414, v945); svfloat32_t v494 = svmls_f32_x(pred_full, v491, v415, v946); svfloat32_t v496 = svmls_f32_x(pred_full, v491, v414, v945); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v597), "w"(v546)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v597), "w"(v546)); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v597, v546); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v597, v546); svfloat32_t v602 = svmla_f32_x(pred_full, v601, v524, v945); svfloat32_t v604 = svmls_f32_x(pred_full, v601, v525, v946); svfloat32_t v606 = svmls_f32_x(pred_full, v601, v524, v945); @@ -10005,30 +8973,18 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu18(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v598, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v493), "w"(v499)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v493), "w"(v499)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v495), "w"(v501)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v495), "w"(v501)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v497), "w"(v503)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v497), "w"(v503)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v603), "w"(v609)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v603), "w"(v609)); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v605), "w"(v611)); - svfloat32_t v617; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v605), "w"(v611)); - 
svfloat32_t v618; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v607), "w"(v613)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v607), "w"(v613)); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v493, v499); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v495, v501); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v497, v503); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v603, v609); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v605, v611); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v605, v611); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v607, v613); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v613); svst1w_u64(pred_full, (unsigned *)(v1012), svreinterpret_u64_s16(v670)); svst1w_u64(pred_full, (unsigned *)(v1021), svreinterpret_u64_s16(v678)); svst1w_u64(pred_full, (unsigned *)(v1066), svreinterpret_u64_s16(v718)); @@ -10931,8 +9887,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu19(const armral_cmplx_f32_t *restrict x, int32_t *v1338 = &v6[v949]; int32_t *v1347 = &v6[v958]; int32_t *v1356 = &v6[v967]; - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v982, v51, 0), v982, v51, 90); @@ -11004,319 +9959,198 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu19(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1126), v1146)); svfloat32_t v1137 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1135), v1146)); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, 
v992, v58, 0), v992, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v1011, v93, 0), v1011, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero101, v1002, v100, 0), v1002, v100, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero136, v1020, v135, 0), v1020, v135, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1029, v142, 0), v1029, v142, 90); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero178, v1047, v177, 0), v1047, v177, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1038, v184, 0), v1038, v184, 90); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero220, v1056, v219, 0), v1056, v219, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1065, v226, 0), v1065, v226, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, 
v1083, v261, 0), v1083, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1074, v268, 0), v1074, v268, 90); - svfloat32_t zero304; - asm volatile("mov %0.s, #0" : "=w"(zero304)); + svfloat32_t zero304 = svdup_n_f32(0); svfloat32_t v304 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero304, v1092, v303, 0), v1092, v303, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero311, v1101, v310, 0), v1101, v310, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero346, v1119, v345, 0), v1119, v345, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v1110, v352, 0), v1110, v352, 90); - svfloat32_t zero388; - asm volatile("mov %0.s, #0" : "=w"(zero388)); + svfloat32_t zero388 = svdup_n_f32(0); svfloat32_t v388 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero388, v1128, v387, 0), v1128, v387, 90); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero395, v1137, v394, 0), v1137, v394, 90); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v52), "w"(v59)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v52), "w"(v59)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v101), "w"(v94)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v94), "w"(v101)); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v400) : "w"(v136), "w"(v143)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v136), "w"(v143)); - svfloat32_t v402; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v185), "w"(v178)); - svfloat32_t v403; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v178), "w"(v185)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v220), "w"(v227)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v220), "w"(v227)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v269), "w"(v262)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v262), "w"(v269)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v304), "w"(v311)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v304), "w"(v311)); - svfloat32_t v410; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v353), "w"(v346)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v346), "w"(v353)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v388), "w"(v395)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v388), "w"(v395)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v396), "w"(v408)); - svfloat32_t v415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v398), "w"(v410)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v400), "w"(v412)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v402), "w"(v408)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v404), "w"(v410)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v406), "w"(v412)); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v396), "w"(v402)); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v398), "w"(v404)); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v400), "w"(v406)); - svfloat32_t v454; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v397), "w"(v409)); - svfloat32_t v455; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v399), "w"(v411)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v401), "w"(v413)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v403), "w"(v409)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v405), "w"(v411)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v407), "w"(v413)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v397), "w"(v403)); - svfloat32_t v462; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v399), "w"(v405)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v401), "w"(v407)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v408)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v422), "w"(v410)); - svfloat32_t v425; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v424), "w"(v412)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v414), "w"(v416)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v417), "w"(v419)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v414), "w"(v417)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v416), "w"(v419)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v460), "w"(v409)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v462), "w"(v411)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v464), "w"(v413)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v454), "w"(v456)); - svfloat32_t v467; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v457), "w"(v459)); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v454), "w"(v457)); - svfloat32_t v477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v456), 
"w"(v459)); - svfloat32_t zero641; - asm volatile("mov %0.s, #0" : "=w"(zero641)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v101, v94); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v136, v143); + svfloat32_t v402 = svadd_f32_x(svptrue_b32(), v185, v178); + svfloat32_t v403 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v220, v227); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v269, v262); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v304, v311); + svfloat32_t v410 = svadd_f32_x(svptrue_b32(), v353, v346); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v346, v353); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v388, v395); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v396, v408); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v398, v410); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v400, v412); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v402, v408); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v404, v410); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v406, v412); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v396, v402); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v398, v404); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v400, v406); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v397, v409); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v399, v411); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v401, v413); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v403, v409); + svfloat32_t v458 = 
svsub_f32_x(svptrue_b32(), v405, v411); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v407, v413); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v397, v403); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v399, v405); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v401, v407); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v420, v408); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v422, v410); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v424, v412); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v414, v416); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v417, v419); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v414, v417); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v416, v419); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v460, v409); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v411); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v464, v413); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v467 = svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v454, v457); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v456, v459); + svfloat32_t zero641 = svdup_n_f32(0); svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1175, v457, 90); - svfloat32_t zero662; - asm volatile("mov %0.s, #0" : "=w"(zero662)); + svfloat32_t zero662 = svdup_n_f32(0); svfloat32_t v662 = svcmla_f32_x(pred_full, zero662, v1178, v459, 90); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v421), "w"(v423)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v427), "w"(v418)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v426), "w"(v415)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v427), "w"(v418)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v426), "w"(v415)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v414), "w"(v445)); - svfloat32_t v448; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v448) : "w"(v444), "w"(v419)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v421), "w"(v425)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v423), "w"(v425)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v461), "w"(v463)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v467), "w"(v458)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v466), "w"(v455)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v467), "w"(v458)); - svfloat32_t v474; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v466), "w"(v455)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v454), "w"(v477)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v476), "w"(v459)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v461), "w"(v465)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v463), "w"(v465)); - svfloat32_t v429; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v428), "w"(v425)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v439), "w"(v438)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v442), "w"(v441)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v446), "w"(v418)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v448), "w"(v415)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v451), "w"(v452)); - svfloat32_t v469; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v468), "w"(v465)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v471), "w"(v470)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v474), "w"(v473)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v458)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v480), "w"(v455)); - svfloat32_t 
v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v483), "w"(v484)); - svfloat32_t v505; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v439), "w"(v1151)); - svfloat32_t v520; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v442), "w"(v1154)); - svfloat32_t zero599; - asm volatile("mov %0.s, #0" : "=w"(zero599)); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v421, v423); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v426, v415); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v427, v418); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v426, v415); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v414, v445); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v444, v419); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v421, v425); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v461, v463); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v467, v458); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v466, v455); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v454, v477); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v476, v459); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v461, v465); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v463, v465); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v428, v425); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v439, v438); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v442, v441); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v446, v418); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v448, v415); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v451, v452); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v468, v465); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v471, v470); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v474, v473); + svfloat32_t v479 = 
svsub_f32_x(svptrue_b32(), v478, v458); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v480, v455); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v483, v484); + svfloat32_t v505 = svmul_f32_x(svptrue_b32(), v439, v1151); + svfloat32_t v520 = svmul_f32_x(svptrue_b32(), v442, v1154); + svfloat32_t zero599 = svdup_n_f32(0); svfloat32_t v599 = svcmla_f32_x(pred_full, zero599, v1169, v470, 90); - svfloat32_t zero620; - asm volatile("mov %0.s, #0" : "=w"(zero620)); + svfloat32_t zero620 = svdup_n_f32(0); svfloat32_t v620 = svcmla_f32_x(pred_full, zero620, v1172, v473, 90); - svfloat32_t zero704; - asm volatile("mov %0.s, #0" : "=w"(zero704)); + svfloat32_t zero704 = svdup_n_f32(0); svfloat32_t v704 = svcmla_f32_x(pred_full, zero704, v1184, v483, 90); - svfloat32_t zero711; - asm volatile("mov %0.s, #0" : "=w"(zero711)); + svfloat32_t zero711 = svdup_n_f32(0); svfloat32_t v711 = svcmla_f32_x(pred_full, zero711, v1185, v484, 90); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v1147), "w"(v429)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v447), "w"(v449)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v479), "w"(v481)); - svfloat32_t v510; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v440), "w"(v1152)); - svfloat32_t v525; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v443), "w"(v1155)); - svfloat32_t v585; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v453), "w"(v1167)); - svfloat32_t zero592; - asm volatile("mov %0.s, #0" : "=w"(zero592)); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v1147, v429); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v479, v481); + svfloat32_t v510 = svmul_f32_x(svptrue_b32(), v440, v1152); + svfloat32_t v525 = svmul_f32_x(svptrue_b32(), v443, v1155); + svfloat32_t v585 = svmul_f32_x(svptrue_b32(), v453, v1167); + svfloat32_t zero592 = svdup_n_f32(0); svfloat32_t v592 = svcmla_f32_x(pred_full, 
zero592, v1168, v469, 90); - svfloat32_t zero718; - asm volatile("mov %0.s, #0" : "=w"(zero718)); + svfloat32_t zero718 = svdup_n_f32(0); svfloat32_t v718 = svcmla_f32_x(pred_full, zero718, v1186, v485, 90); svfloat32_t v719 = svmla_f32_x(pred_full, v505, v438, v1150); svfloat32_t v720 = svmla_f32_x(pred_full, v520, v441, v1153); svfloat32_t v750 = svcmla_f32_x(pred_full, v599, v1170, v471, 90); svfloat32_t v751 = svcmla_f32_x(pred_full, v620, v1173, v474, 90); - svfloat32_t v570; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v450), "w"(v1164)); - svfloat32_t zero697; - asm volatile("mov %0.s, #0" : "=w"(zero697)); + svfloat32_t v570 = svmul_f32_x(svptrue_b32(), v450, v1164); + svfloat32_t zero697 = svdup_n_f32(0); svfloat32_t v697 = svcmla_f32_x(pred_full, zero697, v1183, v482, 90); - svfloat32_t v722; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v719), "w"(v720)); + svfloat32_t v722 = svadd_f32_x(svptrue_b32(), v719, v720); svfloat32_t v723 = svmla_f32_x(pred_full, v510, v438, v1150); svfloat32_t v724 = svmla_f32_x(pred_full, v525, v441, v1153); - svfloat32_t v741; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v741) : "w"(v719), "w"(v720)); + svfloat32_t v741 = svsub_f32_x(svptrue_b32(), v719, v720); svfloat32_t v743 = svnmls_f32_x(pred_full, v585, v451, v1165); svfloat32_t v744 = svnmls_f32_x(pred_full, v585, v452, v1166); svfloat32_t v745 = svmla_f32_x(pred_full, v437, v429, v1149); - svfloat32_t v753; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v753) : "w"(v750), "w"(v751)); + svfloat32_t v753 = svadd_f32_x(svptrue_b32(), v750, v751); svfloat32_t v754 = svcmla_f32_x(pred_full, v599, v1171, v472, 90); svfloat32_t v755 = svcmla_f32_x(pred_full, v620, v1174, v475, 90); - svfloat32_t v772; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v772) : "w"(v750), "w"(v751)); - svfloat32_t v774; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v774) : "w"(v704), "w"(v718)); - svfloat32_t v775; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v711), "w"(v718)); + svfloat32_t v772 = 
svsub_f32_x(svptrue_b32(), v750, v751); + svfloat32_t v774 = svsub_f32_x(svptrue_b32(), v704, v718); + svfloat32_t v775 = svsub_f32_x(svptrue_b32(), v711, v718); svint16_t v806 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v437, (float)(1ULL << 31ULL)))), @@ -11325,126 +10159,72 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu19(const armral_cmplx_f32_t *restrict x, svfloat32_t v721 = svmla_f32_x(pred_full, v570, v449, v1163); svfloat32_t v725 = svmla_f32_x(pred_full, v570, v447, v1162); svfloat32_t v726 = svnmls_f32_x(pred_full, v722, v417, v1156); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v723), "w"(v724)); - svfloat32_t v733; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v733) : "w"(v723), "w"(v724)); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v723, v724); + svfloat32_t v733 = svsub_f32_x(svptrue_b32(), v723, v724); svfloat32_t v738 = svmla_f32_x(pred_full, v722, v416, v1161); - svfloat32_t v746; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v746) : "w"(v745), "w"(v743)); - svfloat32_t v747; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v745), "w"(v743)); - svfloat32_t v749; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v749) : "w"(v745), "w"(v744)); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v743); + svfloat32_t v749 = svadd_f32_x(svptrue_b32(), v745, v744); svfloat32_t v752 = svcmla_f32_x(pred_full, v697, v1182, v481, 90); svfloat32_t v756 = svcmla_f32_x(pred_full, v697, v1181, v479, 90); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v641), "w"(v753)); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v754), "w"(v755)); - svfloat32_t v764; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v764) : "w"(v754), "w"(v755)); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v641, v753); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v754, v755); + svfloat32_t v764 = svsub_f32_x(svptrue_b32(), v754, v755); svfloat32_t v769 = 
svcmla_f32_x(pred_full, v753, v1180, v456, 90); - svfloat32_t v776; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v592), "w"(v774)); - svfloat32_t v777; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v777) : "w"(v592), "w"(v774)); - svfloat32_t v779; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v779) : "w"(v592), "w"(v775)); + svfloat32_t v776 = svadd_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v777 = svsub_f32_x(svptrue_b32(), v592, v774); + svfloat32_t v779 = svadd_f32_x(svptrue_b32(), v592, v775); svst1w_u64(pred_full, (unsigned *)(v1194), svreinterpret_u64_s16(v806)); svfloat32_t v728 = svnmls_f32_x(pred_full, v725, v419, v1159); svfloat32_t v729 = svmla_f32_x(pred_full, v721, v444, v1157); svfloat32_t v731 = svmla_f32_x(pred_full, v727, v445, v1160); - svfloat32_t v734; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v734) : "w"(v733), "w"(v721)); - svfloat32_t v735; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v735) : "w"(v726), "w"(v727)); - svfloat32_t v742; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v742) : "w"(v741), "w"(v725)); - svfloat32_t v748; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v748) : "w"(v747), "w"(v744)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v662), "w"(v756)); + svfloat32_t v734 = svadd_f32_x(svptrue_b32(), v733, v721); + svfloat32_t v735 = svadd_f32_x(svptrue_b32(), v726, v727); + svfloat32_t v742 = svadd_f32_x(svptrue_b32(), v741, v725); + svfloat32_t v748 = svsub_f32_x(svptrue_b32(), v747, v744); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v662, v756); svfloat32_t v760 = svcmla_f32_x(pred_full, v752, v1176, v476, 90); svfloat32_t v762 = svcmla_f32_x(pred_full, v758, v1179, v477, 90); - svfloat32_t v765; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v764), "w"(v752)); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v757), "w"(v758)); - svfloat32_t v773; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v773) : "w"(v772), "w"(v756)); - svfloat32_t v778; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v778) : "w"(v777), "w"(v775)); - svfloat32_t 
v730; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v729), "w"(v726)); - svfloat32_t v732; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v731), "w"(v728)); + svfloat32_t v765 = svadd_f32_x(svptrue_b32(), v764, v752); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v757, v758); + svfloat32_t v773 = svadd_f32_x(svptrue_b32(), v772, v756); + svfloat32_t v778 = svsub_f32_x(svptrue_b32(), v777, v775); + svfloat32_t v730 = svadd_f32_x(svptrue_b32(), v729, v726); + svfloat32_t v732 = svadd_f32_x(svptrue_b32(), v731, v728); svfloat32_t v736 = svmla_f32_x(pred_full, v735, v414, v1158); - svfloat32_t v739; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v739) : "w"(v738), "w"(v728)); - svfloat32_t v761; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v761) : "w"(v760), "w"(v757)); - svfloat32_t v763; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v762), "w"(v759)); + svfloat32_t v739 = svadd_f32_x(svptrue_b32(), v738, v728); + svfloat32_t v761 = svadd_f32_x(svptrue_b32(), v760, v757); + svfloat32_t v763 = svadd_f32_x(svptrue_b32(), v762, v759); svfloat32_t v767 = svcmla_f32_x(pred_full, v766, v1177, v454, 90); - svfloat32_t v770; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v770) : "w"(v769), "w"(v759)); - svfloat32_t v784; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v784) : "w"(v742), "w"(v734)); - svfloat32_t v788; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v788) : "w"(v749), "w"(v742)); - svfloat32_t v791; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v791) : "w"(v734), "w"(v749)); - svfloat32_t v796; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v796) : "w"(v773), "w"(v765)); - svfloat32_t v800; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v800) : "w"(v773), "w"(v779)); - svfloat32_t v803; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v803) : "w"(v765), "w"(v779)); - svfloat32_t v737; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v737) : "w"(v736), "w"(v725)); - svfloat32_t v740; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v740) : "w"(v739), "w"(v721)); - svfloat32_t v768; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v767), "w"(v756)); - svfloat32_t 
v771; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v770), "w"(v752)); - svfloat32_t v785; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v784), "w"(v749)); - svfloat32_t v789; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v789) : "w"(v730), "w"(v746)); - svfloat32_t v790; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v790) : "w"(v732), "w"(v748)); - svfloat32_t v797; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v797) : "w"(v796), "w"(v779)); - svfloat32_t v801; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v761), "w"(v776)); - svfloat32_t v802; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v763), "w"(v778)); - svfloat32_t v830; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v830) : "w"(v791), "w"(v803)); - svfloat32_t v839; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v791), "w"(v803)); - svfloat32_t v848; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v788), "w"(v800)); - svfloat32_t v857; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v857) : "w"(v788), "w"(v800)); - svfloat32_t v780; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v780) : "w"(v737), "w"(v730)); - svfloat32_t v782; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v782) : "w"(v740), "w"(v732)); - svfloat32_t v786; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v746), "w"(v737)); - svfloat32_t v787; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v787) : "w"(v748), "w"(v740)); - svfloat32_t v792; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v792) : "w"(v768), "w"(v761)); - svfloat32_t v794; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v771), "w"(v763)); - svfloat32_t v798; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v798) : "w"(v776), "w"(v768)); - svfloat32_t v799; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v778), "w"(v771)); + svfloat32_t v770 = svadd_f32_x(svptrue_b32(), v769, v759); + svfloat32_t v784 = svsub_f32_x(svptrue_b32(), v742, v734); + svfloat32_t v788 = svsub_f32_x(svptrue_b32(), v749, v742); + svfloat32_t v791 = svadd_f32_x(svptrue_b32(), v734, v749); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v773, v765); + svfloat32_t v800 = 
svsub_f32_x(svptrue_b32(), v773, v779); + svfloat32_t v803 = svadd_f32_x(svptrue_b32(), v765, v779); + svfloat32_t v737 = svadd_f32_x(svptrue_b32(), v736, v725); + svfloat32_t v740 = svadd_f32_x(svptrue_b32(), v739, v721); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v767, v756); + svfloat32_t v771 = svadd_f32_x(svptrue_b32(), v770, v752); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v784, v749); + svfloat32_t v789 = svadd_f32_x(svptrue_b32(), v730, v746); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v732, v748); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v796, v779); + svfloat32_t v801 = svadd_f32_x(svptrue_b32(), v761, v776); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v763, v778); + svfloat32_t v830 = svsub_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v791, v803); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v788, v800); + svfloat32_t v780 = svsub_f32_x(svptrue_b32(), v737, v730); + svfloat32_t v782 = svsub_f32_x(svptrue_b32(), v740, v732); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v746, v737); + svfloat32_t v787 = svsub_f32_x(svptrue_b32(), v748, v740); + svfloat32_t v792 = svsub_f32_x(svptrue_b32(), v768, v761); + svfloat32_t v794 = svsub_f32_x(svptrue_b32(), v771, v763); + svfloat32_t v798 = svsub_f32_x(svptrue_b32(), v776, v768); + svfloat32_t v799 = svsub_f32_x(svptrue_b32(), v778, v771); svint16_t v833 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v830, (float)(1ULL << 31ULL)))), @@ -11465,26 +10245,16 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu19(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v857, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v866; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v866) : "w"(v790), "w"(v802)); - svfloat32_t v875; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v875) : 
"w"(v790), "w"(v802)); - svfloat32_t v884; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v884) : "w"(v785), "w"(v797)); - svfloat32_t v893; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v785), "w"(v797)); - svfloat32_t v938; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v938) : "w"(v789), "w"(v801)); - svfloat32_t v947; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v947) : "w"(v789), "w"(v801)); - svfloat32_t v781; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v780), "w"(v746)); - svfloat32_t v783; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v783) : "w"(v782), "w"(v748)); - svfloat32_t v793; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v793) : "w"(v792), "w"(v776)); - svfloat32_t v795; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v794), "w"(v778)); + svfloat32_t v866 = svadd_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v875 = svsub_f32_x(svptrue_b32(), v790, v802); + svfloat32_t v884 = svadd_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v893 = svsub_f32_x(svptrue_b32(), v785, v797); + svfloat32_t v938 = svsub_f32_x(svptrue_b32(), v789, v801); + svfloat32_t v947 = svadd_f32_x(svptrue_b32(), v789, v801); + svfloat32_t v781 = svadd_f32_x(svptrue_b32(), v780, v746); + svfloat32_t v783 = svadd_f32_x(svptrue_b32(), v782, v748); + svfloat32_t v793 = svadd_f32_x(svptrue_b32(), v792, v776); + svfloat32_t v795 = svadd_f32_x(svptrue_b32(), v794, v778); svint16_t v869 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v866, (float)(1ULL << 31ULL)))), @@ -11505,14 +10275,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu19(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v893, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v902; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v902) : "w"(v787), "w"(v799)); - svfloat32_t v911; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v911) : "w"(v787), "w"(v799)); - svfloat32_t v920; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v920) : "w"(v786), "w"(v798)); - 
svfloat32_t v929; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v929) : "w"(v786), "w"(v798)); + svfloat32_t v902 = svadd_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v911 = svsub_f32_x(svptrue_b32(), v787, v799); + svfloat32_t v920 = svadd_f32_x(svptrue_b32(), v786, v798); + svfloat32_t v929 = svsub_f32_x(svptrue_b32(), v786, v798); svint16_t v941 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v938, (float)(1ULL << 31ULL)))), @@ -11527,10 +10293,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu19(const armral_cmplx_f32_t *restrict x, svst1w_u64(pred_full, (unsigned *)(v1230), svreinterpret_u64_s16(v842)); svst1w_u64(pred_full, (unsigned *)(v1239), svreinterpret_u64_s16(v851)); svst1w_u64(pred_full, (unsigned *)(v1248), svreinterpret_u64_s16(v860)); - svfloat32_t v812; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v812) : "w"(v781), "w"(v793)); - svfloat32_t v821; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v821) : "w"(v781), "w"(v793)); + svfloat32_t v812 = svadd_f32_x(svptrue_b32(), v781, v793); + svfloat32_t v821 = svsub_f32_x(svptrue_b32(), v781, v793); svint16_t v905 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v902, (float)(1ULL << 31ULL)))), @@ -11551,10 +10315,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu19(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v929, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v956; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v956) : "w"(v783), "w"(v795)); - svfloat32_t v965; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v965) : "w"(v783), "w"(v795)); + svfloat32_t v956 = svadd_f32_x(svptrue_b32(), v783, v795); + svfloat32_t v965 = svsub_f32_x(svptrue_b32(), v783, v795); svst1w_u64(pred_full, (unsigned *)(v1257), svreinterpret_u64_s16(v869)); svst1w_u64(pred_full, (unsigned *)(v1266), svreinterpret_u64_s16(v878)); svst1w_u64(pred_full, (unsigned *)(v1275), svreinterpret_u64_s16(v887)); @@ 
-12212,8 +10974,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu20(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v365])); svfloat32_t v373 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v372])); - svfloat32_t zero409; - asm volatile("mov %0.s, #0" : "=w"(zero409)); + svfloat32_t zero409 = svdup_n_f32(0); svfloat32_t v409 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero409, v997, v408, 0), v997, v408, 90); @@ -12255,269 +11016,174 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu20(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v986), v1017)); svfloat32_t v1007 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1005), v1017)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v844, v37, 0), v844, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v853, v72, 0), v853, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v862, v79, 0), v862, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero115, v871, v114, 0), v871, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero122, v880, v121, 0), v880, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero157, v889, v156, 0), v889, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero164, v898, v163, 0), v898, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero199, v907, v198, 0), v907, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v916, v205, 0), v916, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero241, v925, v240, 0), v925, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero248, v934, v247, 0), v934, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero283, v943, v282, 0), v943, v282, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero290, v952, v289, 0), v952, v289, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero325, v961, v324, 0), v961, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x(pred_full, 
svcmla_f32_x(pred_full, zero332, v970, v331, 0), v970, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero367, v979, v366, 0), v979, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero374, v988, v373, 0), v988, v373, 90); - svfloat32_t zero416; - asm volatile("mov %0.s, #0" : "=w"(zero416)); + svfloat32_t zero416 = svdup_n_f32(0); svfloat32_t v416 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero416, v1007, v415, 0), v1007, v415, 90); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v1018), "w"(v38)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v1018), "w"(v38)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v73), "w"(v80)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v73), "w"(v80)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v115), "w"(v122)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v115), "w"(v122)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v157), "w"(v164)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v157), "w"(v164)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v199), "w"(v206)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v199), "w"(v206)); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v241), "w"(v248)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v241), "w"(v248)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v283), "w"(v290)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v283), "w"(v290)); - svfloat32_t v444; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v444) : "w"(v325), "w"(v332)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v325), "w"(v332)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v367), "w"(v374)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v367), "w"(v374)); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v409), "w"(v416)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v409), "w"(v416)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v424), "w"(v426)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v424), "w"(v426)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v430), "w"(v432)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v430), "w"(v432)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v436), "w"(v438)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v436), "w"(v438)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v442), "w"(v444)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v442), "w"(v444)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v448), "w"(v450)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v448), "w"(v450)); - svfloat32_t v560; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v431), "w"(v449)); - svfloat32_t v561; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v431), "w"(v449)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v443), "w"(v437)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v443), "w"(v437)); - svfloat32_t v613; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v433), "w"(v451)); - svfloat32_t v614; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v433), "w"(v451)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v445), "w"(v439)); - svfloat32_t 
v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v445), "w"(v439)); - svfloat32_t v454; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v434), "w"(v452)); - svfloat32_t v455; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v434), "w"(v452)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v446), "w"(v440)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v446), "w"(v440)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v435), "w"(v453)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v435), "w"(v453)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v447), "w"(v441)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v447), "w"(v441)); - svfloat32_t v564; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v560), "w"(v562)); - svfloat32_t v565; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v560), "w"(v562)); - svfloat32_t v566; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v561), "w"(v563)); - svfloat32_t zero589; - asm volatile("mov %0.s, #0" : "=w"(zero589)); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v1018, v38); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v1018, v38); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v444 = 
svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v436, v438); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v448, v450); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v431, v449); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v443, v437); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v433, v451); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v445, v439); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v434, v452); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v446, v440); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v435, v453); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v447, v441); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v560, v562); + 
svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v561, v563); + svfloat32_t zero589 = svdup_n_f32(0); svfloat32_t v589 = svcmla_f32_x(pred_full, zero589, v1034, v561, 90); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v613), "w"(v615)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v613), "w"(v615)); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v614), "w"(v616)); - svfloat32_t v656; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v656) : "w"(v616), "w"(v1042)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v454), "w"(v456)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v454), "w"(v456)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v455), "w"(v457)); - svfloat32_t zero483; - asm volatile("mov %0.s, #0" : "=w"(zero483)); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v613, v615); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v614, v616); + svfloat32_t v656 = svmul_f32_x(svptrue_b32(), v616, v1042); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v455, v457); + svfloat32_t zero483 = svdup_n_f32(0); svfloat32_t v483 = svcmla_f32_x(pred_full, zero483, v1034, v455, 90); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v507), "w"(v509)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v507), "w"(v509)); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v508), "w"(v510)); - svfloat32_t zero536; - asm volatile("mov %0.s, #0" : "=w"(zero536)); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v507, v509); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v508, v510); + svfloat32_t zero536 = 
svdup_n_f32(0); svfloat32_t v536 = svcmla_f32_x(pred_full, zero536, v1034, v508, 90); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v564), "w"(v425)); - svfloat32_t zero596; - asm volatile("mov %0.s, #0" : "=w"(zero596)); + svfloat32_t v567 = svadd_f32_x(svptrue_b32(), v564, v425); + svfloat32_t zero596 = svdup_n_f32(0); svfloat32_t v596 = svcmla_f32_x(pred_full, zero596, v1035, v566, 90); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v617), "w"(v427)); - svfloat32_t zero641; - asm volatile("mov %0.s, #0" : "=w"(zero641)); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v617, v427); + svfloat32_t zero641 = svdup_n_f32(0); svfloat32_t v641 = svcmla_f32_x(pred_full, zero641, v1039, v618, 90); - svfloat32_t v651; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v651) : "w"(v619), "w"(v1041)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v458), "w"(v428)); - svfloat32_t zero490; - asm volatile("mov %0.s, #0" : "=w"(zero490)); + svfloat32_t v651 = svmul_f32_x(svptrue_b32(), v619, v1041); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v458, v428); + svfloat32_t zero490 = svdup_n_f32(0); svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v1035, v460, 90); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v511), "w"(v429)); - svfloat32_t zero543; - asm volatile("mov %0.s, #0" : "=w"(zero543)); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v511, v429); + svfloat32_t zero543 = svdup_n_f32(0); svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1035, v513, 90); svfloat32_t v604 = svmla_f32_x(pred_full, v567, v564, v1032); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v589), "w"(v596)); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v589, v596); svfloat32_t v608 = svcmla_f32_x(pred_full, v596, v1036, v563, 90); - svfloat32_t zero627; - asm volatile("mov %0.s, #0" : "=w"(zero627)); + svfloat32_t zero627 = svdup_n_f32(0); svfloat32_t v627 = svcmla_f32_x(pred_full, zero627, 
v1037, v620, 90); svfloat32_t v660 = svnmls_f32_x(pred_full, v651, v614, v1040); svfloat32_t v661 = svmla_f32_x(pred_full, v656, v619, v1041); svfloat32_t v498 = svmla_f32_x(pred_full, v461, v458, v1032); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v483), "w"(v490)); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v483, v490); svfloat32_t v502 = svcmla_f32_x(pred_full, v490, v1036, v457, 90); svfloat32_t v551 = svmla_f32_x(pred_full, v514, v511, v1032); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v536), "w"(v543)); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v536, v543); svfloat32_t v555 = svcmla_f32_x(pred_full, v543, v1036, v510, 90); svfloat32_t v605 = svmla_f32_x(pred_full, v604, v565, v1033); svfloat32_t v606 = svmls_f32_x(pred_full, v604, v565, v1033); svfloat32_t v657 = svcmla_f32_x(pred_full, v627, v1038, v617, 90); - svfloat32_t v666; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v567), "w"(v627)); - svfloat32_t v667; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v567), "w"(v627)); + svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v567, v627); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v567, v627); svint16_t v670 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v461, (float)(1ULL << 31ULL)))), @@ -12532,18 +11198,12 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu20(const armral_cmplx_f32_t *restrict x, svfloat32_t v500 = svmls_f32_x(pred_full, v498, v459, v1033); svfloat32_t v552 = svmla_f32_x(pred_full, v551, v512, v1033); svfloat32_t v553 = svmls_f32_x(pred_full, v551, v512, v1033); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v605), "w"(v607)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v605), "w"(v607)); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v606), "w"(v608)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v606), "w"(v608)); - svfloat32_t v658; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v658) : "w"(v657), "w"(v641)); - svfloat32_t v659; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v657), "w"(v641)); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v657, v641); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v657, v641); svint16_t v678 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v667, (float)(1ULL << 31ULL)))), @@ -12556,36 +11216,22 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu20(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v1050), svreinterpret_u64_s16(v670)); svst1w_u64(pred_full, (unsigned *)(v1068), svreinterpret_u64_s16(v686)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v499), "w"(v501)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v499), "w"(v501)); - svfloat32_t v505; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v500), "w"(v502)); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v500), "w"(v502)); - svfloat32_t v556; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v552), "w"(v554)); - svfloat32_t v557; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v552), "w"(v554)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v553), "w"(v555)); - svfloat32_t v559; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v553), "w"(v555)); - svfloat32_t v662; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v658), "w"(v660)); - svfloat32_t v663; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v658), "w"(v660)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v659), "w"(v661)); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : 
"w"(v659), "w"(v661)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v499, v501); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v500, v502); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v552, v554); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v662 = svadd_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v663 = svsub_f32_x(svptrue_b32(), v658, v660); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v659, v661); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v659, v661); svst1w_u64(pred_full, (unsigned *)(v1059), svreinterpret_u64_s16(v678)); svst1w_u64(pred_full, (unsigned *)(v1077), svreinterpret_u64_s16(v694)); - svfloat32_t v700; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v610), "w"(v663)); - svfloat32_t v701; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v610), "w"(v663)); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v610, v663); + svfloat32_t v701 = svsub_f32_x(svptrue_b32(), v610, v663); svint16_t v704 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v504, (float)(1ULL << 31ULL)))), @@ -12596,10 +11242,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu20(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v557, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v734; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v734) : "w"(v612), "w"(v665)); - svfloat32_t v735; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v735) : "w"(v612), "w"(v665)); + svfloat32_t v734 = svadd_f32_x(svptrue_b32(), v612, v665); + svfloat32_t v735 = svsub_f32_x(svptrue_b32(), v612, v665); svint16_t v738 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v506, 
(float)(1ULL << 31ULL)))), @@ -12610,10 +11254,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu20(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v559, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v768; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v611), "w"(v664)); - svfloat32_t v769; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v769) : "w"(v611), "w"(v664)); + svfloat32_t v768 = svadd_f32_x(svptrue_b32(), v611, v664); + svfloat32_t v769 = svsub_f32_x(svptrue_b32(), v611, v664); svint16_t v772 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v505, (float)(1ULL << 31ULL)))), @@ -12624,10 +11266,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu20(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v558, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v802; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v609), "w"(v662)); - svfloat32_t v803; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v803) : "w"(v609), "w"(v662)); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v609, v662); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v609, v662); svint16_t v806 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v503, (float)(1ULL << 31ULL)))), @@ -13466,8 +12106,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v267])); svfloat32_t v282 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v281])); - svfloat32_t zero318; - asm volatile("mov %0.s, #0" : "=w"(zero318)); + svfloat32_t zero318 = svdup_n_f32(0); svfloat32_t v318 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero318, v1016, v317, 0), v1016, v317, 90); @@ -13519,101 +12158,73 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu21(const armral_cmplx_f32_t *restrict x, 
svld1_gather_s64index_f64(pred_full, (const double *)(v1052), v1072)); svfloat32_t v1063 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1061), v1072)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v890, v51, 0), v890, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v899, v58, 0), v899, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v908, v93, 0), v908, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v917, v100, 0), v917, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v935, v149, 0), v935, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero157, v944, v156, 0), v944, v156, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v962, v205, 0), v962, v205, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero213, v971, v212, 0), v971, v212, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + 
svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero262, v989, v261, 0), v989, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero269, v998, v268, 0), v998, v268, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1026, v324, 0), v1026, v324, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1045, v373, 0), v1045, v373, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero381, v1054, v380, 0), v1054, v380, 90); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v52), "w"(v59)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v52), "w"(v59)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v94), "w"(v101)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v94), "w"(v101)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v150), "w"(v157)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v150), "w"(v157)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v206), "w"(v213)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v206), "w"(v213)); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v262), "w"(v269)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v262), "w"(v269)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v318), "w"(v325)); - 
svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v318), "w"(v325)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v374), "w"(v381)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v374), "w"(v381)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v396), "w"(v1073)); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v396, v1073); svfloat32_t v408 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v406, v926, v114, 0), v926, v114, 90); @@ -13632,173 +12243,98 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v423 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v421, v1063, v394, 0), v1063, v394, 90); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v406), "w"(v421)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v406), "w"(v421)); - svfloat32_t v515; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v415), "w"(v412)); - svfloat32_t v516; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v415), "w"(v412)); - svfloat32_t v517; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v517) : 
"w"(v409), "w"(v418)); - svfloat32_t v518; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v409), "w"(v418)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v407), "w"(v422)); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v407), "w"(v422)); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v416), "w"(v413)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v416), "w"(v413)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v410), "w"(v419)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v410), "w"(v419)); - svfloat32_t v424; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v408), "w"(v423)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v408), "w"(v423)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v417), "w"(v414)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v417), "w"(v414)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v411), "w"(v420)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v411), "w"(v420)); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v513), "w"(v515)); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v513), "w"(v515)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v515), "w"(v517)); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v517), "w"(v513)); - svfloat32_t v525; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v514), "w"(v516)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v514), "w"(v516)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v516), "w"(v518)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v518), "w"(v514)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v602), "w"(v604)); - svfloat32_t v611; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v611) : "w"(v602), "w"(v604)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v604), "w"(v606)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v606), "w"(v602)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v603), "w"(v605)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v603), "w"(v605)); - svfloat32_t v617; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v605), "w"(v607)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v607), "w"(v603)); - svfloat32_t v430; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v424), "w"(v426)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v424), "w"(v426)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v426), "w"(v428)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v428), "w"(v424)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v425), "w"(v427)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v425), "w"(v427)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v427), "w"(v429)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v429), "w"(v425)); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v519), "w"(v517)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v525), "w"(v518)); - svfloat32_t zero568; - asm volatile("mov %0.s, #0" : "=w"(zero568)); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v406, v421); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v415, v412); + svfloat32_t v517 = svadd_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v518 = svsub_f32_x(svptrue_b32(), v409, v418); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v603 = 
svsub_f32_x(svptrue_b32(), v407, v422); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v416, v413); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v410, v419); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v408, v423); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v417, v414); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v411, v420); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v513, v515); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v515, v517); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v517, v513); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v514, v516); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v516, v518); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v518, v514); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v602, v604); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v604, v606); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v606, v602); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v603, v605); + svfloat32_t v617 = svsub_f32_x(svptrue_b32(), v605, v607); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v607, v603); + svfloat32_t v430 = svadd_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v424, v426); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v428, v424); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v425, v427); + 
svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v429, v425); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v519, v517); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v525, v518); + svfloat32_t zero568 = svdup_n_f32(0); svfloat32_t v568 = svcmla_f32_x(pred_full, zero568, v1089, v527, 90); - svfloat32_t zero575; - asm volatile("mov %0.s, #0" : "=w"(zero575)); + svfloat32_t zero575 = svdup_n_f32(0); svfloat32_t v575 = svcmla_f32_x(pred_full, zero575, v1090, v528, 90); - svfloat32_t zero582; - asm volatile("mov %0.s, #0" : "=w"(zero582)); + svfloat32_t zero582 = svdup_n_f32(0); svfloat32_t v582 = svcmla_f32_x(pred_full, zero582, v1091, v529, 90); - svfloat32_t v609; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v606)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v614), "w"(v607)); - svfloat32_t zero639; - asm volatile("mov %0.s, #0" : "=w"(zero639)); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v606); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v614, v607); + svfloat32_t zero639 = svdup_n_f32(0); svfloat32_t v639 = svcmla_f32_x(pred_full, zero639, v1094, v611, 90); - svfloat32_t zero646; - asm volatile("mov %0.s, #0" : "=w"(zero646)); + svfloat32_t zero646 = svdup_n_f32(0); svfloat32_t v646 = svcmla_f32_x(pred_full, zero646, v1095, v612, 90); - svfloat32_t zero653; - asm volatile("mov %0.s, #0" : "=w"(zero653)); + svfloat32_t zero653 = svdup_n_f32(0); svfloat32_t v653 = svcmla_f32_x(pred_full, zero653, v1096, v613, 90); - svfloat32_t v663; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v663) : "w"(v616), "w"(v1098)); - svfloat32_t v668; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v617), "w"(v1099)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v428)); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v436), "w"(v429)); - svfloat32_t zero479; - asm volatile("mov %0.s, #0" : "=w"(zero479)); + svfloat32_t v663 
= svmul_f32_x(svptrue_b32(), v616, v1098); + svfloat32_t v668 = svmul_f32_x(svptrue_b32(), v617, v1099); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v428); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v436, v429); + svfloat32_t zero479 = svdup_n_f32(0); svfloat32_t v479 = svcmla_f32_x(pred_full, zero479, v1080, v438, 90); - svfloat32_t zero486; - asm volatile("mov %0.s, #0" : "=w"(zero486)); + svfloat32_t zero486 = svdup_n_f32(0); svfloat32_t v486 = svcmla_f32_x(pred_full, zero486, v1081, v439, 90); - svfloat32_t zero493; - asm volatile("mov %0.s, #0" : "=w"(zero493)); + svfloat32_t zero493 = svdup_n_f32(0); svfloat32_t v493 = svcmla_f32_x(pred_full, zero493, v1082, v440, 90); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v396)); - svfloat32_t v539; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v520), "w"(v1084)); - svfloat32_t zero561; - asm volatile("mov %0.s, #0" : "=w"(zero561)); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v396); + svfloat32_t v539 = svmul_f32_x(svptrue_b32(), v520, v1084); + svfloat32_t zero561 = svdup_n_f32(0); svfloat32_t v561 = svcmla_f32_x(pred_full, zero561, v1088, v526, 90); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v609), "w"(v397)); - svfloat32_t v432; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v431), "w"(v405)); - svfloat32_t zero472; - asm volatile("mov %0.s, #0" : "=w"(zero472)); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v609, v397); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v431, v405); + svfloat32_t zero472 = svdup_n_f32(0); svfloat32_t v472 = svcmla_f32_x(pred_full, zero472, v1079, v437, 90); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v561), "w"(v568)); - svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v561), "w"(v568)); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v561), "w"(v575)); - svfloat32_t zero625; - asm volatile("mov %0.s, #0" : "=w"(zero625)); + 
svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v561, v568); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v561, v575); + svfloat32_t zero625 = svdup_n_f32(0); svfloat32_t v625 = svcmla_f32_x(pred_full, zero625, v1092, v610, 90); svfloat32_t v681 = svmla_f32_x(pred_full, v663, v615, v1097); svfloat32_t v683 = svnmls_f32_x(pred_full, v663, v615, v1097); svfloat32_t v685 = svnmls_f32_x(pred_full, v668, v615, v1097); svfloat32_t v494 = svmla_f32_x(pred_full, v432, v431, v1075); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v472), "w"(v479)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v472), "w"(v479)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v472), "w"(v486)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v472, v479); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v472, v486); svfloat32_t v583 = svmla_f32_x(pred_full, v539, v521, v1083); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v590), "w"(v575)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v592), "w"(v582)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v594), "w"(v582)); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v575); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v582); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v582); svfloat32_t v674 = svcmla_f32_x(pred_full, v625, v1093, v609, 90); svfloat32_t v682 = svmla_f32_x(pred_full, v681, v617, v1099); svfloat32_t v684 = svmls_f32_x(pred_full, v683, v618, v1100); @@ -13812,25 +12348,17 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v495 = svmla_f32_x(pred_full, v494, v433, v1076); svfloat32_t v497 = svmls_f32_x(pred_full, v494, v433, v1076); svfloat32_t v499 = svmls_f32_x(pred_full, v494, v434, v1077); - svfloat32_t v502; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v501), "w"(v486)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v493)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v493)); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v501, v486); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v503, v493); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v505, v493); svfloat32_t v584 = svmla_f32_x(pred_full, v583, v522, v1085); svfloat32_t v586 = svmls_f32_x(pred_full, v583, v522, v1085); svfloat32_t v588 = svmls_f32_x(pred_full, v583, v523, v1086); - svfloat32_t v675; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v674), "w"(v639)); - svfloat32_t v677; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v674), "w"(v639)); - svfloat32_t v679; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v674), "w"(v646)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v693), "w"(v625)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v693), "w"(v625)); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v674, v639); + svfloat32_t v679 = svsub_f32_x(svptrue_b32(), v674, v646); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v693, v625); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v693, v625); svst1w_u64(pred_full, (unsigned *)(v1108), svreinterpret_u64_s16(v698)); svfloat32_t v496 = svmla_f32_x(pred_full, v495, v434, v1077); svfloat32_t v498 = svmls_f32_x(pred_full, v497, v435, v1078); @@ -13838,12 +12366,9 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v585 = svmla_f32_x(pred_full, v584, v523, v1086); svfloat32_t v587 = svmls_f32_x(pred_full, v586, v524, v1087); svfloat32_t v589 = svmla_f32_x(pred_full, v588, v524, v1087); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v675), "w"(v646)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : 
"w"(v677), "w"(v653)); - svfloat32_t v680; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v679), "w"(v653)); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v646); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v677, v653); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v653); svint16_t v706 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v695, (float)(1ULL << 31ULL)))), @@ -13854,110 +12379,74 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu21(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v694, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v496), "w"(v502)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v496), "w"(v502)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v498), "w"(v504)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v498), "w"(v504)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v500), "w"(v506)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v500), "w"(v506)); - svfloat32_t v596; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v585), "w"(v591)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v585), "w"(v591)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v587), "w"(v593)); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v587), "w"(v593)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v589), "w"(v595)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v589), "w"(v595)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v676), "w"(v682)); - svfloat32_t v688; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v676), "w"(v682)); - svfloat32_t v689; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v678), 
"w"(v684)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v684)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v680), "w"(v686)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v680), "w"(v686)); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v496, v502); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v498, v504); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v500, v506); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v585, v591); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v589, v595); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v676, v682); + svfloat32_t v688 = svsub_f32_x(svptrue_b32(), v676, v682); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v684); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v680, v686); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v680, v686); svst1w_u64(pred_full, (unsigned *)(v1117), svreinterpret_u64_s16(v706)); svst1w_u64(pred_full, (unsigned *)(v1126), svreinterpret_u64_s16(v714)); - svfloat32_t v720; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v720) : "w"(v508), "w"(v597)); + svfloat32_t v720 = svadd_f32_x(svptrue_b32(), v508, v597); svint16_t v725 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v508, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v747; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v510), "w"(v599)); + svfloat32_t v747 = 
svadd_f32_x(svptrue_b32(), v510, v599); svint16_t v752 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v510, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v774; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v774) : "w"(v511), "w"(v600)); + svfloat32_t v774 = svadd_f32_x(svptrue_b32(), v511, v600); svint16_t v779 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v511, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v801; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v512), "w"(v601)); + svfloat32_t v801 = svadd_f32_x(svptrue_b32(), v512, v601); svint16_t v806 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v512, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v828; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v828) : "w"(v509), "w"(v598)); + svfloat32_t v828 = svadd_f32_x(svptrue_b32(), v509, v598); svint16_t v833 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v509, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v855; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v855) : "w"(v507), "w"(v596)); + svfloat32_t v855 = svadd_f32_x(svptrue_b32(), v507, v596); svint16_t v860 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v507, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v721; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v720), "w"(v688)); - svfloat32_t v722; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v720), "w"(v688)); - svfloat32_t v748; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v748) : "w"(v747), "w"(v690)); 
- svfloat32_t v749; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v749) : "w"(v747), "w"(v690)); - svfloat32_t v775; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v774), "w"(v691)); - svfloat32_t v776; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v776) : "w"(v774), "w"(v691)); - svfloat32_t v802; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v801), "w"(v692)); - svfloat32_t v803; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v803) : "w"(v801), "w"(v692)); - svfloat32_t v829; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v829) : "w"(v828), "w"(v689)); - svfloat32_t v830; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v830) : "w"(v828), "w"(v689)); - svfloat32_t v856; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v855), "w"(v687)); - svfloat32_t v857; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v857) : "w"(v855), "w"(v687)); + svfloat32_t v721 = svadd_f32_x(svptrue_b32(), v720, v688); + svfloat32_t v722 = svsub_f32_x(svptrue_b32(), v720, v688); + svfloat32_t v748 = svadd_f32_x(svptrue_b32(), v747, v690); + svfloat32_t v749 = svsub_f32_x(svptrue_b32(), v747, v690); + svfloat32_t v775 = svadd_f32_x(svptrue_b32(), v774, v691); + svfloat32_t v776 = svsub_f32_x(svptrue_b32(), v774, v691); + svfloat32_t v802 = svadd_f32_x(svptrue_b32(), v801, v692); + svfloat32_t v803 = svsub_f32_x(svptrue_b32(), v801, v692); + svfloat32_t v829 = svadd_f32_x(svptrue_b32(), v828, v689); + svfloat32_t v830 = svsub_f32_x(svptrue_b32(), v828, v689); + svfloat32_t v856 = svadd_f32_x(svptrue_b32(), v855, v687); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v855, v687); svst1w_u64(pred_full, (unsigned *)(v1135), svreinterpret_u64_s16(v725)); svst1w_u64(pred_full, (unsigned *)(v1162), svreinterpret_u64_s16(v752)); svst1w_u64(pred_full, (unsigned *)(v1189), svreinterpret_u64_s16(v779)); @@ -14850,8 +13339,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v246])); svfloat32_t v282 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v281])); - 
svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero290, v1198, v289, 0), v1198, v289, 90); @@ -14911,340 +13399,208 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1261), v1281)); svfloat32_t v1272 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1270), v1281)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1090, v37, 0), v1090, v37, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1099, v72, 0), v1099, v72, 90); - svfloat32_t zero80; - asm volatile("mov %0.s, #0" : "=w"(zero80)); + svfloat32_t zero80 = svdup_n_f32(0); svfloat32_t v80 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero80, v1108, v79, 0), v1108, v79, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero115, v1117, v114, 0), v1117, v114, 90); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero122, v1126, v121, 0), v1126, v121, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero157, v1135, v156, 0), v1135, v156, 90); - svfloat32_t zero164; - asm volatile("mov %0.s, #0" : "=w"(zero164)); + svfloat32_t zero164 = svdup_n_f32(0); svfloat32_t v164 = svcmla_f32_x( pred_full, 
svcmla_f32_x(pred_full, zero164, v1144, v163, 0), v1144, v163, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero199, v1153, v198, 0), v1153, v198, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero206, v1162, v205, 0), v1162, v205, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero241, v1171, v240, 0), v1171, v240, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, #0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero248, v1180, v247, 0), v1180, v247, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero283, v1189, v282, 0), v1189, v282, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1209, v324, 0), v1209, v324, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero332, v1218, v331, 0), v1218, v331, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero367, v1227, v366, 0), v1227, v366, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( 
pred_full, svcmla_f32_x(pred_full, zero374, v1236, v373, 0), v1236, v373, 90); - svfloat32_t zero409; - asm volatile("mov %0.s, #0" : "=w"(zero409)); + svfloat32_t zero409 = svdup_n_f32(0); svfloat32_t v409 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero409, v1245, v408, 0), v1245, v408, 90); - svfloat32_t zero416; - asm volatile("mov %0.s, #0" : "=w"(zero416)); + svfloat32_t zero416 = svdup_n_f32(0); svfloat32_t v416 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero416, v1254, v415, 0), v1254, v415, 90); - svfloat32_t zero451; - asm volatile("mov %0.s, #0" : "=w"(zero451)); + svfloat32_t zero451 = svdup_n_f32(0); svfloat32_t v451 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero451, v1263, v450, 0), v1263, v450, 90); - svfloat32_t zero458; - asm volatile("mov %0.s, #0" : "=w"(zero458)); + svfloat32_t zero458 = svdup_n_f32(0); svfloat32_t v458 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero458, v1272, v457, 0), v1272, v457, 90); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v1282), "w"(v38)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v1282), "w"(v38)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v73), "w"(v80)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v73), "w"(v80)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v115), "w"(v122)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v115), "w"(v122)); - svfloat32_t v472; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v157), "w"(v164)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v157), "w"(v164)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v199), "w"(v206)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v199), "w"(v206)); - svfloat32_t v476; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v241), "w"(v248)); - svfloat32_t v477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v477) : 
"w"(v241), "w"(v248)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v283), "w"(v290)); - svfloat32_t v479; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v283), "w"(v290)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v325), "w"(v332)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v325), "w"(v332)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v367), "w"(v374)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v367), "w"(v374)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v409), "w"(v416)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v409), "w"(v416)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v451), "w"(v458)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v451), "w"(v458)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v468), "w"(v486)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v470), "w"(v484)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v472), "w"(v482)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v474), "w"(v480)); - svfloat32_t v492; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v476), "w"(v478)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v468), "w"(v486)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v470), "w"(v484)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v472), "w"(v482)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v474), "w"(v480)); - svfloat32_t v497; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v476), "w"(v478)); - svfloat32_t v697; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v697) : "w"(v469), "w"(v487)); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v471), "w"(v485)); - svfloat32_t v699; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v699) : "w"(v473), "w"(v483)); - svfloat32_t v700; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v475), "w"(v481)); - svfloat32_t v701; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v477), "w"(v479)); - svfloat32_t v702; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v469), "w"(v487)); - svfloat32_t v703; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v471), "w"(v485)); - svfloat32_t v704; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v473), "w"(v483)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v475), "w"(v481)); - svfloat32_t v706; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v477), "w"(v479)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v488), "w"(v489)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v490), "w"(v492)); - svfloat32_t v501; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v494), "w"(v495)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v493), "w"(v497)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v489), "w"(v491)); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v488), "w"(v491)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v489), "w"(v488)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v492), "w"(v491)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v490), "w"(v491)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v492), "w"(v490)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v489), "w"(v492)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v488), "w"(v490)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v494), "w"(v496)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : "w"(v493), "w"(v496)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v493), "w"(v494)); - svfloat32_t 
v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v496), "w"(v497)); - svfloat32_t v520; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v495), "w"(v496)); - svfloat32_t v521; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v495), "w"(v497)); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v494), "w"(v497)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v493), "w"(v495)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v697), "w"(v698)); - svfloat32_t v708; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v699), "w"(v701)); - svfloat32_t v710; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v710) : "w"(v703), "w"(v704)); - svfloat32_t v711; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v711) : "w"(v702), "w"(v706)); - svfloat32_t v716; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v698), "w"(v700)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v697), "w"(v700)); - svfloat32_t v718; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v698), "w"(v697)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v701), "w"(v700)); - svfloat32_t v720; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v720) : "w"(v699), "w"(v700)); - svfloat32_t v721; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v701), "w"(v699)); - svfloat32_t v722; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v722) : "w"(v698), "w"(v701)); - svfloat32_t v723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v697), "w"(v699)); - svfloat32_t v725; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v725) : "w"(v703), "w"(v705)); - svfloat32_t v726; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v726) : "w"(v702), "w"(v705)); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v702), "w"(v703)); - svfloat32_t v728; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v728) : "w"(v705), "w"(v706)); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v704), "w"(v705)); - svfloat32_t v730; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v704), 
"w"(v706)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v703), "w"(v706)); - svfloat32_t v732; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v702), "w"(v704)); - svfloat32_t v500; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v491), "w"(v498)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v501), "w"(v502)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v499), "w"(v498)); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v501), "w"(v502)); - svfloat32_t v551; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v508), "w"(v1308)); - svfloat32_t v556; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v509), "w"(v1309)); - svfloat32_t v566; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v511), "w"(v1311)); - svfloat32_t v571; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v512), "w"(v1312)); - svfloat32_t zero593; - asm volatile("mov %0.s, #0" : "=w"(zero593)); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v1282, v38); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v1282, v38); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v73, v80); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v157, v164); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v199, v206); + svfloat32_t v476 = svadd_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v477 = svsub_f32_x(svptrue_b32(), v241, v248); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v367, v374); 
+ svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v367, v374); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v409, v416); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v451, v458); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v468, v486); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v470, v484); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v472, v482); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v474, v480); + svfloat32_t v497 = svsub_f32_x(svptrue_b32(), v476, v478); + svfloat32_t v697 = svadd_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v700 = svadd_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v469, v487); + svfloat32_t v703 = svsub_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v473, v483); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v475, v481); + svfloat32_t v706 = svsub_f32_x(svptrue_b32(), v477, v479); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v488, v489); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v490, v492); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v494, v495); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v493, v497); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v489, v491); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v488, v491); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v489, v488); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), 
v492, v491); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v492, v490); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v489, v492); + svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v494, v496); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v493, v496); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v496, v497); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v495, v496); + svfloat32_t v521 = svsub_f32_x(svptrue_b32(), v495, v497); + svfloat32_t v522 = svadd_f32_x(svptrue_b32(), v494, v497); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v493, v495); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v697, v698); + svfloat32_t v708 = svadd_f32_x(svptrue_b32(), v699, v701); + svfloat32_t v710 = svsub_f32_x(svptrue_b32(), v703, v704); + svfloat32_t v711 = svadd_f32_x(svptrue_b32(), v702, v706); + svfloat32_t v716 = svsub_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v697, v700); + svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v698, v697); + svfloat32_t v719 = svsub_f32_x(svptrue_b32(), v701, v700); + svfloat32_t v720 = svsub_f32_x(svptrue_b32(), v699, v700); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v701, v699); + svfloat32_t v722 = svsub_f32_x(svptrue_b32(), v698, v701); + svfloat32_t v723 = svsub_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v725 = svadd_f32_x(svptrue_b32(), v703, v705); + svfloat32_t v726 = svsub_f32_x(svptrue_b32(), v702, v705); + svfloat32_t v727 = svadd_f32_x(svptrue_b32(), v702, v703); + svfloat32_t v728 = svsub_f32_x(svptrue_b32(), v705, v706); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v704, v705); + svfloat32_t v730 = svsub_f32_x(svptrue_b32(), v704, v706); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v703, v706); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v702, v704); + svfloat32_t v500 = 
svadd_f32_x(svptrue_b32(), v491, v498); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v499, v498); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v501, v502); + svfloat32_t v551 = svmul_f32_x(svptrue_b32(), v508, v1308); + svfloat32_t v556 = svmul_f32_x(svptrue_b32(), v509, v1309); + svfloat32_t v566 = svmul_f32_x(svptrue_b32(), v511, v1311); + svfloat32_t v571 = svmul_f32_x(svptrue_b32(), v512, v1312); + svfloat32_t zero593 = svdup_n_f32(0); svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v1316, v516, 90); - svfloat32_t zero607; - asm volatile("mov %0.s, #0" : "=w"(zero607)); + svfloat32_t zero607 = svdup_n_f32(0); svfloat32_t v607 = svcmla_f32_x(pred_full, zero607, v1318, v518, 90); - svfloat32_t zero614; - asm volatile("mov %0.s, #0" : "=w"(zero614)); + svfloat32_t zero614 = svdup_n_f32(0); svfloat32_t v614 = svcmla_f32_x(pred_full, zero614, v1319, v519, 90); - svfloat32_t zero628; - asm volatile("mov %0.s, #0" : "=w"(zero628)); + svfloat32_t zero628 = svdup_n_f32(0); svfloat32_t v628 = svcmla_f32_x(pred_full, zero628, v1321, v521, 90); - svfloat32_t zero635; - asm volatile("mov %0.s, #0" : "=w"(zero635)); + svfloat32_t zero635 = svdup_n_f32(0); svfloat32_t v635 = svcmla_f32_x(pred_full, zero635, v1322, v522, 90); - svfloat32_t v709; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v709) : "w"(v700), "w"(v707)); - svfloat32_t v714; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v710), "w"(v711)); - svfloat32_t v724; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v724) : "w"(v708), "w"(v707)); - svfloat32_t v733; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v733) : "w"(v710), "w"(v711)); - svfloat32_t v760; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v717), "w"(v1308)); - svfloat32_t v765; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v718), "w"(v1309)); - svfloat32_t v775; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v720), "w"(v1311)); - svfloat32_t v780; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v780) : "w"(v721), 
"w"(v1312)); - svfloat32_t zero802; - asm volatile("mov %0.s, #0" : "=w"(zero802)); + svfloat32_t v709 = svadd_f32_x(svptrue_b32(), v700, v707); + svfloat32_t v714 = svsub_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v724 = svsub_f32_x(svptrue_b32(), v708, v707); + svfloat32_t v733 = svadd_f32_x(svptrue_b32(), v710, v711); + svfloat32_t v760 = svmul_f32_x(svptrue_b32(), v717, v1308); + svfloat32_t v765 = svmul_f32_x(svptrue_b32(), v718, v1309); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v720, v1311); + svfloat32_t v780 = svmul_f32_x(svptrue_b32(), v721, v1312); + svfloat32_t zero802 = svdup_n_f32(0); svfloat32_t v802 = svcmla_f32_x(pred_full, zero802, v1316, v725, 90); - svfloat32_t zero816; - asm volatile("mov %0.s, #0" : "=w"(zero816)); + svfloat32_t zero816 = svdup_n_f32(0); svfloat32_t v816 = svcmla_f32_x(pred_full, zero816, v1318, v727, 90); - svfloat32_t zero823; - asm volatile("mov %0.s, #0" : "=w"(zero823)); + svfloat32_t zero823 = svdup_n_f32(0); svfloat32_t v823 = svcmla_f32_x(pred_full, zero823, v1319, v728, 90); - svfloat32_t zero837; - asm volatile("mov %0.s, #0" : "=w"(zero837)); + svfloat32_t zero837 = svdup_n_f32(0); svfloat32_t v837 = svcmla_f32_x(pred_full, zero837, v1321, v730, 90); - svfloat32_t zero844; - asm volatile("mov %0.s, #0" : "=w"(zero844)); + svfloat32_t zero844 = svdup_n_f32(0); svfloat32_t v844 = svcmla_f32_x(pred_full, zero844, v1322, v731, 90); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v500), "w"(v499)); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v496)); - svfloat32_t v586; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v515), "w"(v1315)); - svfloat32_t zero649; - asm volatile("mov %0.s, #0" : "=w"(zero649)); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v500, v499); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v505, v496); + svfloat32_t v586 = svmul_f32_x(svptrue_b32(), v515, v1315); + svfloat32_t zero649 = svdup_n_f32(0); svfloat32_t v649 = 
svcmla_f32_x(pred_full, zero649, v1324, v524, 90); svfloat32_t v651 = svmla_f32_x(pred_full, v551, v507, v1307); svfloat32_t v652 = svmla_f32_x(pred_full, v556, v508, v1308); @@ -15253,19 +13609,13 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v655 = svmla_f32_x(pred_full, v571, v511, v1311); svfloat32_t v656 = svnmls_f32_x(pred_full, v571, v510, v1310); svfloat32_t v659 = svcmla_f32_x(pred_full, v607, v1317, v517, 90); - svfloat32_t v660; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v593), "w"(v607)); + svfloat32_t v660 = svsub_f32_x(svptrue_b32(), v593, v607); svfloat32_t v661 = svcmla_f32_x(pred_full, v628, v1320, v520, 90); - svfloat32_t v662; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v662) : "w"(v614), "w"(v628)); - svfloat32_t v712; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v712) : "w"(v709), "w"(v708)); - svfloat32_t v715; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v715) : "w"(v714), "w"(v705)); - svfloat32_t v795; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v724), "w"(v1315)); - svfloat32_t zero858; - asm volatile("mov %0.s, #0" : "=w"(zero858)); + svfloat32_t v662 = svsub_f32_x(svptrue_b32(), v614, v628); + svfloat32_t v712 = svadd_f32_x(svptrue_b32(), v709, v708); + svfloat32_t v715 = svsub_f32_x(svptrue_b32(), v714, v705); + svfloat32_t v795 = svmul_f32_x(svptrue_b32(), v724, v1315); + svfloat32_t zero858 = svdup_n_f32(0); svfloat32_t v858 = svcmla_f32_x(pred_full, zero858, v1324, v733, 90); svfloat32_t v860 = svmla_f32_x(pred_full, v760, v716, v1307); svfloat32_t v861 = svmla_f32_x(pred_full, v765, v717, v1308); @@ -15274,61 +13624,39 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v864 = svmla_f32_x(pred_full, v780, v720, v1311); svfloat32_t v865 = svnmls_f32_x(pred_full, v780, v719, v1310); svfloat32_t v868 = svcmla_f32_x(pred_full, v816, v1317, v726, 90); - svfloat32_t v869; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v869) : "w"(v802), "w"(v816)); + svfloat32_t v869 = 
svsub_f32_x(svptrue_b32(), v802, v816); svfloat32_t v870 = svcmla_f32_x(pred_full, v837, v1320, v729, 90); - svfloat32_t v871; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v871) : "w"(v823), "w"(v837)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v466), "w"(v503)); - svfloat32_t zero541; - asm volatile("mov %0.s, #0" : "=w"(zero541)); + svfloat32_t v871 = svsub_f32_x(svptrue_b32(), v823, v837); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v466, v503); + svfloat32_t zero541 = svdup_n_f32(0); svfloat32_t v541 = svcmla_f32_x(pred_full, zero541, v1306, v506, 90); svfloat32_t v657 = svmla_f32_x(pred_full, v586, v514, v1314); svfloat32_t v658 = svmla_f32_x(pred_full, v586, v513, v1313); svfloat32_t v663 = svcmla_f32_x(pred_full, v649, v1323, v523, 90); - svfloat32_t v664; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v635), "w"(v649)); - svfloat32_t v683; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v683) : "w"(v659), "w"(v660)); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v467), "w"(v712)); - svfloat32_t zero750; - asm volatile("mov %0.s, #0" : "=w"(zero750)); + svfloat32_t v664 = svsub_f32_x(svptrue_b32(), v635, v649); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v659, v660); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v467, v712); + svfloat32_t zero750 = svdup_n_f32(0); svfloat32_t v750 = svcmla_f32_x(pred_full, zero750, v1306, v715, 90); svfloat32_t v866 = svmla_f32_x(pred_full, v795, v723, v1314); svfloat32_t v867 = svmla_f32_x(pred_full, v795, v722, v1313); svfloat32_t v872 = svcmla_f32_x(pred_full, v858, v1323, v732, 90); - svfloat32_t v873; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v844), "w"(v858)); - svfloat32_t v892; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v892) : "w"(v868), "w"(v869)); + svfloat32_t v873 = svsub_f32_x(svptrue_b32(), v844, v858); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v868, v869); svfloat32_t v650 = svmls_f32_x(pred_full, v504, v503, v1305); - svfloat32_t v665; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v665) : "w"(v655), "w"(v657)); - svfloat32_t v675; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v541), "w"(v661)); - svfloat32_t v677; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v663), "w"(v659)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v541), "w"(v664)); - svfloat32_t v681; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v664), "w"(v660)); - svfloat32_t v684; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v684) : "w"(v683), "w"(v661)); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v655, v657); + svfloat32_t v675 = svadd_f32_x(svptrue_b32(), v541, v661); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v663, v659); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v541, v664); + svfloat32_t v681 = svsub_f32_x(svptrue_b32(), v664, v660); + svfloat32_t v684 = svadd_f32_x(svptrue_b32(), v683, v661); svfloat32_t v859 = svmls_f32_x(pred_full, v713, v712, v1305); - svfloat32_t v874; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v874) : "w"(v864), "w"(v866)); - svfloat32_t v884; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v884) : "w"(v750), "w"(v870)); - svfloat32_t v886; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v886) : "w"(v872), "w"(v868)); - svfloat32_t v888; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v888) : "w"(v750), "w"(v873)); - svfloat32_t v890; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v890) : "w"(v873), "w"(v869)); - svfloat32_t v893; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v892), "w"(v870)); + svfloat32_t v874 = svadd_f32_x(svptrue_b32(), v864, v866); + svfloat32_t v884 = svadd_f32_x(svptrue_b32(), v750, v870); + svfloat32_t v886 = svsub_f32_x(svptrue_b32(), v872, v868); + svfloat32_t v888 = svadd_f32_x(svptrue_b32(), v750, v873); + svfloat32_t v890 = svsub_f32_x(svptrue_b32(), v873, v869); + svfloat32_t v893 = svadd_f32_x(svptrue_b32(), v892, v870); svint16_t v908 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v504, (float)(1ULL << 31ULL)))), @@ -15339,108 +13667,58 @@ void 
armral_fft_cf32_cf32_cs16_ab_t_gu22(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v713, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v666; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v665), "w"(v650)); - svfloat32_t v667; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v667) : "w"(v650), "w"(v652)); - svfloat32_t v669; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v650), "w"(v656)); - svfloat32_t v671; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v671) : "w"(v650), "w"(v653)); - svfloat32_t v673; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v650), "w"(v651)); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v675), "w"(v663)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v677), "w"(v541)); - svfloat32_t v680; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v679), "w"(v662)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v681), "w"(v541)); - svfloat32_t v685; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v685) : "w"(v684), "w"(v662)); - svfloat32_t v875; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v875) : "w"(v874), "w"(v859)); - svfloat32_t v876; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v876) : "w"(v859), "w"(v861)); - svfloat32_t v878; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v878) : "w"(v859), "w"(v865)); - svfloat32_t v880; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v880) : "w"(v859), "w"(v862)); - svfloat32_t v882; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v882) : "w"(v859), "w"(v860)); - svfloat32_t v885; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v885) : "w"(v884), "w"(v872)); - svfloat32_t v887; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v887) : "w"(v886), "w"(v750)); - svfloat32_t v889; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v889) : "w"(v888), "w"(v871)); - svfloat32_t v891; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v891) : "w"(v890), "w"(v750)); - svfloat32_t v894; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v893), "w"(v871)); + 
svfloat32_t v666 = svadd_f32_x(svptrue_b32(), v665, v650); + svfloat32_t v667 = svsub_f32_x(svptrue_b32(), v650, v652); + svfloat32_t v669 = svadd_f32_x(svptrue_b32(), v650, v656); + svfloat32_t v671 = svsub_f32_x(svptrue_b32(), v650, v653); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v650, v651); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v675, v663); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v677, v541); + svfloat32_t v680 = svadd_f32_x(svptrue_b32(), v679, v662); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v681, v541); + svfloat32_t v685 = svadd_f32_x(svptrue_b32(), v684, v662); + svfloat32_t v875 = svadd_f32_x(svptrue_b32(), v874, v859); + svfloat32_t v876 = svsub_f32_x(svptrue_b32(), v859, v861); + svfloat32_t v878 = svadd_f32_x(svptrue_b32(), v859, v865); + svfloat32_t v880 = svsub_f32_x(svptrue_b32(), v859, v862); + svfloat32_t v882 = svadd_f32_x(svptrue_b32(), v859, v860); + svfloat32_t v885 = svadd_f32_x(svptrue_b32(), v884, v872); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v886, v750); + svfloat32_t v889 = svadd_f32_x(svptrue_b32(), v888, v871); + svfloat32_t v891 = svsub_f32_x(svptrue_b32(), v890, v750); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v893, v871); svst1w_u64(pred_full, (unsigned *)(v1332), svreinterpret_u64_s16(v908)); svst1w_u64(pred_full, (unsigned *)(v1341), svreinterpret_u64_s16(v916)); - svfloat32_t v668; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v667), "w"(v657)); - svfloat32_t v670; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v670) : "w"(v669), "w"(v658)); - svfloat32_t v672; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v672) : "w"(v671), "w"(v658)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v673), "w"(v654)); - svfloat32_t v686; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v686) : "w"(v685), "w"(v541)); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v666), "w"(v676)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v666), "w"(v676)); - svfloat32_t v877; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v877) : "w"(v876), "w"(v866)); - svfloat32_t v879; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v879) : "w"(v878), "w"(v867)); - svfloat32_t v881; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v881) : "w"(v880), "w"(v867)); - svfloat32_t v883; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v883) : "w"(v882), "w"(v863)); - svfloat32_t v895; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v894), "w"(v750)); - svfloat32_t v897; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v897) : "w"(v875), "w"(v885)); - svfloat32_t v904; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v904) : "w"(v875), "w"(v885)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v674), "w"(v686)); - svfloat32_t v689; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v668), "w"(v678)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v670), "w"(v680)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v672), "w"(v682)); - svfloat32_t v692; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v672), "w"(v682)); - svfloat32_t v693; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v670), "w"(v680)); - svfloat32_t v694; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v668), "w"(v678)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v674), "w"(v686)); - svfloat32_t v896; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v896) : "w"(v883), "w"(v895)); - svfloat32_t v898; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v898) : "w"(v877), "w"(v887)); - svfloat32_t v899; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v899) : "w"(v879), "w"(v889)); - svfloat32_t v900; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v900) : "w"(v881), "w"(v891)); - svfloat32_t v901; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v901) : "w"(v881), "w"(v891)); - svfloat32_t v902; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v902) : "w"(v879), "w"(v889)); - svfloat32_t v903; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v903) : "w"(v877), "w"(v887)); - svfloat32_t v905; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v905) : "w"(v883), 
"w"(v895)); + svfloat32_t v668 = svsub_f32_x(svptrue_b32(), v667, v657); + svfloat32_t v670 = svadd_f32_x(svptrue_b32(), v669, v658); + svfloat32_t v672 = svsub_f32_x(svptrue_b32(), v671, v658); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v673, v654); + svfloat32_t v686 = svsub_f32_x(svptrue_b32(), v685, v541); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v666, v676); + svfloat32_t v877 = svsub_f32_x(svptrue_b32(), v876, v866); + svfloat32_t v879 = svadd_f32_x(svptrue_b32(), v878, v867); + svfloat32_t v881 = svsub_f32_x(svptrue_b32(), v880, v867); + svfloat32_t v883 = svsub_f32_x(svptrue_b32(), v882, v863); + svfloat32_t v895 = svsub_f32_x(svptrue_b32(), v894, v750); + svfloat32_t v897 = svadd_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v904 = svsub_f32_x(svptrue_b32(), v875, v885); + svfloat32_t v687 = svadd_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v689 = svadd_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v672, v682); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v672, v682); + svfloat32_t v693 = svadd_f32_x(svptrue_b32(), v670, v680); + svfloat32_t v694 = svsub_f32_x(svptrue_b32(), v668, v678); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v674, v686); + svfloat32_t v896 = svadd_f32_x(svptrue_b32(), v883, v895); + svfloat32_t v898 = svadd_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v899 = svsub_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v900 = svadd_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v901 = svsub_f32_x(svptrue_b32(), v881, v891); + svfloat32_t v902 = svadd_f32_x(svptrue_b32(), v879, v889); + svfloat32_t v903 = svsub_f32_x(svptrue_b32(), v877, v887); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v883, v895); svint16_t v940 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v695, (float)(1ULL << 31ULL)))), @@ -16257,8 +14535,7 
@@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v169])); svfloat32_t v205 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v204])); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero213, v1008, v212, 0), v1008, v212, 90); @@ -16332,115 +14609,83 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1116), v1136)); svfloat32_t v1127 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1125), v1136)); - svfloat32_t zero52; - asm volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v927, v51, 0), v927, v51, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v936, v58, 0), v936, v58, 90); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); + svfloat32_t zero94 = svdup_n_f32(0); svfloat32_t v94 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero94, v945, v93, 0), v945, v93, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero101, v954, v100, 0), v954, v100, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero150, v972, v149, 0), v972, v149, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, 
zero157, v981, v156, 0), v981, v156, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero206, v999, v205, 0), v999, v205, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, v1028, v261, 0), v1028, v261, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1037, v268, 0), v1037, v268, 90); - svfloat32_t zero318; - asm volatile("mov %0.s, #0" : "=w"(zero318)); + svfloat32_t zero318 = svdup_n_f32(0); svfloat32_t v318 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero318, v1055, v317, 0), v1055, v317, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1064, v324, 0), v1064, v324, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero374, v1082, v373, 0), v1082, v373, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero381, v1091, v380, 0), v1091, v380, 90); - svfloat32_t zero430; - asm volatile("mov %0.s, #0" : "=w"(zero430)); + svfloat32_t zero430 = svdup_n_f32(0); svfloat32_t v430 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero430, v1109, v429, 0), v1109, v429, 90); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x( pred_full, 
svcmla_f32_x(pred_full, zero437, v1118, v436, 0), v1118, v436, 90); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v52), "w"(v59)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v52), "w"(v59)); - svfloat32_t v462; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v94), "w"(v101)); - svfloat32_t v463; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v94), "w"(v101)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v150), "w"(v157)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v150), "w"(v157)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v206), "w"(v213)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v206), "w"(v213)); - svfloat32_t v471; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v262), "w"(v269)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v262), "w"(v269)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v318), "w"(v325)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v318), "w"(v325)); - svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v374), "w"(v381)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v374), "w"(v381)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v430), "w"(v437)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v430), "w"(v437)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v452), "w"(v1137)); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v52, v59); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v463 = svsub_f32_x(svptrue_b32(), v94, v101); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v150, v157); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v469 
= svsub_f32_x(svptrue_b32(), v206, v213); + svfloat32_t v471 = svadd_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v262, v269); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v318, v325); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v374, v381); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v430, v437); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v452, v1137); svfloat32_t v464 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v462, v963, v114, 0), v963, v114, 90); @@ -16462,181 +14707,104 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svfloat32_t v482 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, v480, v1127, v450, 0), v1127, v450, 90); - svfloat32_t v555; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v452), "w"(v471)); - svfloat32_t v556; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v452), "w"(v471)); - svfloat32_t v557; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v465), "w"(v477)); - svfloat32_t v558; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v465), "w"(v477)); - svfloat32_t v559; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v462), "w"(v474)); - svfloat32_t v560; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v462), "w"(v474)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v468), "w"(v480)); - svfloat32_t v562; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v468), "w"(v480)); - svfloat32_t v627; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v453), "w"(v472)); - svfloat32_t v628; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v628) : "w"(v453), "w"(v472)); - svfloat32_t v629; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v629) : "w"(v466), "w"(v478)); - svfloat32_t v630; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v630) : "w"(v466), "w"(v478)); - svfloat32_t v631; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v463), "w"(v475)); - svfloat32_t v632; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v632) : "w"(v463), "w"(v475)); - svfloat32_t v633; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v633) : "w"(v469), "w"(v481)); - svfloat32_t v634; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v469), "w"(v481)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v461), "w"(v473)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v461), "w"(v473)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v467), "w"(v479)); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v467), "w"(v479)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v464), "w"(v476)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v464), "w"(v476)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v470), "w"(v482)); - svfloat32_t v490; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v470), "w"(v482)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v555), "w"(v557)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v555), "w"(v557)); - svfloat32_t v565; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v559), "w"(v561)); - svfloat32_t v566; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v559), "w"(v561)); - svfloat32_t v569; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v560), "w"(v562)); - svfloat32_t v570; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v560), "w"(v562)); - svfloat32_t zero604; - asm volatile("mov %0.s, #0" : "=w"(zero604)); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v452, v471); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v558 = svsub_f32_x(svptrue_b32(), v465, v477); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v462, v474); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v462, v474); 
+ svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v468, v480); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v453, v472); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v630 = svsub_f32_x(svptrue_b32(), v466, v478); + svfloat32_t v631 = svadd_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v463, v475); + svfloat32_t v633 = svadd_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v634 = svsub_f32_x(svptrue_b32(), v469, v481); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v461, v473); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v467, v479); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v464, v476); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v490 = svsub_f32_x(svptrue_b32(), v470, v482); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v555, v557); + svfloat32_t v565 = svadd_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v560, v562); + svfloat32_t v570 = svsub_f32_x(svptrue_b32(), v560, v562); + svfloat32_t zero604 = svdup_n_f32(0); svfloat32_t v604 = svcmla_f32_x(pred_full, zero604, v1151, v558, 90); - svfloat32_t v635; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v627), "w"(v629)); - svfloat32_t v636; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v627), "w"(v629)); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v631), "w"(v633)); - svfloat32_t v638; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v638) : "w"(v631), "w"(v633)); - svfloat32_t v641; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v641) : 
"w"(v632), "w"(v634)); - svfloat32_t v642; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v632), "w"(v634)); - svfloat32_t zero675; - asm volatile("mov %0.s, #0" : "=w"(zero675)); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v627, v629); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v638 = svsub_f32_x(svptrue_b32(), v631, v633); + svfloat32_t v641 = svadd_f32_x(svptrue_b32(), v632, v634); + svfloat32_t v642 = svsub_f32_x(svptrue_b32(), v632, v634); + svfloat32_t zero675 = svdup_n_f32(0); svfloat32_t v675 = svcmla_f32_x(pred_full, zero675, v1158, v628, 90); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v483), "w"(v485)); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v483), "w"(v485)); - svfloat32_t v493; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v487), "w"(v489)); - svfloat32_t v494; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v487), "w"(v489)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v488), "w"(v490)); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v488), "w"(v490)); - svfloat32_t zero532; - asm volatile("mov %0.s, #0" : "=w"(zero532)); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v488, v490); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v488, v490); + svfloat32_t zero532 = svdup_n_f32(0); svfloat32_t v532 = svcmla_f32_x(pred_full, zero532, v1143, v486, 90); - svfloat32_t v567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v563), "w"(v565)); - svfloat32_t v568; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v563), "w"(v565)); - svfloat32_t zero592; - asm volatile("mov %0.s, #0" : "=w"(zero592)); + svfloat32_t v567 
= svadd_f32_x(svptrue_b32(), v563, v565); + svfloat32_t v568 = svsub_f32_x(svptrue_b32(), v563, v565); + svfloat32_t zero592 = svdup_n_f32(0); svfloat32_t v592 = svcmla_f32_x(pred_full, zero592, v1151, v566, 90); - svfloat32_t zero611; - asm volatile("mov %0.s, #0" : "=w"(zero611)); + svfloat32_t zero611 = svdup_n_f32(0); svfloat32_t v611 = svcmla_f32_x(pred_full, zero611, v1152, v569, 90); - svfloat32_t v616; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v570), "w"(v1153)); - svfloat32_t v639; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v635), "w"(v637)); - svfloat32_t v640; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v640) : "w"(v635), "w"(v637)); - svfloat32_t zero663; - asm volatile("mov %0.s, #0" : "=w"(zero663)); + svfloat32_t v616 = svmul_f32_x(svptrue_b32(), v570, v1153); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v635, v637); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v635, v637); + svfloat32_t zero663 = svdup_n_f32(0); svfloat32_t v663 = svcmla_f32_x(pred_full, zero663, v1158, v636, 90); - svfloat32_t v685; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v685) : "w"(v641), "w"(v1160)); - svfloat32_t zero692; - asm volatile("mov %0.s, #0" : "=w"(zero692)); + svfloat32_t v685 = svmul_f32_x(svptrue_b32(), v641, v1160); + svfloat32_t zero692 = svdup_n_f32(0); svfloat32_t v692 = svcmla_f32_x(pred_full, zero692, v1161, v642, 90); - svfloat32_t v495; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v491), "w"(v493)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v491), "w"(v493)); - svfloat32_t zero520; - asm volatile("mov %0.s, #0" : "=w"(zero520)); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v491, v493); + svfloat32_t zero520 = svdup_n_f32(0); svfloat32_t v520 = svcmla_f32_x(pred_full, zero520, v1143, v494, 90); - svfloat32_t zero539; - asm volatile("mov %0.s, #0" : "=w"(zero539)); + svfloat32_t zero539 = svdup_n_f32(0); svfloat32_t v539 = svcmla_f32_x(pred_full, zero539, 
v1144, v497, 90); svfloat32_t v617 = svmla_f32_x(pred_full, v592, v564, v1150); svfloat32_t v618 = svnmls_f32_x(pred_full, v592, v564, v1150); svfloat32_t v619 = svmla_f32_x(pred_full, v616, v556, v1150); svfloat32_t v620 = svnmls_f32_x(pred_full, v616, v556, v1150); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v604), "w"(v611)); - svfloat32_t v622; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v604), "w"(v611)); - svfloat32_t zero649; - asm volatile("mov %0.s, #0" : "=w"(zero649)); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v604, v611); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v604, v611); + svfloat32_t zero649 = svdup_n_f32(0); svfloat32_t v649 = svcmla_f32_x(pred_full, zero649, v1158, v639, 90); - svfloat32_t zero656; - asm volatile("mov %0.s, #0" : "=w"(zero656)); + svfloat32_t zero656 = svdup_n_f32(0); svfloat32_t v656 = svcmla_f32_x(pred_full, zero656, v1158, v640, 90); svfloat32_t v693 = svmla_f32_x(pred_full, v663, v638, v1159); svfloat32_t v694 = svmls_f32_x(pred_full, v663, v638, v1159); - svfloat32_t v695; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v675), "w"(v692)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v675), "w"(v692)); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v675, v692); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v675, v692); svfloat32_t v697 = svmla_f32_x(pred_full, v685, v630, v1159); svfloat32_t v698 = svnmls_f32_x(pred_full, v685, v630, v1159); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v492), "w"(v520)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v492), "w"(v520)); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v492, v520); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v492, v520); svfloat32_t v547 = svmla_f32_x(pred_full, v484, v498, v1145); svfloat32_t v548 = svmls_f32_x(pred_full, v484, v498, v1145); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v532), "w"(v539)); - svfloat32_t 
v550; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v532), "w"(v539)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v619), "w"(v621)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v619), "w"(v621)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v620), "w"(v622)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v620), "w"(v622)); - svfloat32_t v699; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v699) : "w"(v695), "w"(v697)); - svfloat32_t v700; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v700) : "w"(v695), "w"(v697)); - svfloat32_t v701; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v701) : "w"(v696), "w"(v698)); - svfloat32_t v702; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v702) : "w"(v696), "w"(v698)); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v532, v539); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v619, v621); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v620, v622); + svfloat32_t v699 = svadd_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v700 = svsub_f32_x(svptrue_b32(), v695, v697); + svfloat32_t v701 = svadd_f32_x(svptrue_b32(), v696, v698); + svfloat32_t v702 = svsub_f32_x(svptrue_b32(), v696, v698); svfloat32_t v703 = svmla_f32_x(pred_full, v495, v567, v1150); svint16_t v708 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -16649,31 +14817,21 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v496, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v547), "w"(v549)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v547), "w"(v549)); - svfloat32_t v553; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v553) : "w"(v548), "w"(v550)); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v548), "w"(v550)); - svfloat32_t v704; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v703), "w"(v649)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v703), "w"(v649)); - svfloat32_t v757; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v546), "w"(v618)); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v548, v550); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v703, v649); + svfloat32_t v757 = svadd_f32_x(svptrue_b32(), v546, v618); svint16_t v762 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v546, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v812; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v812) : "w"(v811), "w"(v656)); - svfloat32_t v813; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v813) : "w"(v811), "w"(v656)); - svfloat32_t v865; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v865) : "w"(v545), "w"(v617)); + svfloat32_t v812 = svadd_f32_x(svptrue_b32(), v811, v656); + svfloat32_t v813 = svsub_f32_x(svptrue_b32(), v811, v656); + svfloat32_t v865 = svadd_f32_x(svptrue_b32(), v545, v617); svint16_t v870 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v545, (float)(1ULL << 31ULL)))), @@ -16691,19 +14849,15 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v704, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v730; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v552), "w"(v624)); + svfloat32_t v730 = 
svadd_f32_x(svptrue_b32(), v552, v624); svint16_t v735 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v552, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v758; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v757), "w"(v694)); - svfloat32_t v759; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v757), "w"(v694)); - svfloat32_t v784; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v784) : "w"(v553), "w"(v625)); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v757, v694); + svfloat32_t v759 = svsub_f32_x(svptrue_b32(), v757, v694); + svfloat32_t v784 = svadd_f32_x(svptrue_b32(), v553, v625); svint16_t v789 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v553, (float)(1ULL << 31ULL)))), @@ -16719,19 +14873,15 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v812, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v838; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v554), "w"(v626)); + svfloat32_t v838 = svadd_f32_x(svptrue_b32(), v554, v626); svint16_t v843 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v554, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v866; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v866) : "w"(v865), "w"(v693)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v865), "w"(v693)); - svfloat32_t v892; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v892) : "w"(v551), "w"(v623)); + svfloat32_t v866 = svadd_f32_x(svptrue_b32(), v865, v693); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v865, v693); + svfloat32_t v892 = svadd_f32_x(svptrue_b32(), v551, v623); svint16_t v897 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, 
svmul_n_f32_x(pred_full, v551, (float)(1ULL << 31ULL)))), @@ -16739,10 +14889,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v1223), svreinterpret_u64_s16(v762)); svst1w_u64(pred_full, (unsigned *)(v1331), svreinterpret_u64_s16(v870)); - svfloat32_t v731; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v731) : "w"(v730), "w"(v700)); - svfloat32_t v732; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v730), "w"(v700)); + svfloat32_t v731 = svadd_f32_x(svptrue_b32(), v730, v700); + svfloat32_t v732 = svsub_f32_x(svptrue_b32(), v730, v700); svint16_t v770 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v759, (float)(1ULL << 31ULL)))), @@ -16753,14 +14901,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v758, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v785; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v784), "w"(v701)); - svfloat32_t v786; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v784), "w"(v701)); - svfloat32_t v839; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v838), "w"(v702)); - svfloat32_t v840; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v838), "w"(v702)); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v784, v701); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v784, v701); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v838, v702); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v838, v702); svint16_t v878 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v867, (float)(1ULL << 31ULL)))), @@ -16771,10 +14915,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v866, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v893; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v892), "w"(v699)); - svfloat32_t v894; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v892), "w"(v699)); + svfloat32_t v893 = svadd_f32_x(svptrue_b32(), v892, v699); + svfloat32_t v894 = svsub_f32_x(svptrue_b32(), v892, v699); svst1w_u64(pred_full, (unsigned *)(v1178), svreinterpret_u64_s16(v716)); svst1w_u64(pred_full, (unsigned *)(v1187), svreinterpret_u64_s16(v724)); svst1w_u64(pred_full, (unsigned *)(v1196), svreinterpret_u64_s16(v735)); @@ -16853,7 +14995,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v7 = (const float32x2_t *)w; for (int j = 0; j < howmany; j += 1) { float32x2_t v92 = v5[istride]; - float v1070 = 0.0000000000000000e+00F; float v1168 = 9.6858316112863108e-01F; float v1171 = -2.4868988716485479e-01F; float v1172 = 2.4868988716485479e-01F; @@ -16888,7 +15029,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v98 = vtrn1_f32(v92, v92); float32x2_t v99 = vtrn2_f32(v92, v92); float32x2_t v452 = v5[0]; - float v1073 = dir * v1070; float32x2_t v1169 = (float32x2_t){v1168, v1168}; float32x2_t v1173 = (float32x2_t){v1171, v1172}; float32x2_t v1313 = (float32x2_t){v1312, v1312}; @@ -16959,7 +15099,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, int64_t v420 = 36 + j * 48; float32x2_t v434 = v5[istride * 24]; int64_t v438 = 46 + j * 48; - float32x2_t v1071 = (float32x2_t){v1070, v1073}; float32x2_t v1175 = vmul_f32(v1710, v1173); float32x2_t v1319 = vmul_f32(v1710, v1317); float32x2_t v1463 = vmul_f32(v1710, v1461); @@ -17134,86 +15273,26 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v411 = vfma_f32(v409, v405, v408); float32x2_t v429 = vfma_f32(v427, v423, v426); float32x2_t v447 = vfma_f32(v445, v441, v444); - float32x2_t v462 = 
vrev64_f32(v33); - float32x2_t v474 = vrev64_f32(v51); - float32x2_t v486 = vrev64_f32(v87); - float32x2_t v504 = vrev64_f32(v69); - float32x2_t v576 = vrev64_f32(v123); - float32x2_t v588 = vrev64_f32(v141); - float32x2_t v600 = vrev64_f32(v177); - float32x2_t v618 = vrev64_f32(v159); - float32x2_t v690 = vrev64_f32(v213); - float32x2_t v702 = vrev64_f32(v231); - float32x2_t v714 = vrev64_f32(v267); - float32x2_t v732 = vrev64_f32(v249); - float32x2_t v804 = vrev64_f32(v303); - float32x2_t v816 = vrev64_f32(v321); - float32x2_t v828 = vrev64_f32(v357); - float32x2_t v846 = vrev64_f32(v339); - float32x2_t v918 = vrev64_f32(v393); - float32x2_t v930 = vrev64_f32(v411); - float32x2_t v942 = vrev64_f32(v447); - float32x2_t v960 = vrev64_f32(v429); - float32x2_t v463 = vmul_f32(v462, v1071); - float32x2_t v475 = vmul_f32(v474, v1071); - float32x2_t v487 = vmul_f32(v486, v1071); - float32x2_t v505 = vmul_f32(v504, v1071); - float32x2_t v577 = vmul_f32(v576, v1071); - float32x2_t v589 = vmul_f32(v588, v1071); - float32x2_t v601 = vmul_f32(v600, v1071); - float32x2_t v619 = vmul_f32(v618, v1071); - float32x2_t v691 = vmul_f32(v690, v1071); - float32x2_t v703 = vmul_f32(v702, v1071); - float32x2_t v715 = vmul_f32(v714, v1071); - float32x2_t v733 = vmul_f32(v732, v1071); - float32x2_t v805 = vmul_f32(v804, v1071); - float32x2_t v817 = vmul_f32(v816, v1071); - float32x2_t v829 = vmul_f32(v828, v1071); - float32x2_t v847 = vmul_f32(v846, v1071); - float32x2_t v919 = vmul_f32(v918, v1071); - float32x2_t v931 = vmul_f32(v930, v1071); - float32x2_t v943 = vmul_f32(v942, v1071); - float32x2_t v961 = vmul_f32(v960, v1071); - float32x2_t v464 = vadd_f32(v463, v33); - float32x2_t v476 = vadd_f32(v475, v51); - float32x2_t v488 = vadd_f32(v487, v87); - float32x2_t v506 = vadd_f32(v505, v69); - float32x2_t v578 = vadd_f32(v577, v123); - float32x2_t v590 = vadd_f32(v589, v141); - float32x2_t v602 = vadd_f32(v601, v177); - float32x2_t v620 = vadd_f32(v619, v159); - float32x2_t v692 = 
vadd_f32(v691, v213); - float32x2_t v704 = vadd_f32(v703, v231); - float32x2_t v716 = vadd_f32(v715, v267); - float32x2_t v734 = vadd_f32(v733, v249); - float32x2_t v806 = vadd_f32(v805, v303); - float32x2_t v818 = vadd_f32(v817, v321); - float32x2_t v830 = vadd_f32(v829, v357); - float32x2_t v848 = vadd_f32(v847, v339); - float32x2_t v920 = vadd_f32(v919, v393); - float32x2_t v932 = vadd_f32(v931, v411); - float32x2_t v944 = vadd_f32(v943, v447); - float32x2_t v962 = vadd_f32(v961, v429); - float32x2_t v489 = vsub_f32(v464, v488); - float32x2_t v493 = vmul_f32(v464, v1734); - float32x2_t v507 = vsub_f32(v476, v506); - float32x2_t v511 = vmul_f32(v476, v1734); - float32x2_t v603 = vsub_f32(v578, v602); - float32x2_t v607 = vmul_f32(v578, v1734); - float32x2_t v621 = vsub_f32(v590, v620); - float32x2_t v625 = vmul_f32(v590, v1734); - float32x2_t v717 = vsub_f32(v692, v716); - float32x2_t v721 = vmul_f32(v692, v1734); - float32x2_t v735 = vsub_f32(v704, v734); - float32x2_t v739 = vmul_f32(v704, v1734); - float32x2_t v831 = vsub_f32(v806, v830); - float32x2_t v835 = vmul_f32(v806, v1734); - float32x2_t v849 = vsub_f32(v818, v848); - float32x2_t v853 = vmul_f32(v818, v1734); - float32x2_t v945 = vsub_f32(v920, v944); - float32x2_t v949 = vmul_f32(v920, v1734); - float32x2_t v963 = vsub_f32(v932, v962); - float32x2_t v967 = vmul_f32(v932, v1734); + float32x2_t v489 = vsub_f32(v33, v87); + float32x2_t v493 = vmul_f32(v33, v1734); + float32x2_t v507 = vsub_f32(v51, v69); + float32x2_t v511 = vmul_f32(v51, v1734); + float32x2_t v603 = vsub_f32(v123, v177); + float32x2_t v607 = vmul_f32(v123, v1734); + float32x2_t v621 = vsub_f32(v141, v159); + float32x2_t v625 = vmul_f32(v141, v1734); + float32x2_t v717 = vsub_f32(v213, v267); + float32x2_t v721 = vmul_f32(v213, v1734); + float32x2_t v735 = vsub_f32(v231, v249); + float32x2_t v739 = vmul_f32(v231, v1734); + float32x2_t v831 = vsub_f32(v303, v357); + float32x2_t v835 = vmul_f32(v303, v1734); + float32x2_t v849 = 
vsub_f32(v321, v339); + float32x2_t v853 = vmul_f32(v321, v1734); + float32x2_t v945 = vsub_f32(v393, v447); + float32x2_t v949 = vmul_f32(v393, v1734); + float32x2_t v963 = vsub_f32(v411, v429); + float32x2_t v967 = vmul_f32(v411, v1734); float32x2_t v494 = vsub_f32(v493, v489); float32x2_t v512 = vsub_f32(v511, v507); float32x2_t v523 = vmul_f32(v507, v1683); @@ -17294,10 +15373,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v975 = vsub_f32(v375, v974); float32x2_t v1003 = vmul_f32(v1002, v1711); float32x2_t v1011 = vmul_f32(v1010, v1711); - float32x2_t v1032 = vrev64_f32(v654); - float32x2_t v1044 = vrev64_f32(v768); - float32x2_t v1056 = vrev64_f32(v996); - float32x2_t v1074 = vrev64_f32(v882); + float32x2_t v1059 = vsub_f32(v654, v996); + float32x2_t v1063 = vmul_f32(v654, v1734); + float32x2_t v1077 = vsub_f32(v768, v882); + float32x2_t v1081 = vmul_f32(v768, v1734); float32x2_t v529 = vsub_f32(v519, v528); float32x2_t v533 = vmul_f32(v519, v1734); float32x2_t v643 = vsub_f32(v633, v642); @@ -17308,10 +15387,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v875 = vmul_f32(v861, v1734); float32x2_t v985 = vsub_f32(v975, v984); float32x2_t v989 = vmul_f32(v975, v1734); - float32x2_t v1033 = vmul_f32(v1032, v1071); - float32x2_t v1045 = vmul_f32(v1044, v1071); - float32x2_t v1057 = vmul_f32(v1056, v1071); - float32x2_t v1075 = vmul_f32(v1074, v1071); + float32x2_t v1064 = vsub_f32(v1063, v1059); + float32x2_t v1082 = vsub_f32(v1081, v1077); + float32x2_t v1093 = vmul_f32(v1077, v1683); + float32x2_t v1108 = vmul_f32(v1059, v1683); float32x2_t v534 = vsub_f32(v533, v529); float32x2_t v556 = vsub_f32(v529, v555); float32x2_t v560 = vmul_f32(v529, v1734); @@ -17327,10 +15406,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v990 = vsub_f32(v989, v985); float32x2_t v1012 = vsub_f32(v985, v1011); float32x2_t v1016 = 
vmul_f32(v985, v1734); - float32x2_t v1034 = vadd_f32(v1033, v654); - float32x2_t v1046 = vadd_f32(v1045, v768); - float32x2_t v1058 = vadd_f32(v1057, v996); - float32x2_t v1076 = vadd_f32(v1075, v882); + float32x2_t v1083 = vadd_f32(v1064, v1082); + float32x2_t v1084 = vsub_f32(v1064, v1082); + float32x2_t v1094 = vadd_f32(v1059, v1093); + float32x2_t v1109 = vsub_f32(v1108, v1077); float32x2_t v548 = vsub_f32(v534, v547); float32x2_t v561 = vsub_f32(v560, v556); float32x2_t v565 = vmul_f32(v534, v1734); @@ -17346,10 +15425,11 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1004 = vsub_f32(v990, v1003); float32x2_t v1017 = vsub_f32(v1016, v1012); float32x2_t v1021 = vmul_f32(v990, v1734); - float32x2_t v1059 = vsub_f32(v1034, v1058); - float32x2_t v1063 = vmul_f32(v1034, v1734); - float32x2_t v1077 = vsub_f32(v1046, v1076); - float32x2_t v1081 = vmul_f32(v1046, v1734); + float32x2_t v1088 = vmul_f32(v1083, v1663); + float32x2_t v1098 = vmul_f32(v1084, v1673); + float32x2_t v1110 = vadd_f32(v540, v1083); + float32x2_t v1122 = vrev64_f32(v1094); + float32x2_t v1136 = vrev64_f32(v1109); float32x2_t v1320 = vrev64_f32(v670); float32x2_t v1332 = vrev64_f32(v784); float32x2_t v1344 = vrev64_f32(v1012); @@ -17359,10 +15439,11 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v794 = vsub_f32(v793, v776); float32x2_t v908 = vsub_f32(v907, v890); float32x2_t v1022 = vsub_f32(v1021, v1004); - float32x2_t v1064 = vsub_f32(v1063, v1059); - float32x2_t v1082 = vsub_f32(v1081, v1077); - float32x2_t v1093 = vmul_f32(v1077, v1683); - float32x2_t v1108 = vmul_f32(v1059, v1683); + float32x2_t v1089 = vsub_f32(v540, v1088); + int16x4_t v1113 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1110, 15), (int32x2_t){0, 0})); + float32x2_t v1123 = vmul_f32(v1122, v1711); + float32x2_t v1137 = vmul_f32(v1136, v1711); float32x2_t v1176 = vrev64_f32(v662); float32x2_t v1188 = vrev64_f32(v776); float32x2_t 
v1200 = vrev64_f32(v1004); @@ -17375,10 +15456,9 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1476 = vrev64_f32(v789); float32x2_t v1488 = vrev64_f32(v1017); float32x2_t v1506 = vrev64_f32(v903); - float32x2_t v1083 = vadd_f32(v1064, v1082); - float32x2_t v1084 = vsub_f32(v1064, v1082); - float32x2_t v1094 = vadd_f32(v1059, v1093); - float32x2_t v1109 = vsub_f32(v1108, v1077); + float32x2_t v1099 = vsub_f32(v1089, v1098); + float32x2_t v1103 = vmul_f32(v1089, v1734); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1113), 0); float32x2_t v1177 = vmul_f32(v1176, v1175); float32x2_t v1189 = vmul_f32(v1188, v1319); float32x2_t v1201 = vmul_f32(v1200, v1607); @@ -17395,11 +15475,9 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1620 = vrev64_f32(v794); float32x2_t v1632 = vrev64_f32(v1022); float32x2_t v1650 = vrev64_f32(v908); - float32x2_t v1088 = vmul_f32(v1083, v1663); - float32x2_t v1098 = vmul_f32(v1084, v1673); - float32x2_t v1110 = vadd_f32(v540, v1083); - float32x2_t v1122 = vrev64_f32(v1094); - float32x2_t v1136 = vrev64_f32(v1109); + float32x2_t v1104 = vsub_f32(v1103, v1099); + float32x2_t v1138 = vsub_f32(v1099, v1137); + float32x2_t v1148 = vmul_f32(v1099, v1734); float32x2_t v1178 = vfma_f32(v1177, v662, v1169); float32x2_t v1190 = vfma_f32(v1189, v776, v1313); float32x2_t v1202 = vfma_f32(v1201, v1004, v1601); @@ -17416,11 +15494,11 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1621 = vmul_f32(v1620, v1619); float32x2_t v1633 = vmul_f32(v1632, v1631); float32x2_t v1651 = vmul_f32(v1650, v1649); - float32x2_t v1089 = vsub_f32(v540, v1088); - int16x4_t v1113 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1110, 15), (int32x2_t){0, 0})); - float32x2_t v1123 = vmul_f32(v1122, v1711); - float32x2_t v1137 = vmul_f32(v1136, v1711); + float32x2_t v1124 = vsub_f32(v1104, v1123); + int16x4_t v1141 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1138, 15), (int32x2_t){0, 0})); + float32x2_t v1149 = vsub_f32(v1148, v1138); + float32x2_t v1159 = vmul_f32(v1104, v1734); float32x2_t v1203 = vsub_f32(v1178, v1202); float32x2_t v1207 = vmul_f32(v1178, v1734); float32x2_t v1221 = vsub_f32(v1190, v1220); @@ -17437,9 +15515,12 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1622 = vfma_f32(v1621, v794, v1613); float32x2_t v1634 = vfma_f32(v1633, v1022, v1625); float32x2_t v1652 = vfma_f32(v1651, v908, v1643); - float32x2_t v1099 = vsub_f32(v1089, v1098); - float32x2_t v1103 = vmul_f32(v1089, v1734); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1113), 0); + int16x4_t v1127 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1124, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1141), 0); + int16x4_t v1152 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1149, 15), (int32x2_t){0, 0})); + float32x2_t v1160 = vsub_f32(v1159, v1124); float32x2_t v1208 = vsub_f32(v1207, v1203); float32x2_t v1226 = vsub_f32(v1225, v1221); float32x2_t v1237 = vmul_f32(v1221, v1683); @@ -17456,9 +15537,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1639 = vmul_f32(v1610, v1734); float32x2_t v1653 = vsub_f32(v1622, v1652); float32x2_t v1657 = vmul_f32(v1622, v1734); - float32x2_t v1104 = vsub_f32(v1103, v1099); - float32x2_t v1138 = vsub_f32(v1099, v1137); - float32x2_t v1148 = vmul_f32(v1099, v1734); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1127), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1152), 0); + int16x4_t v1163 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1160, 15), (int32x2_t){0, 0})); float32x2_t v1227 = vadd_f32(v1208, v1226); float32x2_t v1228 = vsub_f32(v1208, v1226); float32x2_t v1238 = vadd_f32(v1203, v1237); @@ -17476,11 +15558,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1658 = 
vsub_f32(v1657, v1653); float32x2_t v1669 = vmul_f32(v1653, v1683); float32x2_t v1684 = vmul_f32(v1635, v1683); - float32x2_t v1124 = vsub_f32(v1104, v1123); - int16x4_t v1141 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1138, 15), (int32x2_t){0, 0})); - float32x2_t v1149 = vsub_f32(v1148, v1138); - float32x2_t v1159 = vmul_f32(v1104, v1734); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1163), 0); float32x2_t v1232 = vmul_f32(v1227, v1663); float32x2_t v1242 = vmul_f32(v1228, v1673); float32x2_t v1254 = vadd_f32(v548, v1227); @@ -17500,12 +15578,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1660 = vsub_f32(v1640, v1658); float32x2_t v1670 = vadd_f32(v1635, v1669); float32x2_t v1685 = vsub_f32(v1684, v1653); - int16x4_t v1127 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1124, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1141), 0); - int16x4_t v1152 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1149, 15), (int32x2_t){0, 0})); - float32x2_t v1160 = vsub_f32(v1159, v1124); float32x2_t v1233 = vsub_f32(v548, v1232); int16x4_t v1257 = vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1254, 15), (int32x2_t){0, 0})); @@ -17524,10 +15596,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float32x2_t v1686 = vadd_f32(v566, v1659); float32x2_t v1698 = vrev64_f32(v1670); float32x2_t v1712 = vrev64_f32(v1685); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1127), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1152), 0); - int16x4_t v1163 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1160, 15), (int32x2_t){0, 0})); float32x2_t v1243 = vsub_f32(v1233, v1242); float32x2_t v1247 = vmul_f32(v1233, v1734); v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1257), 0); @@ -17542,7 +15610,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1686, 15), (int32x2_t){0, 0})); 
float32x2_t v1699 = vmul_f32(v1698, v1711); float32x2_t v1713 = vmul_f32(v1712, v1711); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1163), 0); float32x2_t v1248 = vsub_f32(v1247, v1243); float32x2_t v1282 = vsub_f32(v1243, v1281); float32x2_t v1292 = vmul_f32(v1243, v1734); @@ -17665,7 +15732,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, float v1733 = 2.5000000000000000e-01F; float v1745 = 5.5901699437494745e-01F; float v1757 = 6.1803398874989490e-01F; - float v1787 = 0.0000000000000000e+00F; float v1788 = -9.5105651629515353e-01F; float v1818 = 2.0000000000000000e+00F; const float32x2_t *v1873 = &v5[v0]; @@ -17716,7 +15782,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, int64_t v341 = v0 * 24; int64_t v349 = v10 * 23; int64_t v350 = v13 * 24; - float v1051 = v4 * v1787; int64_t v1112 = v2 * 5; int64_t v1128 = v2 * 10; int64_t v1142 = v2 * 15; @@ -17752,6 +15817,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, int64_t v1824 = v2 * 24; const float32x2_t *v2055 = &v5[0]; svint64_t v2056 = svindex_s64(0, v1); + svfloat32_t v2161 = svdup_n_f32(0); int32_t *v2175 = &v6[0]; svfloat32_t v2218 = svdup_n_f32(v1164); svfloat32_t v2282 = svdup_n_f32(v1331); @@ -17817,7 +15883,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v2045 = &v5[v341]; svfloat32_t v2057 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v2055), v2056)); - svfloat32_t v2161 = svdup_n_f32(v1051); int32_t *v2185 = &v6[v1112]; int32_t *v2195 = &v6[v1128]; int32_t *v2205 = &v6[v1142]; @@ -17859,8 +15924,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v57])); svfloat32_t v72 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v7)[v71])); - svfloat32_t zero87; - asm volatile("mov %0.s, #0" : 
"=w"(zero87)); + svfloat32_t zero87 = svdup_n_f32(0); svfloat32_t v87 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero87, v1875, v86, 0), v1875, v86, 90); @@ -17948,118 +16012,95 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v2036), v2056)); svfloat32_t v2047 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v2045), v2056)); - svfloat32_t zero31; - asm volatile("mov %0.s, #0" : "=w"(zero31)); + svfloat32_t zero31 = svdup_n_f32(0); svfloat32_t v31 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero31, v1839, v30, 0), v1839, v30, 90); - svfloat32_t zero45; - asm volatile("mov %0.s, #0" : "=w"(zero45)); + svfloat32_t zero45 = svdup_n_f32(0); svfloat32_t v45 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero45, v1848, v44, 0), v1848, v44, 90); - svfloat32_t zero59; - asm volatile("mov %0.s, #0" : "=w"(zero59)); + svfloat32_t zero59 = svdup_n_f32(0); svfloat32_t v59 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero59, v1857, v58, 0), v1857, v58, 90); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero73, v1866, v72, 0), v1866, v72, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero101, v1885, v100, 0), v1885, v100, 90); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero115, v1894, v114, 0), v1894, v114, 90); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero129, v1903, v128, 0), v1903, v128, 90); - svfloat32_t 
zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1912, v142, 0), v1912, v142, 90); - svfloat32_t zero157; - asm volatile("mov %0.s, #0" : "=w"(zero157)); + svfloat32_t zero157 = svdup_n_f32(0); svfloat32_t v157 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero157, v1921, v156, 0), v1921, v156, 90); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero171, v1930, v170, 0), v1930, v170, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1939, v184, 0), v1939, v184, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero199, v1948, v198, 0), v1948, v198, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero213, v1957, v212, 0), v1957, v212, 90); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero227, v1966, v226, 0), v1966, v226, 90); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero241, v1975, v240, 0), v1975, v240, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero255, v1984, v254, 0), v1984, v254, 90); - 
svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero269, v1993, v268, 0), v1993, v268, 90); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero283, v2002, v282, 0), v2002, v282, 90); - svfloat32_t zero297; - asm volatile("mov %0.s, #0" : "=w"(zero297)); + svfloat32_t zero297 = svdup_n_f32(0); svfloat32_t v297 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero297, v2011, v296, 0), v2011, v296, 90); - svfloat32_t zero311; - asm volatile("mov %0.s, #0" : "=w"(zero311)); + svfloat32_t zero311 = svdup_n_f32(0); svfloat32_t v311 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero311, v2020, v310, 0), v2020, v310, 90); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v2029, v324, 0), v2029, v324, 90); - svfloat32_t zero339; - asm volatile("mov %0.s, #0" : "=w"(zero339)); + svfloat32_t zero339 = svdup_n_f32(0); svfloat32_t v339 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero339, v2038, v338, 0), v2038, v338, 90); - svfloat32_t zero353; - asm volatile("mov %0.s, #0" : "=w"(zero353)); + svfloat32_t zero353 = svdup_n_f32(0); svfloat32_t v353 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero353, v2047, v352, 0), v2047, v352, 90); @@ -18083,26 +16124,16 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v894 = svcmla_f32_x(pred_full, v325, v2161, v325, 90); svfloat32_t v907 = svcmla_f32_x(pred_full, v353, v2161, v353, 90); svfloat32_t v927 = svcmla_f32_x(pred_full, v339, v2161, v339, 90); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v373), "w"(v399)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v420) : "w"(v386), "w"(v419)); - svfloat32_t v527; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v500), "w"(v526)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v513), "w"(v546)); - svfloat32_t v654; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v654) : "w"(v627), "w"(v653)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v640), "w"(v673)); - svfloat32_t v781; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v754), "w"(v780)); - svfloat32_t v801; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v767), "w"(v800)); - svfloat32_t v908; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v908) : "w"(v881), "w"(v907)); - svfloat32_t v928; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v928) : "w"(v894), "w"(v927)); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v373, v399); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v386, v419); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v500, v526); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v513, v546); + svfloat32_t v654 = svsub_f32_x(svptrue_b32(), v627, v653); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v640, v673); + svfloat32_t v781 = svsub_f32_x(svptrue_b32(), v754, v780); + svfloat32_t v801 = svsub_f32_x(svptrue_b32(), v767, v800); + svfloat32_t v908 = svsub_f32_x(svptrue_b32(), v881, v907); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v894, v927); svfloat32_t v406 = svnmls_f32_x(pred_full, v400, v373, v2464); svfloat32_t v426 = svnmls_f32_x(pred_full, v420, v386, v2464); svfloat32_t v533 = svnmls_f32_x(pred_full, v527, v500, v2464); @@ -18113,75 +16144,50 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v807 = svnmls_f32_x(pred_full, v801, v767, v2464); svfloat32_t v914 = svnmls_f32_x(pred_full, v908, v881, v2464); svfloat32_t v934 = svnmls_f32_x(pred_full, v928, v894, v2464); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v406), "w"(v426)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : 
"w"(v406), "w"(v426)); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v406, v426); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v406, v426); svfloat32_t v440 = svmla_f32_x(pred_full, v400, v420, v2424); svfloat32_t v458 = svnmls_f32_x(pred_full, v420, v400, v2424); - svfloat32_t v554; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v533), "w"(v553)); - svfloat32_t v555; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v533), "w"(v553)); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v533, v553); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v533, v553); svfloat32_t v567 = svmla_f32_x(pred_full, v527, v547, v2424); svfloat32_t v585 = svnmls_f32_x(pred_full, v547, v527, v2424); - svfloat32_t v681; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v681) : "w"(v660), "w"(v680)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v660), "w"(v680)); + svfloat32_t v681 = svadd_f32_x(svptrue_b32(), v660, v680); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v660, v680); svfloat32_t v694 = svmla_f32_x(pred_full, v654, v674, v2424); svfloat32_t v712 = svnmls_f32_x(pred_full, v674, v654, v2424); - svfloat32_t v808; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v808) : "w"(v787), "w"(v807)); - svfloat32_t v809; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v809) : "w"(v787), "w"(v807)); + svfloat32_t v808 = svadd_f32_x(svptrue_b32(), v787, v807); + svfloat32_t v809 = svsub_f32_x(svptrue_b32(), v787, v807); svfloat32_t v821 = svmla_f32_x(pred_full, v781, v801, v2424); svfloat32_t v839 = svnmls_f32_x(pred_full, v801, v781, v2424); - svfloat32_t v935; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v935) : "w"(v914), "w"(v934)); - svfloat32_t v936; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v936) : "w"(v914), "w"(v934)); + svfloat32_t v935 = svadd_f32_x(svptrue_b32(), v914, v934); + svfloat32_t v936 = svsub_f32_x(svptrue_b32(), v914, v934); svfloat32_t v948 = svmla_f32_x(pred_full, v908, v928, v2424); svfloat32_t v966 = svnmls_f32_x(pred_full, v928, v908, v2424); - svfloat32_t v459; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v459) : "w"(v2057), "w"(v427)); - svfloat32_t zero466; - asm volatile("mov %0.s, #0" : "=w"(zero466)); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v2057, v427); + svfloat32_t zero466 = svdup_n_f32(0); svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v2444, v440, 90); - svfloat32_t zero474; - asm volatile("mov %0.s, #0" : "=w"(zero474)); + svfloat32_t zero474 = svdup_n_f32(0); svfloat32_t v474 = svcmla_f32_x(pred_full, zero474, v2444, v458, 90); - svfloat32_t v586; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v87), "w"(v554)); - svfloat32_t zero593; - asm volatile("mov %0.s, #0" : "=w"(zero593)); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v87, v554); + svfloat32_t zero593 = svdup_n_f32(0); svfloat32_t v593 = svcmla_f32_x(pred_full, zero593, v2444, v567, 90); - svfloat32_t zero601; - asm volatile("mov %0.s, #0" : "=w"(zero601)); + svfloat32_t zero601 = svdup_n_f32(0); svfloat32_t v601 = svcmla_f32_x(pred_full, zero601, v2444, v585, 90); - svfloat32_t v713; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v157), "w"(v681)); - svfloat32_t zero720; - asm volatile("mov %0.s, #0" : "=w"(zero720)); + svfloat32_t v713 = svadd_f32_x(svptrue_b32(), v157, v681); + svfloat32_t zero720 = svdup_n_f32(0); svfloat32_t v720 = svcmla_f32_x(pred_full, zero720, v2444, v694, 90); - svfloat32_t zero728; - asm volatile("mov %0.s, #0" : "=w"(zero728)); + svfloat32_t zero728 = svdup_n_f32(0); svfloat32_t v728 = svcmla_f32_x(pred_full, zero728, v2444, v712, 90); - svfloat32_t v840; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v227), "w"(v808)); - svfloat32_t zero847; - asm volatile("mov %0.s, #0" : "=w"(zero847)); + svfloat32_t v840 = svadd_f32_x(svptrue_b32(), v227, v808); + svfloat32_t zero847 = svdup_n_f32(0); svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v2444, v821, 90); - svfloat32_t zero855; - asm volatile("mov %0.s, #0" : "=w"(zero855)); + svfloat32_t zero855 = svdup_n_f32(0); svfloat32_t v855 = svcmla_f32_x(pred_full, zero855, v2444, 
v839, 90); - svfloat32_t v967; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v297), "w"(v935)); - svfloat32_t zero974; - asm volatile("mov %0.s, #0" : "=w"(zero974)); + svfloat32_t v967 = svadd_f32_x(svptrue_b32(), v297, v935); + svfloat32_t zero974 = svdup_n_f32(0); svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2444, v948, 90); - svfloat32_t zero982; - asm volatile("mov %0.s, #0" : "=w"(zero982)); + svfloat32_t zero982 = svdup_n_f32(0); svfloat32_t v982 = svcmla_f32_x(pred_full, zero982, v2444, v966, 90); svfloat32_t v434 = svmls_f32_x(pred_full, v2057, v427, v2420); svfloat32_t v561 = svmls_f32_x(pred_full, v87, v554, v2420); @@ -18198,108 +16204,73 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1034 = svcmla_f32_x(pred_full, v967, v2161, v967, 90); svfloat32_t v1054 = svcmla_f32_x(pred_full, v840, v2161, v840, 90); svfloat32_t v452 = svnmls_f32_x(pred_full, v446, v434, v2464); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v446), "w"(v474)); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v446, v474); svfloat32_t v579 = svnmls_f32_x(pred_full, v573, v561, v2464); - svfloat32_t v602; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v573), "w"(v601)); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v573, v601); svfloat32_t v706 = svnmls_f32_x(pred_full, v700, v688, v2464); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v700), "w"(v728)); + svfloat32_t v729 = svsub_f32_x(svptrue_b32(), v700, v728); svfloat32_t v833 = svnmls_f32_x(pred_full, v827, v815, v2464); - svfloat32_t v856; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v827), "w"(v855)); + svfloat32_t v856 = svsub_f32_x(svptrue_b32(), v827, v855); svfloat32_t v960 = svnmls_f32_x(pred_full, v954, v942, v2464); - svfloat32_t v983; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v983) : "w"(v954), "w"(v982)); - svfloat32_t v1035; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1035) : "w"(v1008), "w"(v1034)); - svfloat32_t v1055; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v1055) : "w"(v1021), "w"(v1054)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v452), "w"(v466)); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v954, v982); + svfloat32_t v1035 = svsub_f32_x(svptrue_b32(), v1008, v1034); + svfloat32_t v1055 = svsub_f32_x(svptrue_b32(), v1021, v1054); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v452, v466); svfloat32_t v481 = svnmls_f32_x(pred_full, v475, v446, v2464); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v579), "w"(v593)); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v579, v593); svfloat32_t v608 = svnmls_f32_x(pred_full, v602, v573, v2464); - svfloat32_t v721; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v721) : "w"(v706), "w"(v720)); + svfloat32_t v721 = svsub_f32_x(svptrue_b32(), v706, v720); svfloat32_t v735 = svnmls_f32_x(pred_full, v729, v700, v2464); - svfloat32_t v848; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v833), "w"(v847)); + svfloat32_t v848 = svsub_f32_x(svptrue_b32(), v833, v847); svfloat32_t v862 = svnmls_f32_x(pred_full, v856, v827, v2464); - svfloat32_t v975; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v975) : "w"(v960), "w"(v974)); + svfloat32_t v975 = svsub_f32_x(svptrue_b32(), v960, v974); svfloat32_t v989 = svnmls_f32_x(pred_full, v983, v954, v2464); svfloat32_t v1041 = svnmls_f32_x(pred_full, v1035, v1008, v2464); svfloat32_t v1061 = svnmls_f32_x(pred_full, v1055, v1021, v2464); - svfloat32_t v1334; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1334) : "w"(v602), "w"(v2282)); - svfloat32_t v1347; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1347) : "w"(v729), "w"(v2410)); - svfloat32_t v1360; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1360) : "w"(v983), "w"(v2412)); - svfloat32_t v1380; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1380) : "w"(v856), "w"(v2348)); + svfloat32_t v1334 = svmul_f32_x(svptrue_b32(), v602, v2282); + svfloat32_t v1347 = svmul_f32_x(svptrue_b32(), v729, v2410); + svfloat32_t v1360 = svmul_f32_x(svptrue_b32(), v983, 
v2412); + svfloat32_t v1380 = svmul_f32_x(svptrue_b32(), v856, v2348); svfloat32_t v487 = svnmls_f32_x(pred_full, v467, v452, v2464); svfloat32_t v614 = svnmls_f32_x(pred_full, v594, v579, v2464); svfloat32_t v741 = svnmls_f32_x(pred_full, v721, v706, v2464); svfloat32_t v868 = svnmls_f32_x(pred_full, v848, v833, v2464); svfloat32_t v995 = svnmls_f32_x(pred_full, v975, v960, v2464); - svfloat32_t v1062; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1062) : "w"(v1041), "w"(v1061)); - svfloat32_t v1063; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1063) : "w"(v1041), "w"(v1061)); + svfloat32_t v1062 = svadd_f32_x(svptrue_b32(), v1041, v1061); + svfloat32_t v1063 = svsub_f32_x(svptrue_b32(), v1041, v1061); svfloat32_t v1075 = svmla_f32_x(pred_full, v1035, v1055, v2424); svfloat32_t v1093 = svnmls_f32_x(pred_full, v1055, v1035, v2424); - svfloat32_t v1167; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1167) : "w"(v594), "w"(v2218)); - svfloat32_t v1180; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1180) : "w"(v721), "w"(v2282)); - svfloat32_t v1193; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1193) : "w"(v975), "w"(v2410)); - svfloat32_t v1213; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1213) : "w"(v848), "w"(v2346)); + svfloat32_t v1167 = svmul_f32_x(svptrue_b32(), v594, v2218); + svfloat32_t v1180 = svmul_f32_x(svptrue_b32(), v721, v2282); + svfloat32_t v1193 = svmul_f32_x(svptrue_b32(), v975, v2410); + svfloat32_t v1213 = svmul_f32_x(svptrue_b32(), v848, v2346); svfloat32_t v1342 = svcmla_f32_x(pred_full, v1334, v2283, v602, 90); svfloat32_t v1355 = svcmla_f32_x(pred_full, v1347, v2411, v729, 90); svfloat32_t v1368 = svcmla_f32_x(pred_full, v1360, v2413, v983, 90); svfloat32_t v1388 = svcmla_f32_x(pred_full, v1380, v2349, v856, 90); - svfloat32_t v1501; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1501) : "w"(v608), "w"(v2346)); - svfloat32_t v1514; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1514) : "w"(v735), "w"(v2348)); - svfloat32_t v1527; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1527) : "w"(v989), "w"(v2417)); - 
svfloat32_t v1547; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1547) : "w"(v862), "w"(v2414)); - svfloat32_t v1094; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1094) : "w"(v459), "w"(v1062)); - svfloat32_t zero1109; - asm volatile("mov %0.s, #0" : "=w"(zero1109)); + svfloat32_t v1501 = svmul_f32_x(svptrue_b32(), v608, v2346); + svfloat32_t v1514 = svmul_f32_x(svptrue_b32(), v735, v2348); + svfloat32_t v1527 = svmul_f32_x(svptrue_b32(), v989, v2417); + svfloat32_t v1547 = svmul_f32_x(svptrue_b32(), v862, v2414); + svfloat32_t v1094 = svadd_f32_x(svptrue_b32(), v459, v1062); + svfloat32_t zero1109 = svdup_n_f32(0); svfloat32_t v1109 = svcmla_f32_x(pred_full, zero1109, v2444, v1075, 90); - svfloat32_t zero1125; - asm volatile("mov %0.s, #0" : "=w"(zero1125)); + svfloat32_t zero1125 = svdup_n_f32(0); svfloat32_t v1125 = svcmla_f32_x(pred_full, zero1125, v2444, v1093, 90); svfloat32_t v1175 = svcmla_f32_x(pred_full, v1167, v2219, v594, 90); svfloat32_t v1188 = svcmla_f32_x(pred_full, v1180, v2283, v721, 90); svfloat32_t v1201 = svcmla_f32_x(pred_full, v1193, v2411, v975, 90); svfloat32_t v1221 = svcmla_f32_x(pred_full, v1213, v2347, v848, 90); - svfloat32_t v1369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1369) : "w"(v1342), "w"(v1368)); - svfloat32_t v1389; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1389) : "w"(v1355), "w"(v1388)); + svfloat32_t v1369 = svsub_f32_x(svptrue_b32(), v1342, v1368); + svfloat32_t v1389 = svsub_f32_x(svptrue_b32(), v1355, v1388); svfloat32_t v1509 = svcmla_f32_x(pred_full, v1501, v2347, v608, 90); svfloat32_t v1522 = svcmla_f32_x(pred_full, v1514, v2349, v735, 90); svfloat32_t v1535 = svcmla_f32_x(pred_full, v1527, v2418, v989, 90); svfloat32_t v1555 = svcmla_f32_x(pred_full, v1547, v2354, v862, 90); - svfloat32_t v1668; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1668) : "w"(v614), "w"(v2410)); - svfloat32_t v1681; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1681) : "w"(v741), "w"(v2412)); - svfloat32_t v1694; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1694) : "w"(v995), 
"w"(v2414)); - svfloat32_t v1714; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1714) : "w"(v868), "w"(v2417)); + svfloat32_t v1668 = svmul_f32_x(svptrue_b32(), v614, v2410); + svfloat32_t v1681 = svmul_f32_x(svptrue_b32(), v741, v2412); + svfloat32_t v1694 = svmul_f32_x(svptrue_b32(), v995, v2414); + svfloat32_t v1714 = svmul_f32_x(svptrue_b32(), v868, v2417); svfloat32_t v1069 = svmls_f32_x(pred_full, v459, v1062, v2420); svint16_t v1097 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -18307,16 +16278,12 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1094, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1202) : "w"(v1175), "w"(v1201)); - svfloat32_t v1222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1222) : "w"(v1188), "w"(v1221)); + svfloat32_t v1202 = svsub_f32_x(svptrue_b32(), v1175, v1201); + svfloat32_t v1222 = svsub_f32_x(svptrue_b32(), v1188, v1221); svfloat32_t v1375 = svnmls_f32_x(pred_full, v1369, v1342, v2464); svfloat32_t v1395 = svnmls_f32_x(pred_full, v1389, v1355, v2464); - svfloat32_t v1536; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1536) : "w"(v1509), "w"(v1535)); - svfloat32_t v1556; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1556) : "w"(v1522), "w"(v1555)); + svfloat32_t v1536 = svsub_f32_x(svptrue_b32(), v1509, v1535); + svfloat32_t v1556 = svsub_f32_x(svptrue_b32(), v1522, v1555); svfloat32_t v1676 = svcmla_f32_x(pred_full, v1668, v2411, v614, 90); svfloat32_t v1689 = svcmla_f32_x(pred_full, v1681, v2413, v741, 90); svfloat32_t v1702 = svcmla_f32_x(pred_full, v1694, v2415, v995, 90); @@ -18324,46 +16291,33 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v1081 = svmls_f32_x(pred_full, v1069, v1063, v2422); svfloat32_t v1208 = svnmls_f32_x(pred_full, v1202, v1175, v2464); svfloat32_t v1228 = svnmls_f32_x(pred_full, v1222, v1188, v2464); 
- svfloat32_t v1396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1396) : "w"(v1375), "w"(v1395)); - svfloat32_t v1397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1397) : "w"(v1375), "w"(v1395)); + svfloat32_t v1396 = svadd_f32_x(svptrue_b32(), v1375, v1395); + svfloat32_t v1397 = svsub_f32_x(svptrue_b32(), v1375, v1395); svfloat32_t v1409 = svmla_f32_x(pred_full, v1369, v1389, v2424); svfloat32_t v1427 = svnmls_f32_x(pred_full, v1389, v1369, v2424); svfloat32_t v1542 = svnmls_f32_x(pred_full, v1536, v1509, v2464); svfloat32_t v1562 = svnmls_f32_x(pred_full, v1556, v1522, v2464); - svfloat32_t v1703; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1703) : "w"(v1676), "w"(v1702)); - svfloat32_t v1723; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1723) : "w"(v1689), "w"(v1722)); + svfloat32_t v1703 = svsub_f32_x(svptrue_b32(), v1676, v1702); + svfloat32_t v1723 = svsub_f32_x(svptrue_b32(), v1689, v1722); svst1w_u64(pred_full, (unsigned *)(v2175), svreinterpret_u64_s16(v1097)); svfloat32_t v1087 = svnmls_f32_x(pred_full, v1081, v1069, v2464); - svfloat32_t v1126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1126) : "w"(v1081), "w"(v1125)); - svfloat32_t v1229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1229) : "w"(v1208), "w"(v1228)); - svfloat32_t v1230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1230) : "w"(v1208), "w"(v1228)); + svfloat32_t v1126 = svsub_f32_x(svptrue_b32(), v1081, v1125); + svfloat32_t v1229 = svadd_f32_x(svptrue_b32(), v1208, v1228); + svfloat32_t v1230 = svsub_f32_x(svptrue_b32(), v1208, v1228); svfloat32_t v1242 = svmla_f32_x(pred_full, v1202, v1222, v2424); svfloat32_t v1260 = svnmls_f32_x(pred_full, v1222, v1202, v2424); - svfloat32_t v1428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1428) : "w"(v475), "w"(v1396)); - svfloat32_t zero1443; - asm volatile("mov %0.s, #0" : "=w"(zero1443)); + svfloat32_t v1428 = svadd_f32_x(svptrue_b32(), v475, v1396); + svfloat32_t zero1443 = svdup_n_f32(0); svfloat32_t v1443 = svcmla_f32_x(pred_full, zero1443, v2444, v1409, 90); - svfloat32_t zero1459; - asm 
volatile("mov %0.s, #0" : "=w"(zero1459)); + svfloat32_t zero1459 = svdup_n_f32(0); svfloat32_t v1459 = svcmla_f32_x(pred_full, zero1459, v2444, v1427, 90); - svfloat32_t v1563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1563) : "w"(v1542), "w"(v1562)); - svfloat32_t v1564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1564) : "w"(v1542), "w"(v1562)); + svfloat32_t v1563 = svadd_f32_x(svptrue_b32(), v1542, v1562); + svfloat32_t v1564 = svsub_f32_x(svptrue_b32(), v1542, v1562); svfloat32_t v1576 = svmla_f32_x(pred_full, v1536, v1556, v2424); svfloat32_t v1594 = svnmls_f32_x(pred_full, v1556, v1536, v2424); svfloat32_t v1709 = svnmls_f32_x(pred_full, v1703, v1676, v2464); svfloat32_t v1729 = svnmls_f32_x(pred_full, v1723, v1689, v2464); - svfloat32_t v1110; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1110) : "w"(v1087), "w"(v1109)); + svfloat32_t v1110 = svsub_f32_x(svptrue_b32(), v1087, v1109); svint16_t v1129 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -18371,13 +16325,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v1140 = svnmls_f32_x(pred_full, v1126, v1081, v2464); - svfloat32_t v1261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1261) : "w"(v467), "w"(v1229)); - svfloat32_t zero1276; - asm volatile("mov %0.s, #0" : "=w"(zero1276)); + svfloat32_t v1261 = svadd_f32_x(svptrue_b32(), v467, v1229); + svfloat32_t zero1276 = svdup_n_f32(0); svfloat32_t v1276 = svcmla_f32_x(pred_full, zero1276, v2444, v1242, 90); - svfloat32_t zero1292; - asm volatile("mov %0.s, #0" : "=w"(zero1292)); + svfloat32_t zero1292 = svdup_n_f32(0); svfloat32_t v1292 = svcmla_f32_x(pred_full, zero1292, v2444, v1260, 90); svfloat32_t v1403 = svmls_f32_x(pred_full, v475, v1396, v2420); svint16_t v1431 = @@ -18386,18 +16337,13 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1428, (float)(1ULL << 31ULL)))), 
svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1595) : "w"(v481), "w"(v1563)); - svfloat32_t zero1610; - asm volatile("mov %0.s, #0" : "=w"(zero1610)); + svfloat32_t v1595 = svadd_f32_x(svptrue_b32(), v481, v1563); + svfloat32_t zero1610 = svdup_n_f32(0); svfloat32_t v1610 = svcmla_f32_x(pred_full, zero1610, v2444, v1576, 90); - svfloat32_t zero1626; - asm volatile("mov %0.s, #0" : "=w"(zero1626)); + svfloat32_t zero1626 = svdup_n_f32(0); svfloat32_t v1626 = svcmla_f32_x(pred_full, zero1626, v2444, v1594, 90); - svfloat32_t v1730; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1730) : "w"(v1709), "w"(v1729)); - svfloat32_t v1731; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1731) : "w"(v1709), "w"(v1729)); + svfloat32_t v1730 = svadd_f32_x(svptrue_b32(), v1709, v1729); + svfloat32_t v1731 = svsub_f32_x(svptrue_b32(), v1709, v1729); svfloat32_t v1743 = svmla_f32_x(pred_full, v1703, v1723, v2424); svfloat32_t v1761 = svnmls_f32_x(pred_full, v1723, v1703, v2424); svint16_t v1113 = @@ -18428,13 +16374,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1595, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1762; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1762) : "w"(v487), "w"(v1730)); - svfloat32_t zero1777; - asm volatile("mov %0.s, #0" : "=w"(zero1777)); + svfloat32_t v1762 = svadd_f32_x(svptrue_b32(), v487, v1730); + svfloat32_t zero1777 = svdup_n_f32(0); svfloat32_t v1777 = svcmla_f32_x(pred_full, zero1777, v2444, v1743, 90); - svfloat32_t zero1793; - asm volatile("mov %0.s, #0" : "=w"(zero1793)); + svfloat32_t zero1793 = svdup_n_f32(0); svfloat32_t v1793 = svcmla_f32_x(pred_full, zero1793, v2444, v1761, 90); svst1w_u64(pred_full, (unsigned *)(v2195), svreinterpret_u64_s16(v1129)); svst1w_u64(pred_full, (unsigned *)(v2303), svreinterpret_u64_s16(v1431)); @@ 
-18446,8 +16389,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, 0x0000000000040004ULL))); svfloat32_t v1248 = svmls_f32_x(pred_full, v1236, v1230, v2422); svfloat32_t v1421 = svnmls_f32_x(pred_full, v1415, v1403, v2464); - svfloat32_t v1460; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1460) : "w"(v1415), "w"(v1459)); + svfloat32_t v1460 = svsub_f32_x(svptrue_b32(), v1415, v1459); svfloat32_t v1582 = svmls_f32_x(pred_full, v1570, v1564, v2422); svfloat32_t v1737 = svmls_f32_x(pred_full, v487, v1730, v2420); svint16_t v1765 = @@ -18461,10 +16403,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svst1w_u64(pred_full, (unsigned *)(v2239), svreinterpret_u64_s16(v1264)); svst1w_u64(pred_full, (unsigned *)(v2367), svreinterpret_u64_s16(v1598)); svfloat32_t v1254 = svnmls_f32_x(pred_full, v1248, v1236, v2464); - svfloat32_t v1293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1293) : "w"(v1248), "w"(v1292)); - svfloat32_t v1444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1444) : "w"(v1421), "w"(v1443)); + svfloat32_t v1293 = svsub_f32_x(svptrue_b32(), v1248, v1292); + svfloat32_t v1444 = svsub_f32_x(svptrue_b32(), v1421, v1443); svint16_t v1463 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -18473,13 +16413,11 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, 0x0000000000040004ULL))); svfloat32_t v1474 = svnmls_f32_x(pred_full, v1460, v1415, v2464); svfloat32_t v1588 = svnmls_f32_x(pred_full, v1582, v1570, v2464); - svfloat32_t v1627; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1627) : "w"(v1582), "w"(v1626)); + svfloat32_t v1627 = svsub_f32_x(svptrue_b32(), v1582, v1626); svfloat32_t v1749 = svmls_f32_x(pred_full, v1737, v1731, v2422); svst1w_u64(pred_full, (unsigned *)(v2215), svreinterpret_u64_s16(v1157)); svst1w_u64(pred_full, (unsigned *)(v2431), svreinterpret_u64_s16(v1765)); - svfloat32_t v1277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1277) : "w"(v1254), "w"(v1276)); + 
svfloat32_t v1277 = svsub_f32_x(svptrue_b32(), v1254, v1276); svint16_t v1296 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -18500,8 +16438,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v1488 = svnmls_f32_x(pred_full, v1444, v1421, v2464); - svfloat32_t v1611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1611) : "w"(v1588), "w"(v1610)); + svfloat32_t v1611 = svsub_f32_x(svptrue_b32(), v1588, v1610); svint16_t v1630 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -18510,8 +16447,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, 0x0000000000040004ULL))); svfloat32_t v1641 = svnmls_f32_x(pred_full, v1627, v1582, v2464); svfloat32_t v1755 = svnmls_f32_x(pred_full, v1749, v1737, v2464); - svfloat32_t v1794; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1794) : "w"(v1749), "w"(v1793)); + svfloat32_t v1794 = svsub_f32_x(svptrue_b32(), v1749, v1793); svst1w_u64(pred_full, (unsigned *)(v2323), svreinterpret_u64_s16(v1463)); svint16_t v1280 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -18545,8 +16481,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu25(const armral_cmplx_f32_t *restrict x, svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v1655 = svnmls_f32_x(pred_full, v1611, v1588, v2464); - svfloat32_t v1778; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1778) : "w"(v1755), "w"(v1777)); + svfloat32_t v1778 = svsub_f32_x(svptrue_b32(), v1755, v1777); svint16_t v1797 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -19376,7 +17311,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, float v1456 = 9.8078528040323043e-01F; float v1463 = -5.5557023301960218e-01F; float v1468 = -8.3146961230254524e-01F; - float v1479 = 1.0000000000000000e+00F; const float32x2_t *v1662 = &v5[v0]; int32_t *v1900 = &v6[v2]; 
int64_t v19 = v0 * 16; @@ -19474,7 +17408,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, int64_t v1443 = v2 * 30; float v1459 = v4 * v1456; float v1471 = v4 * v1468; - float v1482 = v4 * v1479; int64_t v1490 = v2 * 7; int64_t v1498 = v2 * 15; int64_t v1506 = v2 * 23; @@ -19494,6 +17427,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v2096 = svdup_n_f32(v1392); svfloat32_t v2135 = svdup_n_f32(v1451); svfloat32_t v2137 = svdup_n_f32(v1463); + svfloat32_t v2139 = svdup_n_f32(v4); int64_t v36 = v34 + v595; int64_t v50 = v48 + v595; int64_t v64 = v62 + v595; @@ -19595,7 +17529,6 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, int32_t *v2132 = &v6[v1443]; svfloat32_t v2136 = svdup_n_f32(v1459); svfloat32_t v2138 = svdup_n_f32(v1471); - svfloat32_t v2139 = svdup_n_f32(v1482); int32_t *v2146 = &v6[v1490]; int32_t *v2155 = &v6[v1498]; int32_t *v2164 = &v6[v1506]; @@ -19630,8 +17563,7 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v7)[v274])); svfloat32_t v289 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v7)[v288])); - svfloat32_t zero325; - asm volatile("mov %0.s, #0" : "=w"(zero325)); + svfloat32_t zero325 = svdup_n_f32(0); svfloat32_t v325 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero325, v1664, v324, 0), v1664, v324, 90); @@ -19725,443 +17657,282 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svld1_gather_s64index_f64(pred_full, (const double *)(v1790), v1810)); svfloat32_t v1801 = svreinterpret_f32_f64( svld1_gather_s64index_f64(pred_full, (const double *)(v1799), v1810)); - svfloat32_t zero38; - asm volatile("mov %0.s, #0" : "=w"(zero38)); + svfloat32_t zero38 = svdup_n_f32(0); svfloat32_t v38 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero38, v1529, v37, 0), v1529, v37, 90); - svfloat32_t zero52; - asm 
volatile("mov %0.s, #0" : "=w"(zero52)); + svfloat32_t zero52 = svdup_n_f32(0); svfloat32_t v52 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero52, v1538, v51, 0), v1538, v51, 90); - svfloat32_t zero66; - asm volatile("mov %0.s, #0" : "=w"(zero66)); + svfloat32_t zero66 = svdup_n_f32(0); svfloat32_t v66 = svcmla_f32_x(pred_full, svcmla_f32_x(pred_full, zero66, v1547, v65, 0), v1547, v65, 90); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero101, v1556, v100, 0), v1556, v100, 90); - svfloat32_t zero108; - asm volatile("mov %0.s, #0" : "=w"(zero108)); + svfloat32_t zero108 = svdup_n_f32(0); svfloat32_t v108 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero108, v1565, v107, 0), v1565, v107, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero143, v1574, v142, 0), v1574, v142, 90); - svfloat32_t zero150; - asm volatile("mov %0.s, #0" : "=w"(zero150)); + svfloat32_t zero150 = svdup_n_f32(0); svfloat32_t v150 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero150, v1583, v149, 0), v1583, v149, 90); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero185, v1592, v184, 0), v1592, v184, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero192, v1601, v191, 0), v1601, v191, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero206, v1610, v205, 0), v1610, v205, 90); - svfloat32_t zero220; - asm 
volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero220, v1619, v219, 0), v1619, v219, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero255, v1628, v254, 0), v1628, v254, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero262, v1637, v261, 0), v1637, v261, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); + svfloat32_t zero276 = svdup_n_f32(0); svfloat32_t v276 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero276, v1646, v275, 0), v1646, v275, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero290, v1655, v289, 0), v1655, v289, 90); - svfloat32_t zero332; - asm volatile("mov %0.s, #0" : "=w"(zero332)); + svfloat32_t zero332 = svdup_n_f32(0); svfloat32_t v332 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero332, v1674, v331, 0), v1674, v331, 90); - svfloat32_t zero346; - asm volatile("mov %0.s, #0" : "=w"(zero346)); + svfloat32_t zero346 = svdup_n_f32(0); svfloat32_t v346 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero346, v1684, v345, 0), v1684, v345, 90); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); + svfloat32_t zero360 = svdup_n_f32(0); svfloat32_t v360 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero360, v1693, v359, 0), v1693, v359, 90); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero395, v1702, v394, 0), v1702, v394, 90); - svfloat32_t 
zero402; - asm volatile("mov %0.s, #0" : "=w"(zero402)); + svfloat32_t zero402 = svdup_n_f32(0); svfloat32_t v402 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero402, v1711, v401, 0), v1711, v401, 90); - svfloat32_t zero437; - asm volatile("mov %0.s, #0" : "=w"(zero437)); + svfloat32_t zero437 = svdup_n_f32(0); svfloat32_t v437 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero437, v1720, v436, 0), v1720, v436, 90); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero444, v1729, v443, 0), v1729, v443, 90); - svfloat32_t zero479; - asm volatile("mov %0.s, #0" : "=w"(zero479)); + svfloat32_t zero479 = svdup_n_f32(0); svfloat32_t v479 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero479, v1738, v478, 0), v1738, v478, 90); - svfloat32_t zero486; - asm volatile("mov %0.s, #0" : "=w"(zero486)); + svfloat32_t zero486 = svdup_n_f32(0); svfloat32_t v486 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero486, v1747, v485, 0), v1747, v485, 90); - svfloat32_t zero500; - asm volatile("mov %0.s, #0" : "=w"(zero500)); + svfloat32_t zero500 = svdup_n_f32(0); svfloat32_t v500 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero500, v1756, v499, 0), v1756, v499, 90); - svfloat32_t zero514; - asm volatile("mov %0.s, #0" : "=w"(zero514)); + svfloat32_t zero514 = svdup_n_f32(0); svfloat32_t v514 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero514, v1765, v513, 0), v1765, v513, 90); - svfloat32_t zero549; - asm volatile("mov %0.s, #0" : "=w"(zero549)); + svfloat32_t zero549 = svdup_n_f32(0); svfloat32_t v549 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero549, v1774, v548, 0), v1774, v548, 90); - svfloat32_t zero556; - asm volatile("mov %0.s, #0" : "=w"(zero556)); + svfloat32_t zero556 = svdup_n_f32(0); svfloat32_t v556 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero556, v1783, v555, 0), v1783, v555, 90); - 
svfloat32_t zero591; - asm volatile("mov %0.s, #0" : "=w"(zero591)); + svfloat32_t zero591 = svdup_n_f32(0); svfloat32_t v591 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero591, v1792, v590, 0), v1792, v590, 90); - svfloat32_t zero598; - asm volatile("mov %0.s, #0" : "=w"(zero598)); + svfloat32_t zero598 = svdup_n_f32(0); svfloat32_t v598 = svcmla_f32_x( pred_full, svcmla_f32_x(pred_full, zero598, v1801, v597, 0), v1801, v597, 90); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v1811), "w"(v38)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v1811), "w"(v38)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v52), "w"(v66)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v52), "w"(v66)); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v101), "w"(v108)); - svfloat32_t v622; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v101), "w"(v108)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v143), "w"(v150)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v143), "w"(v150)); - svfloat32_t v677; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v185), "w"(v192)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v185), "w"(v192)); - svfloat32_t v679; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v679) : "w"(v206), "w"(v220)); - svfloat32_t v680; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v680) : "w"(v206), "w"(v220)); - svfloat32_t v692; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v255), "w"(v262)); - svfloat32_t v693; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v693) : "w"(v255), "w"(v262)); - svfloat32_t v694; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v694) : "w"(v276), "w"(v290)); - svfloat32_t v695; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v276), "w"(v290)); - svfloat32_t v837; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v837) : "w"(v325), "w"(v332)); - svfloat32_t v838; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v838) 
: "w"(v325), "w"(v332)); - svfloat32_t v839; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v346), "w"(v360)); - svfloat32_t v840; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v346), "w"(v360)); - svfloat32_t v852; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v852) : "w"(v395), "w"(v402)); - svfloat32_t v853; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v853) : "w"(v395), "w"(v402)); - svfloat32_t v854; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v854) : "w"(v437), "w"(v444)); - svfloat32_t v855; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v855) : "w"(v437), "w"(v444)); - svfloat32_t v908; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v908) : "w"(v479), "w"(v486)); - svfloat32_t v909; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v909) : "w"(v479), "w"(v486)); - svfloat32_t v910; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v910) : "w"(v500), "w"(v514)); - svfloat32_t v911; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v911) : "w"(v500), "w"(v514)); - svfloat32_t v923; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v923) : "w"(v549), "w"(v556)); - svfloat32_t v924; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v924) : "w"(v549), "w"(v556)); - svfloat32_t v925; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v925) : "w"(v591), "w"(v598)); - svfloat32_t v926; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v926) : "w"(v591), "w"(v598)); - svfloat32_t zero616; - asm volatile("mov %0.s, #0" : "=w"(zero616)); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v1811, v38); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v1811, v38); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v52, v66); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v101, v108); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v143, v150); + svfloat32_t v677 = svadd_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v678 = svsub_f32_x(svptrue_b32(), v185, v192); + svfloat32_t v679 = svadd_f32_x(svptrue_b32(), v206, v220); + 
svfloat32_t v680 = svsub_f32_x(svptrue_b32(), v206, v220); + svfloat32_t v692 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v693 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v694 = svadd_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v695 = svsub_f32_x(svptrue_b32(), v276, v290); + svfloat32_t v837 = svadd_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v838 = svsub_f32_x(svptrue_b32(), v325, v332); + svfloat32_t v839 = svadd_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v346, v360); + svfloat32_t v852 = svadd_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v853 = svsub_f32_x(svptrue_b32(), v395, v402); + svfloat32_t v854 = svadd_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v855 = svsub_f32_x(svptrue_b32(), v437, v444); + svfloat32_t v908 = svadd_f32_x(svptrue_b32(), v479, v486); + svfloat32_t v909 = svsub_f32_x(svptrue_b32(), v479, v486); + svfloat32_t v910 = svadd_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v911 = svsub_f32_x(svptrue_b32(), v500, v514); + svfloat32_t v923 = svadd_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v924 = svsub_f32_x(svptrue_b32(), v549, v556); + svfloat32_t v925 = svadd_f32_x(svptrue_b32(), v591, v598); + svfloat32_t v926 = svsub_f32_x(svptrue_b32(), v591, v598); + svfloat32_t zero616 = svdup_n_f32(0); svfloat32_t v616 = svcmla_f32_x(pred_full, zero616, v2015, v609, 90); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v606), "w"(v608)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v606), "w"(v608)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v621), "w"(v623)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v621), "w"(v623)); - svfloat32_t v642; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v622), "w"(v2012)); - svfloat32_t v654; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v654) : "w"(v624), "w"(v2014)); - svfloat32_t zero687; - asm volatile("mov %0.s, #0" : "=w"(zero687)); + 
svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v606, v608); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v621, v623); + svfloat32_t v642 = svmul_f32_x(svptrue_b32(), v622, v2012); + svfloat32_t v654 = svmul_f32_x(svptrue_b32(), v624, v2014); + svfloat32_t zero687 = svdup_n_f32(0); svfloat32_t v687 = svcmla_f32_x(pred_full, zero687, v2015, v680, 90); - svfloat32_t v688; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v688) : "w"(v677), "w"(v679)); - svfloat32_t v689; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v689) : "w"(v677), "w"(v679)); - svfloat32_t zero702; - asm volatile("mov %0.s, #0" : "=w"(zero702)); + svfloat32_t v688 = svadd_f32_x(svptrue_b32(), v677, v679); + svfloat32_t v689 = svsub_f32_x(svptrue_b32(), v677, v679); + svfloat32_t zero702 = svdup_n_f32(0); svfloat32_t v702 = svcmla_f32_x(pred_full, zero702, v2015, v695, 90); - svfloat32_t v703; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v703) : "w"(v692), "w"(v694)); - svfloat32_t v704; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v692), "w"(v694)); - svfloat32_t zero847; - asm volatile("mov %0.s, #0" : "=w"(zero847)); + svfloat32_t v703 = svadd_f32_x(svptrue_b32(), v692, v694); + svfloat32_t v704 = svsub_f32_x(svptrue_b32(), v692, v694); + svfloat32_t zero847 = svdup_n_f32(0); svfloat32_t v847 = svcmla_f32_x(pred_full, zero847, v2015, v840, 90); - svfloat32_t v848; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v837), "w"(v839)); - svfloat32_t v849; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v849) : "w"(v837), "w"(v839)); - svfloat32_t v856; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v856) : "w"(v852), "w"(v854)); - svfloat32_t v857; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v857) : "w"(v852), "w"(v854)); - svfloat32_t v873; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v873) : "w"(v853), "w"(v2012)); - svfloat32_t v885; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v885) : "w"(v855), "w"(v2014)); - svfloat32_t zero918; - asm 
volatile("mov %0.s, #0" : "=w"(zero918)); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v837, v839); + svfloat32_t v856 = svadd_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v852, v854); + svfloat32_t v873 = svmul_f32_x(svptrue_b32(), v853, v2012); + svfloat32_t v885 = svmul_f32_x(svptrue_b32(), v855, v2014); + svfloat32_t zero918 = svdup_n_f32(0); svfloat32_t v918 = svcmla_f32_x(pred_full, zero918, v2015, v911, 90); - svfloat32_t v919; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v919) : "w"(v908), "w"(v910)); - svfloat32_t v920; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v920) : "w"(v908), "w"(v910)); - svfloat32_t v927; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v927) : "w"(v923), "w"(v925)); - svfloat32_t v928; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v928) : "w"(v923), "w"(v925)); - svfloat32_t v944; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v944) : "w"(v924), "w"(v2012)); - svfloat32_t v956; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v956) : "w"(v926), "w"(v2014)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v607), "w"(v616)); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v607), "w"(v616)); - svfloat32_t zero633; - asm volatile("mov %0.s, #0" : "=w"(zero633)); + svfloat32_t v919 = svadd_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v920 = svsub_f32_x(svptrue_b32(), v908, v910); + svfloat32_t v927 = svadd_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v923, v925); + svfloat32_t v944 = svmul_f32_x(svptrue_b32(), v924, v2012); + svfloat32_t v956 = svmul_f32_x(svptrue_b32(), v926, v2014); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v607, v616); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v607, v616); + svfloat32_t zero633 = svdup_n_f32(0); svfloat32_t v633 = svcmla_f32_x(pred_full, zero633, v2015, v626, 90); - svfloat32_t v634; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v617), "w"(v625)); - svfloat32_t 
v635; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v617), "w"(v625)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v678), "w"(v687)); - svfloat32_t v691; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v691) : "w"(v678), "w"(v687)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v693), "w"(v702)); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v693), "w"(v702)); - svfloat32_t v707; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v707) : "w"(v688), "w"(v703)); - svfloat32_t v708; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v688), "w"(v703)); - svfloat32_t v763; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v763) : "w"(v689), "w"(v2012)); - svfloat32_t v775; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v775) : "w"(v704), "w"(v2014)); - svfloat32_t v850; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v850) : "w"(v838), "w"(v847)); - svfloat32_t v851; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v851) : "w"(v838), "w"(v847)); - svfloat32_t zero864; - asm volatile("mov %0.s, #0" : "=w"(zero864)); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v617, v625); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v691 = svadd_f32_x(svptrue_b32(), v678, v687); + svfloat32_t v705 = svsub_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v693, v702); + svfloat32_t v707 = svadd_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v708 = svsub_f32_x(svptrue_b32(), v688, v703); + svfloat32_t v763 = svmul_f32_x(svptrue_b32(), v689, v2012); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v704, v2014); + svfloat32_t v850 = svsub_f32_x(svptrue_b32(), v838, v847); + svfloat32_t v851 = svadd_f32_x(svptrue_b32(), v838, v847); + svfloat32_t zero864 = svdup_n_f32(0); svfloat32_t v864 = svcmla_f32_x(pred_full, zero864, v2015, v857, 90); - svfloat32_t v865; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v865) : "w"(v848), "w"(v856)); - svfloat32_t v866; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v866) : "w"(v848), "w"(v856)); - svfloat32_t v921; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v921) : "w"(v909), "w"(v918)); - svfloat32_t v922; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v922) : "w"(v909), "w"(v918)); - svfloat32_t zero935; - asm volatile("mov %0.s, #0" : "=w"(zero935)); + svfloat32_t v865 = svadd_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v866 = svsub_f32_x(svptrue_b32(), v848, v856); + svfloat32_t v921 = svsub_f32_x(svptrue_b32(), v909, v918); + svfloat32_t v922 = svadd_f32_x(svptrue_b32(), v909, v918); + svfloat32_t zero935 = svdup_n_f32(0); svfloat32_t v935 = svcmla_f32_x(pred_full, zero935, v2015, v928, 90); - svfloat32_t v936; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v936) : "w"(v919), "w"(v927)); - svfloat32_t v937; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v937) : "w"(v919), "w"(v927)); - svfloat32_t v636; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v618), "w"(v633)); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v618), "w"(v633)); + svfloat32_t v936 = svadd_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v937 = svsub_f32_x(svptrue_b32(), v919, v927); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v618, v633); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v618, v633); svfloat32_t v662 = svcmla_f32_x(pred_full, v642, v2139, v642, 90); svfloat32_t v663 = svcmla_f32_x(pred_full, v654, v2015, v654, 90); - svfloat32_t zero715; - asm volatile("mov %0.s, #0" : "=w"(zero715)); + svfloat32_t zero715 = svdup_n_f32(0); svfloat32_t v715 = svcmla_f32_x(pred_full, zero715, v2015, v708, 90); - svfloat32_t v716; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v716) : "w"(v634), "w"(v707)); - svfloat32_t v717; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v717) : "w"(v634), "w"(v707)); - svfloat32_t v724; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v724) : "w"(v690), "w"(v1930)); - svfloat32_t v736; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v736) : "w"(v705), "w"(v2094)); - svfloat32_t v802; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v802) : "w"(v691), 
"w"(v2094)); - svfloat32_t v814; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v814) : "w"(v706), "w"(v2096)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v849), "w"(v864)); - svfloat32_t v868; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v868) : "w"(v849), "w"(v864)); + svfloat32_t v716 = svadd_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v717 = svsub_f32_x(svptrue_b32(), v634, v707); + svfloat32_t v724 = svmul_f32_x(svptrue_b32(), v690, v1930); + svfloat32_t v736 = svmul_f32_x(svptrue_b32(), v705, v2094); + svfloat32_t v802 = svmul_f32_x(svptrue_b32(), v691, v2094); + svfloat32_t v814 = svmul_f32_x(svptrue_b32(), v706, v2096); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v849, v864); + svfloat32_t v868 = svadd_f32_x(svptrue_b32(), v849, v864); svfloat32_t v893 = svcmla_f32_x(pred_full, v873, v2139, v873, 90); svfloat32_t v894 = svcmla_f32_x(pred_full, v885, v2015, v885, 90); - svfloat32_t v938; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v938) : "w"(v920), "w"(v935)); - svfloat32_t v939; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v939) : "w"(v920), "w"(v935)); + svfloat32_t v938 = svsub_f32_x(svptrue_b32(), v920, v935); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v920, v935); svfloat32_t v964 = svcmla_f32_x(pred_full, v944, v2139, v944, 90); svfloat32_t v965 = svcmla_f32_x(pred_full, v956, v2015, v956, 90); - svfloat32_t v979; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v979) : "w"(v865), "w"(v936)); - svfloat32_t v980; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v980) : "w"(v865), "w"(v936)); - svfloat32_t v1241; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1241) : "w"(v866), "w"(v2012)); - svfloat32_t v1253; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1253) : "w"(v937), "w"(v2014)); - svfloat32_t v664; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v662), "w"(v663)); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v663), "w"(v662)); - svfloat32_t v718; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v635), "w"(v715)); - svfloat32_t v719; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v719) : "w"(v635), "w"(v715)); + svfloat32_t v979 = svadd_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v980 = svsub_f32_x(svptrue_b32(), v865, v936); + svfloat32_t v1241 = svmul_f32_x(svptrue_b32(), v866, v2012); + svfloat32_t v1253 = svmul_f32_x(svptrue_b32(), v937, v2014); + svfloat32_t v664 = svadd_f32_x(svptrue_b32(), v662, v663); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v663, v662); + svfloat32_t v718 = svsub_f32_x(svptrue_b32(), v635, v715); + svfloat32_t v719 = svadd_f32_x(svptrue_b32(), v635, v715); svfloat32_t v744 = svcmla_f32_x(pred_full, v724, v1931, v690, 90); svfloat32_t v745 = svcmla_f32_x(pred_full, v736, v2095, v705, 90); svfloat32_t v783 = svcmla_f32_x(pred_full, v763, v2139, v763, 90); svfloat32_t v784 = svcmla_f32_x(pred_full, v775, v2015, v775, 90); svfloat32_t v822 = svcmla_f32_x(pred_full, v802, v2095, v691, 90); svfloat32_t v823 = svcmla_f32_x(pred_full, v814, v2097, v706, 90); - svfloat32_t v895; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v893), "w"(v894)); - svfloat32_t v896; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v896) : "w"(v894), "w"(v893)); - svfloat32_t v966; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v966) : "w"(v964), "w"(v965)); - svfloat32_t v967; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v965), "w"(v964)); - svfloat32_t zero987; - asm volatile("mov %0.s, #0" : "=w"(zero987)); + svfloat32_t v895 = svadd_f32_x(svptrue_b32(), v893, v894); + svfloat32_t v896 = svsub_f32_x(svptrue_b32(), v894, v893); + svfloat32_t v966 = svadd_f32_x(svptrue_b32(), v964, v965); + svfloat32_t v967 = svsub_f32_x(svptrue_b32(), v965, v964); + svfloat32_t zero987 = svdup_n_f32(0); svfloat32_t v987 = svcmla_f32_x(pred_full, zero987, v2015, v980, 90); - svfloat32_t v988; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v988) : "w"(v716), "w"(v979)); - svfloat32_t v989; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v989) : "w"(v716), "w"(v979)); - svfloat32_t v1099; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1099) : "w"(v867), "w"(v1930)); - 
svfloat32_t v1111; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1111) : "w"(v938), "w"(v2094)); - svfloat32_t v1383; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1383) : "w"(v868), "w"(v2094)); - svfloat32_t v1395; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1395) : "w"(v939), "w"(v2096)); - svfloat32_t zero672; - asm volatile("mov %0.s, #0" : "=w"(zero672)); + svfloat32_t v988 = svadd_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v989 = svsub_f32_x(svptrue_b32(), v716, v979); + svfloat32_t v1099 = svmul_f32_x(svptrue_b32(), v867, v1930); + svfloat32_t v1111 = svmul_f32_x(svptrue_b32(), v938, v2094); + svfloat32_t v1383 = svmul_f32_x(svptrue_b32(), v868, v2094); + svfloat32_t v1395 = svmul_f32_x(svptrue_b32(), v939, v2096); + svfloat32_t zero672 = svdup_n_f32(0); svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2139, v665, 90); - svfloat32_t v673; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v619), "w"(v664)); - svfloat32_t v674; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v619), "w"(v664)); - svfloat32_t v746; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v746) : "w"(v744), "w"(v745)); - svfloat32_t v747; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v745), "w"(v744)); - svfloat32_t v785; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v783), "w"(v784)); - svfloat32_t v786; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v786) : "w"(v784), "w"(v783)); - svfloat32_t v824; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v822), "w"(v823)); - svfloat32_t v825; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v825) : "w"(v823), "w"(v822)); - svfloat32_t zero903; - asm volatile("mov %0.s, #0" : "=w"(zero903)); + svfloat32_t v673 = svadd_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v619, v664); + svfloat32_t v746 = svadd_f32_x(svptrue_b32(), v744, v745); + svfloat32_t v747 = svsub_f32_x(svptrue_b32(), v745, v744); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v783, v784); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v784, v783); + svfloat32_t v824 = 
svadd_f32_x(svptrue_b32(), v822, v823); + svfloat32_t v825 = svsub_f32_x(svptrue_b32(), v823, v822); + svfloat32_t zero903 = svdup_n_f32(0); svfloat32_t v903 = svcmla_f32_x(pred_full, zero903, v2139, v896, 90); - svfloat32_t v904; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v904) : "w"(v850), "w"(v895)); - svfloat32_t v905; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v905) : "w"(v850), "w"(v895)); - svfloat32_t zero974; - asm volatile("mov %0.s, #0" : "=w"(zero974)); + svfloat32_t v904 = svadd_f32_x(svptrue_b32(), v850, v895); + svfloat32_t v905 = svsub_f32_x(svptrue_b32(), v850, v895); + svfloat32_t zero974 = svdup_n_f32(0); svfloat32_t v974 = svcmla_f32_x(pred_full, zero974, v2139, v967, 90); - svfloat32_t v975; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v975) : "w"(v921), "w"(v966)); - svfloat32_t v976; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v976) : "w"(v921), "w"(v966)); - svfloat32_t v990; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v990) : "w"(v717), "w"(v987)); - svfloat32_t v991; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v991) : "w"(v717), "w"(v987)); + svfloat32_t v975 = svadd_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v976 = svsub_f32_x(svptrue_b32(), v921, v966); + svfloat32_t v990 = svsub_f32_x(svptrue_b32(), v717, v987); + svfloat32_t v991 = svadd_f32_x(svptrue_b32(), v717, v987); svint16_t v994 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v988, (float)(1ULL << 31ULL)))), @@ -20178,35 +17949,22 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1262 = svcmla_f32_x(pred_full, v1253, v2015, v1253, 90); svfloat32_t v1403 = svcmla_f32_x(pred_full, v1383, v2095, v868, 90); svfloat32_t v1404 = svcmla_f32_x(pred_full, v1395, v2097, v939, 90); - svfloat32_t v675; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v675) : "w"(v620), "w"(v672)); - svfloat32_t v676; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v620), "w"(v672)); - svfloat32_t zero754; - asm volatile("mov %0.s, #0" : "=w"(zero754)); + svfloat32_t 
v675 = svsub_f32_x(svptrue_b32(), v620, v672); + svfloat32_t v676 = svadd_f32_x(svptrue_b32(), v620, v672); + svfloat32_t zero754 = svdup_n_f32(0); svfloat32_t v754 = svcmla_f32_x(pred_full, zero754, v2139, v747, 90); - svfloat32_t v755; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v755) : "w"(v673), "w"(v746)); - svfloat32_t v756; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v756) : "w"(v673), "w"(v746)); - svfloat32_t zero793; - asm volatile("mov %0.s, #0" : "=w"(zero793)); + svfloat32_t v755 = svadd_f32_x(svptrue_b32(), v673, v746); + svfloat32_t v756 = svsub_f32_x(svptrue_b32(), v673, v746); + svfloat32_t zero793 = svdup_n_f32(0); svfloat32_t v793 = svcmla_f32_x(pred_full, zero793, v2139, v786, 90); - svfloat32_t v794; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v794) : "w"(v636), "w"(v785)); - svfloat32_t v795; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v795) : "w"(v636), "w"(v785)); - svfloat32_t zero832; - asm volatile("mov %0.s, #0" : "=w"(zero832)); + svfloat32_t v794 = svadd_f32_x(svptrue_b32(), v636, v785); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v636, v785); + svfloat32_t zero832 = svdup_n_f32(0); svfloat32_t v832 = svcmla_f32_x(pred_full, zero832, v2139, v825, 90); - svfloat32_t v906; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v906) : "w"(v851), "w"(v903)); - svfloat32_t v907; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v907) : "w"(v851), "w"(v903)); - svfloat32_t v977; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v977) : "w"(v922), "w"(v974)); - svfloat32_t v978; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v978) : "w"(v922), "w"(v974)); + svfloat32_t v906 = svsub_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v907 = svadd_f32_x(svptrue_b32(), v851, v903); + svfloat32_t v977 = svsub_f32_x(svptrue_b32(), v922, v974); + svfloat32_t v978 = svadd_f32_x(svptrue_b32(), v922, v974); svint16_t v1002 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v990, (float)(1ULL << 31ULL)))), @@ -20217,83 +17975,50 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const 
armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v991, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1028; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1028) : "w"(v904), "w"(v1889)); - svfloat32_t v1040; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1040) : "w"(v975), "w"(v1971)); - svfloat32_t v1121; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1121) : "w"(v1119), "w"(v1120)); - svfloat32_t v1122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1122) : "w"(v1120), "w"(v1119)); - svfloat32_t v1263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1263) : "w"(v1261), "w"(v1262)); - svfloat32_t v1264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1264) : "w"(v1262), "w"(v1261)); - svfloat32_t v1312; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1312) : "w"(v905), "w"(v2053)); - svfloat32_t v1324; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1324) : "w"(v976), "w"(v2055)); - svfloat32_t v1405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1405) : "w"(v1403), "w"(v1404)); - svfloat32_t v1406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1406) : "w"(v1404), "w"(v1403)); + svfloat32_t v1028 = svmul_f32_x(svptrue_b32(), v904, v1889); + svfloat32_t v1040 = svmul_f32_x(svptrue_b32(), v975, v1971); + svfloat32_t v1121 = svadd_f32_x(svptrue_b32(), v1119, v1120); + svfloat32_t v1122 = svsub_f32_x(svptrue_b32(), v1120, v1119); + svfloat32_t v1263 = svadd_f32_x(svptrue_b32(), v1261, v1262); + svfloat32_t v1264 = svsub_f32_x(svptrue_b32(), v1262, v1261); + svfloat32_t v1312 = svmul_f32_x(svptrue_b32(), v905, v2053); + svfloat32_t v1324 = svmul_f32_x(svptrue_b32(), v976, v2055); + svfloat32_t v1405 = svadd_f32_x(svptrue_b32(), v1403, v1404); + svfloat32_t v1406 = svsub_f32_x(svptrue_b32(), v1404, v1403); svst1w_u64(pred_full, (unsigned *)(v1859), svreinterpret_u64_s16(v994)); svst1w_u64(pred_full, (unsigned *)(v1877), svreinterpret_u64_s16(v1010)); - svfloat32_t v757; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v674), "w"(v754)); - svfloat32_t v758; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v674), "w"(v754)); - svfloat32_t v796; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v796) : "w"(v637), "w"(v793)); - svfloat32_t v797; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v797) : "w"(v637), "w"(v793)); - svfloat32_t v833; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v833) : "w"(v675), "w"(v824)); - svfloat32_t v834; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v834) : "w"(v675), "w"(v824)); - svfloat32_t v835; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v835) : "w"(v676), "w"(v832)); - svfloat32_t v836; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v836) : "w"(v676), "w"(v832)); + svfloat32_t v757 = svsub_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v758 = svadd_f32_x(svptrue_b32(), v674, v754); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v637, v793); + svfloat32_t v833 = svadd_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v834 = svsub_f32_x(svptrue_b32(), v675, v824); + svfloat32_t v835 = svsub_f32_x(svptrue_b32(), v676, v832); + svfloat32_t v836 = svadd_f32_x(svptrue_b32(), v676, v832); svfloat32_t v1048 = svcmla_f32_x(pred_full, v1028, v2056, v904, 90); svfloat32_t v1049 = svcmla_f32_x(pred_full, v1040, v1972, v975, 90); - svfloat32_t zero1129; - asm volatile("mov %0.s, #0" : "=w"(zero1129)); + svfloat32_t zero1129 = svdup_n_f32(0); svfloat32_t v1129 = svcmla_f32_x(pred_full, zero1129, v2139, v1122, 90); - svfloat32_t v1130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1130) : "w"(v794), "w"(v1121)); - svfloat32_t v1131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1131) : "w"(v794), "w"(v1121)); - svfloat32_t v1170; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1170) : "w"(v906), "w"(v1971)); - svfloat32_t v1182; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1182) : "w"(v977), "w"(v1973)); - svfloat32_t zero1271; - asm volatile("mov %0.s, #0" : "=w"(zero1271)); + svfloat32_t v1130 = svadd_f32_x(svptrue_b32(), v794, v1121); + svfloat32_t v1131 = svsub_f32_x(svptrue_b32(), v794, v1121); + svfloat32_t v1170 = 
svmul_f32_x(svptrue_b32(), v906, v1971); + svfloat32_t v1182 = svmul_f32_x(svptrue_b32(), v977, v1973); + svfloat32_t zero1271 = svdup_n_f32(0); svfloat32_t v1271 = svcmla_f32_x(pred_full, zero1271, v2139, v1264, 90); - svfloat32_t v1272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1272) : "w"(v718), "w"(v1263)); - svfloat32_t v1273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1273) : "w"(v718), "w"(v1263)); + svfloat32_t v1272 = svadd_f32_x(svptrue_b32(), v718, v1263); + svfloat32_t v1273 = svsub_f32_x(svptrue_b32(), v718, v1263); svfloat32_t v1332 = svcmla_f32_x(pred_full, v1312, v2054, v905, 90); svfloat32_t v1333 = svcmla_f32_x(pred_full, v1324, v2056, v976, 90); - svfloat32_t zero1413; - asm volatile("mov %0.s, #0" : "=w"(zero1413)); + svfloat32_t zero1413 = svdup_n_f32(0); svfloat32_t v1413 = svcmla_f32_x(pred_full, zero1413, v2139, v1406, 90); - svfloat32_t v1454; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1454) : "w"(v907), "w"(v2135)); - svfloat32_t v1466; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1466) : "w"(v978), "w"(v2137)); + svfloat32_t v1454 = svmul_f32_x(svptrue_b32(), v907, v2135); + svfloat32_t v1466 = svmul_f32_x(svptrue_b32(), v978, v2137); svst1w_u64(pred_full, (unsigned *)(v1868), svreinterpret_u64_s16(v1002)); svst1w_u64(pred_full, (unsigned *)(v1886), svreinterpret_u64_s16(v1018)); - svfloat32_t v1050; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1050) : "w"(v1048), "w"(v1049)); - svfloat32_t v1051; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1051) : "w"(v1049), "w"(v1048)); - svfloat32_t v1132; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1132) : "w"(v795), "w"(v1129)); - svfloat32_t v1133; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1133) : "w"(v795), "w"(v1129)); + svfloat32_t v1050 = svadd_f32_x(svptrue_b32(), v1048, v1049); + svfloat32_t v1051 = svsub_f32_x(svptrue_b32(), v1049, v1048); + svfloat32_t v1132 = svsub_f32_x(svptrue_b32(), v795, v1129); + svfloat32_t v1133 = svadd_f32_x(svptrue_b32(), v795, v1129); svint16_t v1136 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( 
pred_full, @@ -20308,10 +18033,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, 0x0000000000040004ULL))); svfloat32_t v1190 = svcmla_f32_x(pred_full, v1170, v1972, v906, 90); svfloat32_t v1191 = svcmla_f32_x(pred_full, v1182, v2136, v977, 90); - svfloat32_t v1274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1274) : "w"(v719), "w"(v1271)); - svfloat32_t v1275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1275) : "w"(v719), "w"(v1271)); + svfloat32_t v1274 = svsub_f32_x(svptrue_b32(), v719, v1271); + svfloat32_t v1275 = svadd_f32_x(svptrue_b32(), v719, v1271); svint16_t v1278 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -20324,27 +18047,18 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1273, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1334; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1334) : "w"(v1332), "w"(v1333)); - svfloat32_t v1335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1335) : "w"(v1333), "w"(v1332)); - svfloat32_t v1414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1414) : "w"(v796), "w"(v1405)); - svfloat32_t v1415; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1415) : "w"(v796), "w"(v1405)); - svfloat32_t v1416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1416) : "w"(v797), "w"(v1413)); - svfloat32_t v1417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1417) : "w"(v797), "w"(v1413)); + svfloat32_t v1334 = svadd_f32_x(svptrue_b32(), v1332, v1333); + svfloat32_t v1335 = svsub_f32_x(svptrue_b32(), v1333, v1332); + svfloat32_t v1414 = svadd_f32_x(svptrue_b32(), v796, v1405); + svfloat32_t v1415 = svsub_f32_x(svptrue_b32(), v796, v1405); + svfloat32_t v1416 = svsub_f32_x(svptrue_b32(), v797, v1413); + svfloat32_t v1417 = svadd_f32_x(svptrue_b32(), v797, v1413); svfloat32_t v1474 = svcmla_f32_x(pred_full, v1454, v2136, v907, 90); svfloat32_t v1475 = svcmla_f32_x(pred_full, v1466, v2138, v978, 90); - svfloat32_t 
zero1058; - asm volatile("mov %0.s, #0" : "=w"(zero1058)); + svfloat32_t zero1058 = svdup_n_f32(0); svfloat32_t v1058 = svcmla_f32_x(pred_full, zero1058, v2139, v1051, 90); - svfloat32_t v1059; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1059) : "w"(v755), "w"(v1050)); - svfloat32_t v1060; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1060) : "w"(v755), "w"(v1050)); + svfloat32_t v1059 = svadd_f32_x(svptrue_b32(), v755, v1050); + svfloat32_t v1060 = svsub_f32_x(svptrue_b32(), v755, v1050); svint16_t v1144 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -20357,10 +18071,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1133, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1192) : "w"(v1190), "w"(v1191)); - svfloat32_t v1193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1193) : "w"(v1191), "w"(v1190)); + svfloat32_t v1192 = svadd_f32_x(svptrue_b32(), v1190, v1191); + svfloat32_t v1193 = svsub_f32_x(svptrue_b32(), v1191, v1190); svint16_t v1286 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -20373,13 +18085,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1275, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t zero1342; - asm volatile("mov %0.s, #0" : "=w"(zero1342)); + svfloat32_t zero1342 = svdup_n_f32(0); svfloat32_t v1342 = svcmla_f32_x(pred_full, zero1342, v2139, v1335, 90); - svfloat32_t v1343; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1343) : "w"(v757), "w"(v1334)); - svfloat32_t v1344; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1344) : "w"(v757), "w"(v1334)); + svfloat32_t v1343 = svadd_f32_x(svptrue_b32(), v757, v1334); + svfloat32_t v1344 = svsub_f32_x(svptrue_b32(), v757, v1334); svint16_t v1420 = 
svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -20404,18 +18113,14 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1417, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1476; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1476) : "w"(v1474), "w"(v1475)); - svfloat32_t v1477; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1477) : "w"(v1475), "w"(v1474)); + svfloat32_t v1476 = svadd_f32_x(svptrue_b32(), v1474, v1475); + svfloat32_t v1477 = svsub_f32_x(svptrue_b32(), v1475, v1474); svst1w_u64(pred_full, (unsigned *)(v1941), svreinterpret_u64_s16(v1136)); svst1w_u64(pred_full, (unsigned *)(v1959), svreinterpret_u64_s16(v1152)); svst1w_u64(pred_full, (unsigned *)(v2023), svreinterpret_u64_s16(v1278)); svst1w_u64(pred_full, (unsigned *)(v2041), svreinterpret_u64_s16(v1294)); - svfloat32_t v1061; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1061) : "w"(v756), "w"(v1058)); - svfloat32_t v1062; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1062) : "w"(v756), "w"(v1058)); + svfloat32_t v1061 = svsub_f32_x(svptrue_b32(), v756, v1058); + svfloat32_t v1062 = svadd_f32_x(svptrue_b32(), v756, v1058); svint16_t v1065 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -20428,17 +18133,12 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1060, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t zero1200; - asm volatile("mov %0.s, #0" : "=w"(zero1200)); + svfloat32_t zero1200 = svdup_n_f32(0); svfloat32_t v1200 = svcmla_f32_x(pred_full, zero1200, v2139, v1193, 90); - svfloat32_t v1201; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1201) : "w"(v833), "w"(v1192)); - svfloat32_t v1202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1202) : "w"(v833), "w"(v1192)); - svfloat32_t v1345; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1345) : "w"(v758), 
"w"(v1342)); - svfloat32_t v1346; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1346) : "w"(v758), "w"(v1342)); + svfloat32_t v1201 = svadd_f32_x(svptrue_b32(), v833, v1192); + svfloat32_t v1202 = svsub_f32_x(svptrue_b32(), v833, v1192); + svfloat32_t v1345 = svsub_f32_x(svptrue_b32(), v758, v1342); + svfloat32_t v1346 = svadd_f32_x(svptrue_b32(), v758, v1342); svint16_t v1349 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -20451,13 +18151,10 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1344, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t zero1484; - asm volatile("mov %0.s, #0" : "=w"(zero1484)); + svfloat32_t zero1484 = svdup_n_f32(0); svfloat32_t v1484 = svcmla_f32_x(pred_full, zero1484, v2139, v1477, 90); - svfloat32_t v1485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1485) : "w"(v835), "w"(v1476)); - svfloat32_t v1486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1486) : "w"(v835), "w"(v1476)); + svfloat32_t v1485 = svadd_f32_x(svptrue_b32(), v835, v1476); + svfloat32_t v1486 = svsub_f32_x(svptrue_b32(), v835, v1476); svst1w_u64(pred_full, (unsigned *)(v1950), svreinterpret_u64_s16(v1144)); svst1w_u64(pred_full, (unsigned *)(v1968), svreinterpret_u64_s16(v1160)); svst1w_u64(pred_full, (unsigned *)(v2032), svreinterpret_u64_s16(v1286)); @@ -20478,10 +18175,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1062, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1203) : "w"(v834), "w"(v1200)); - svfloat32_t v1204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1204) : "w"(v834), "w"(v1200)); + svfloat32_t v1203 = svsub_f32_x(svptrue_b32(), v834, v1200); + svfloat32_t v1204 = svadd_f32_x(svptrue_b32(), v834, v1200); svint16_t v1207 = 
svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -20506,10 +18201,8 @@ void armral_fft_cf32_cf32_cs16_ab_t_gu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1346, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1487) : "w"(v836), "w"(v1484)); - svfloat32_t v1488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1488) : "w"(v836), "w"(v1484)); + svfloat32_t v1487 = svsub_f32_x(svptrue_b32(), v836, v1484); + svfloat32_t v1488 = svadd_f32_x(svptrue_b32(), v836, v1484); svint16_t v1491 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h index 896851eb572841645d9efdad724492b4617731c7..60da2ffb082cd0b634c759ddb5a8ba7987c945b7 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c index 81f601770bd55f12f3302a4321d122fb6642ee40..e13f66b823337050caf15251d51c83dde1969366 100644 --- a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_cf32_cs16_ac_n_uu.h" @@ -16,35 +18,17 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu2(const armral_cmplx_f32_t *restrict x, float dir) { const float32x2_t *v5 = 
(const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v63 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - const float32x2_t *v123 = &v5[istride]; - int32_t *v142 = &v6[ostride]; - const float32x2_t *v114 = &v5[0]; - int32_t *v133 = &v6[0]; - float32x4_t v148 = vld1q_f32((const float32_t *)v123); - float32x4_t v146 = vld1q_f32((const float32_t *)v114); - float32x4_t v35 = vaddq_f32(v146, v148); - float32x4_t v36 = vsubq_f32(v146, v148); - int16x4_t v49 = vqmovn_s32(vcvtq_n_s32_f32(v35, 15)); - int16x4_t v57 = vqmovn_s32(vcvtq_n_s32_f32(v36, 15)); - vst1_s16((int16_t *)v133, v49); - vst1_s16((int16_t *)v142, v57); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v63 * 2; j < howmany; j += 1) { - float32x2_t v80 = v5[istride]; - float32x2_t v75 = v5[0]; - float32x2_t v81 = vadd_f32(v75, v80); - float32x2_t v82 = vsub_f32(v75, v80); - int16x4_t v93 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v81, 15), (int32x2_t){0, 0})); - int16x4_t v99 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v82, 15), (int32x2_t){0, 0})); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v93), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v99), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v25 = v5[istride]; + float32x2_t v20 = v5[0]; + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + int16x4_t v38 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v26, 15), (int32x2_t){0, 0})); + int16x4_t v44 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v27, 15), (int32x2_t){0, 0})); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v38), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v44), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -74,10 +58,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu2(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v76)[0])); svfloat32_t v101 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v67)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v32) : "w"(v101), "w"(v103)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v101), "w"(v103)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v101, v103); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v101, v103); svint16_t v46 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v32, (float)(1ULL << 31ULL)))), @@ -104,74 +86,35 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu3(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v90 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v51 = -1.4999999999999998e+00F; - float v55 = 8.6602540378443871e-01F; - float v56 = -8.6602540378443871e-01F; - float32x2_t v58 = (float32x2_t){v4, v4}; - const float32x2_t *v162 = &v5[istride]; - int32_t *v200 = &v6[ostride]; - float32x2_t v52 = (float32x2_t){v51, v51}; - float32x2_t v57 = (float32x2_t){v55, v56}; - const float32x2_t *v181 = &v5[0]; - int32_t *v191 = &v6[0]; - float32x4_t v213 = vld1q_f32((const float32_t *)v162); - float32x4_t v53 = vcombine_f32(v52, v52); - float32x2_t v59 = vmul_f32(v58, v57); - const float32x2_t *v171 = &v5[istride * 2]; - int32_t *v209 = &v6[ostride * 2]; - float32x4_t v217 = vld1q_f32((const float32_t *)v181); - float32x4_t v61 = vcombine_f32(v59, v59); - float32x4_t v215 = vld1q_f32((const float32_t *)v171); - float32x4_t v35 = vaddq_f32(v213, v215); - float32x4_t v36 = vsubq_f32(v213, v215); - float32x4_t v44 = vaddq_f32(v35, v217); - float32x4_t v54 = vmulq_f32(v35, v53); - float32x4_t v60 = vrev64q_f32(v36); - float32x4_t v62 = vmulq_f32(v60, v61); - float32x4_t v63 = vaddq_f32(v44, v54); - int16x4_t v68 = vqmovn_s32(vcvtq_n_s32_f32(v44, 15)); - float32x4_t v64 = vaddq_f32(v63, v62); - float32x4_t v65 = vsubq_f32(v63, v62); - vst1_s16((int16_t *)v191, v68); - int16x4_t v76 = vqmovn_s32(vcvtq_n_s32_f32(v65, 15)); - int16x4_t v84 = vqmovn_s32(vcvtq_n_s32_f32(v64, 
15)); - vst1_s16((int16_t *)v200, v76); - vst1_s16((int16_t *)v209, v84); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v90 * 2; j < howmany; j += 1) { - float32x2_t v102 = v5[istride]; - float v121 = -1.4999999999999998e+00F; - float v124 = 8.6602540378443871e-01F; - float v125 = -8.6602540378443871e-01F; - float32x2_t v127 = (float32x2_t){v4, v4}; - float32x2_t v114 = v5[0]; - float32x2_t v122 = (float32x2_t){v121, v121}; - float32x2_t v126 = (float32x2_t){v124, v125}; - float32x2_t v107 = v5[istride * 2]; - float32x2_t v128 = vmul_f32(v127, v126); - float32x2_t v108 = vadd_f32(v102, v107); - float32x2_t v109 = vsub_f32(v102, v107); - float32x2_t v115 = vadd_f32(v108, v114); - float32x2_t v123 = vmul_f32(v108, v122); - float32x2_t v129 = vrev64_f32(v109); - float32x2_t v130 = vmul_f32(v129, v128); - float32x2_t v131 = vadd_f32(v115, v123); - int16x4_t v136 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v115, 15), (int32x2_t){0, 0})); - float32x2_t v132 = vadd_f32(v131, v130); - float32x2_t v133 = vsub_f32(v131, v130); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v136), 0); - int16x4_t v142 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v133, 15), (int32x2_t){0, 0})); - int16x4_t v148 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v132, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v142), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v148), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v39 = -1.4999999999999998e+00F; + float v42 = 8.6602540378443871e-01F; + float v43 = -8.6602540378443871e-01F; + float32x2_t v45 = (float32x2_t){v4, v4}; + float32x2_t v32 = v5[0]; + float32x2_t v40 = (float32x2_t){v39, v39}; + float32x2_t v44 = (float32x2_t){v42, v43}; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v46 = vmul_f32(v45, v44); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v33 = vadd_f32(v26, v32); + float32x2_t v41 = vmul_f32(v26, v40); + 
float32x2_t v47 = vrev64_f32(v27); + float32x2_t v48 = vmul_f32(v47, v46); + float32x2_t v49 = vadd_f32(v33, v41); + int16x4_t v54 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v33, 15), (int32x2_t){0, 0})); + float32x2_t v50 = vadd_f32(v49, v48); + float32x2_t v51 = vsub_f32(v49, v48); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v54), 0); + int16x4_t v60 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v51, 15), (int32x2_t){0, 0})); + int16x4_t v66 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v50, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v60), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v66), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -213,14 +156,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu3(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v111)[0])); svfloat32_t v148 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v101)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v146), "w"(v148)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v146), "w"(v148)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v150)); - svfloat32_t zero58; - asm volatile("mov %0.s, #0" : "=w"(zero58)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v150); + svfloat32_t zero58 = svdup_n_f32(0); svfloat32_t v58 = svcmla_f32_x(pred_full, zero58, v116, v33, 90); svfloat32_t v59 = svmla_f32_x(pred_full, v41, v32, v115); svint16_t v64 = svtbl_s16( @@ -228,10 +167,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu3(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v41, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v59), "w"(v58)); - svfloat32_t v61; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v61) : "w"(v59), "w"(v58)); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v59, v58); + svfloat32_t v61 = svsub_f32_x(svptrue_b32(), v59, v58); svst1w_u64(pred_full, (unsigned *)(v124), svreinterpret_u64_s16(v64)); svint16_t v72 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -259,80 +196,38 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu4(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v112 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v70 = 1.0000000000000000e+00F; - float v71 = -1.0000000000000000e+00F; - float32x2_t v73 = (float32x2_t){v4, v4}; - const float32x2_t *v220 = &v5[istride]; - int32_t *v248 = &v6[ostride]; - float32x2_t v72 = (float32x2_t){v70, v71}; - const float32x2_t *v202 = &v5[0]; - int32_t *v239 = &v6[0]; - float32x4_t v274 = vld1q_f32((const float32_t *)v220); - float32x2_t v74 = vmul_f32(v73, v72); - const float32x2_t *v211 = &v5[istride * 2]; - const float32x2_t *v229 = &v5[istride * 3]; - int32_t *v257 = &v6[ostride * 2]; - int32_t *v266 = &v6[ostride * 3]; - float32x4_t v270 = vld1q_f32((const float32_t *)v202); - float32x4_t v76 = vcombine_f32(v74, v74); - float32x4_t v272 = vld1q_f32((const float32_t *)v211); - float32x4_t v276 = vld1q_f32((const float32_t *)v229); - float32x4_t v35 = vaddq_f32(v270, v272); - float32x4_t v36 = vsubq_f32(v270, v272); - float32x4_t v51 = vaddq_f32(v274, v276); - float32x4_t v52 = vsubq_f32(v274, v276); - float32x4_t v53 = vaddq_f32(v35, v51); - float32x4_t v54 = vsubq_f32(v35, v51); - float32x4_t v75 = vrev64q_f32(v52); - float32x4_t v77 = vmulq_f32(v75, v76); - int16x4_t v82 = vqmovn_s32(vcvtq_n_s32_f32(v53, 15)); - int16x4_t v98 = vqmovn_s32(vcvtq_n_s32_f32(v54, 15)); - float32x4_t v78 = vaddq_f32(v36, v77); - float32x4_t v79 = vsubq_f32(v36, v77); - vst1_s16((int16_t *)v239, v82); - vst1_s16((int16_t *)v257, v98); - int16x4_t v90 = 
vqmovn_s32(vcvtq_n_s32_f32(v79, 15)); - int16x4_t v106 = vqmovn_s32(vcvtq_n_s32_f32(v78, 15)); - vst1_s16((int16_t *)v248, v90); - vst1_s16((int16_t *)v266, v106); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v112 * 2; j < howmany; j += 1) { - float32x2_t v136 = v5[istride]; - float v158 = 1.0000000000000000e+00F; - float v159 = -1.0000000000000000e+00F; - float32x2_t v161 = (float32x2_t){v4, v4}; - float32x2_t v124 = v5[0]; - float32x2_t v160 = (float32x2_t){v158, v159}; - float32x2_t v129 = v5[istride * 2]; - float32x2_t v141 = v5[istride * 3]; - float32x2_t v162 = vmul_f32(v161, v160); - float32x2_t v130 = vadd_f32(v124, v129); - float32x2_t v131 = vsub_f32(v124, v129); - float32x2_t v142 = vadd_f32(v136, v141); - float32x2_t v143 = vsub_f32(v136, v141); - float32x2_t v144 = vadd_f32(v130, v142); - float32x2_t v145 = vsub_f32(v130, v142); - float32x2_t v163 = vrev64_f32(v143); - float32x2_t v164 = vmul_f32(v163, v162); - int16x4_t v169 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v144, 15), (int32x2_t){0, 0})); - int16x4_t v181 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v145, 15), (int32x2_t){0, 0})); - float32x2_t v165 = vadd_f32(v131, v164); - float32x2_t v166 = vsub_f32(v131, v164); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v169), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v181), 0); - int16x4_t v175 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v166, 15), (int32x2_t){0, 0})); - int16x4_t v187 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v165, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v175), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v187), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v32 = v5[istride]; + float v54 = 1.0000000000000000e+00F; + float v55 = -1.0000000000000000e+00F; + float32x2_t v57 = (float32x2_t){v4, v4}; + float32x2_t v20 = v5[0]; + float32x2_t v56 = (float32x2_t){v54, v55}; + float32x2_t v25 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 3]; + float32x2_t 
v58 = vmul_f32(v57, v56); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v40 = vadd_f32(v26, v38); + float32x2_t v41 = vsub_f32(v26, v38); + float32x2_t v59 = vrev64_f32(v39); + float32x2_t v60 = vmul_f32(v59, v58); + int16x4_t v65 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v40, 15), (int32x2_t){0, 0})); + int16x4_t v77 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v41, 15), (int32x2_t){0, 0})); + float32x2_t v61 = vadd_f32(v27, v60); + float32x2_t v62 = vsub_f32(v27, v60); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v65), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v77), 0); + int16x4_t v71 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v62, 15), (int32x2_t){0, 0})); + int16x4_t v83 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v61, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v71), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v83), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -378,25 +273,16 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu4(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v124)[0])); svfloat32_t v193 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v142)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v187), "w"(v189)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v187), "w"(v189)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v191), "w"(v193)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v191), "w"(v193)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t zero73; - asm volatile("mov %0.s, #0" : "=w"(zero73)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v33 = 
svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t zero73 = svdup_n_f32(0); svfloat32_t v73 = svcmla_f32_x(pred_full, zero73, v148, v49, 90); - svfloat32_t v74; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v74) : "w"(v33), "w"(v73)); - svfloat32_t v75; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v33), "w"(v73)); + svfloat32_t v74 = svadd_f32_x(svptrue_b32(), v33, v73); + svfloat32_t v75 = svsub_f32_x(svptrue_b32(), v33, v73); svint16_t v78 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v50, (float)(1ULL << 31ULL)))), @@ -435,147 +321,69 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu5(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v152 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v70 = -1.2500000000000000e+00F; - float v75 = 5.5901699437494745e-01F; - float v79 = 1.5388417685876268e+00F; - float v80 = -1.5388417685876268e+00F; - float v87 = 5.8778525229247325e-01F; - float v88 = -5.8778525229247325e-01F; - float v95 = 3.6327126400268028e-01F; - float v96 = -3.6327126400268028e-01F; - float32x2_t v98 = (float32x2_t){v4, v4}; - const float32x2_t *v275 = &v5[istride]; - int32_t *v331 = &v6[ostride]; - float32x2_t v71 = (float32x2_t){v70, v70}; - float32x2_t v76 = (float32x2_t){v75, v75}; - float32x2_t v81 = (float32x2_t){v79, v80}; - float32x2_t v89 = (float32x2_t){v87, v88}; - float32x2_t v97 = (float32x2_t){v95, v96}; - const float32x2_t *v312 = &v5[0]; - int32_t *v322 = &v6[0]; - float32x4_t v362 = vld1q_f32((const float32_t *)v275); - float32x4_t v72 = vcombine_f32(v71, v71); - float32x4_t v77 = vcombine_f32(v76, v76); - float32x2_t v83 = vmul_f32(v98, v81); - 
float32x2_t v91 = vmul_f32(v98, v89); - float32x2_t v99 = vmul_f32(v98, v97); - const float32x2_t *v284 = &v5[istride * 4]; - const float32x2_t *v293 = &v5[istride * 3]; - const float32x2_t *v302 = &v5[istride * 2]; - int32_t *v340 = &v6[ostride * 2]; - int32_t *v349 = &v6[ostride * 3]; - int32_t *v358 = &v6[ostride * 4]; - float32x4_t v370 = vld1q_f32((const float32_t *)v312); - float32x4_t v85 = vcombine_f32(v83, v83); - float32x4_t v93 = vcombine_f32(v91, v91); - float32x4_t v101 = vcombine_f32(v99, v99); - float32x4_t v364 = vld1q_f32((const float32_t *)v284); - float32x4_t v366 = vld1q_f32((const float32_t *)v293); - float32x4_t v368 = vld1q_f32((const float32_t *)v302); - float32x4_t v35 = vaddq_f32(v362, v364); - float32x4_t v36 = vsubq_f32(v362, v364); - float32x4_t v51 = vaddq_f32(v366, v368); - float32x4_t v52 = vsubq_f32(v366, v368); - float32x4_t v53 = vaddq_f32(v35, v51); - float32x4_t v54 = vsubq_f32(v35, v51); - float32x4_t v55 = vaddq_f32(v36, v52); - float32x4_t v84 = vrev64q_f32(v36); - float32x4_t v100 = vrev64q_f32(v52); - float32x4_t v63 = vaddq_f32(v53, v370); - float32x4_t v73 = vmulq_f32(v53, v72); - float32x4_t v78 = vmulq_f32(v54, v77); - float32x4_t v86 = vmulq_f32(v84, v85); - float32x4_t v92 = vrev64q_f32(v55); - float32x4_t v102 = vmulq_f32(v100, v101); - float32x4_t v94 = vmulq_f32(v92, v93); - float32x4_t v103 = vaddq_f32(v63, v73); - int16x4_t v114 = vqmovn_s32(vcvtq_n_s32_f32(v63, 15)); - float32x4_t v104 = vaddq_f32(v103, v78); - float32x4_t v105 = vsubq_f32(v103, v78); - float32x4_t v106 = vsubq_f32(v86, v94); - float32x4_t v107 = vaddq_f32(v94, v102); - vst1_s16((int16_t *)v322, v114); - float32x4_t v108 = vaddq_f32(v104, v106); - float32x4_t v109 = vsubq_f32(v104, v106); - float32x4_t v110 = vaddq_f32(v105, v107); - float32x4_t v111 = vsubq_f32(v105, v107); - int16x4_t v122 = vqmovn_s32(vcvtq_n_s32_f32(v109, 15)); - int16x4_t v130 = vqmovn_s32(vcvtq_n_s32_f32(v111, 15)); - int16x4_t v138 = vqmovn_s32(vcvtq_n_s32_f32(v110, 15)); 
- int16x4_t v146 = vqmovn_s32(vcvtq_n_s32_f32(v108, 15)); - vst1_s16((int16_t *)v331, v122); - vst1_s16((int16_t *)v340, v130); - vst1_s16((int16_t *)v349, v138); - vst1_s16((int16_t *)v358, v146); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v152 * 2; j < howmany; j += 1) { - float32x2_t v164 = v5[istride]; - float v198 = -1.2500000000000000e+00F; - float v202 = 5.5901699437494745e-01F; - float v205 = 1.5388417685876268e+00F; - float v206 = -1.5388417685876268e+00F; - float v212 = 5.8778525229247325e-01F; - float v213 = -5.8778525229247325e-01F; - float v219 = 3.6327126400268028e-01F; - float v220 = -3.6327126400268028e-01F; - float32x2_t v222 = (float32x2_t){v4, v4}; - float32x2_t v191 = v5[0]; - float32x2_t v199 = (float32x2_t){v198, v198}; - float32x2_t v203 = (float32x2_t){v202, v202}; - float32x2_t v207 = (float32x2_t){v205, v206}; - float32x2_t v214 = (float32x2_t){v212, v213}; - float32x2_t v221 = (float32x2_t){v219, v220}; - float32x2_t v169 = v5[istride * 4]; - float32x2_t v176 = v5[istride * 3]; - float32x2_t v181 = v5[istride * 2]; - float32x2_t v209 = vmul_f32(v222, v207); - float32x2_t v216 = vmul_f32(v222, v214); - float32x2_t v223 = vmul_f32(v222, v221); - float32x2_t v170 = vadd_f32(v164, v169); - float32x2_t v171 = vsub_f32(v164, v169); - float32x2_t v182 = vadd_f32(v176, v181); - float32x2_t v183 = vsub_f32(v176, v181); - float32x2_t v184 = vadd_f32(v170, v182); - float32x2_t v185 = vsub_f32(v170, v182); - float32x2_t v186 = vadd_f32(v171, v183); - float32x2_t v210 = vrev64_f32(v171); - float32x2_t v224 = vrev64_f32(v183); - float32x2_t v192 = vadd_f32(v184, v191); - float32x2_t v200 = vmul_f32(v184, v199); - float32x2_t v204 = vmul_f32(v185, v203); - float32x2_t v211 = vmul_f32(v210, v209); - float32x2_t v217 = vrev64_f32(v186); - float32x2_t v225 = vmul_f32(v224, v223); - float32x2_t v218 = vmul_f32(v217, v216); - float32x2_t v226 = vadd_f32(v192, v200); - int16x4_t v237 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v192, 15), (int32x2_t){0, 
0})); - float32x2_t v227 = vadd_f32(v226, v204); - float32x2_t v228 = vsub_f32(v226, v204); - float32x2_t v229 = vsub_f32(v211, v218); - float32x2_t v230 = vadd_f32(v218, v225); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v237), 0); - float32x2_t v231 = vadd_f32(v227, v229); - float32x2_t v232 = vsub_f32(v227, v229); - float32x2_t v233 = vadd_f32(v228, v230); - float32x2_t v234 = vsub_f32(v228, v230); - int16x4_t v243 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v232, 15), (int32x2_t){0, 0})); - int16x4_t v249 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v234, 15), (int32x2_t){0, 0})); - int16x4_t v255 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v233, 15), (int32x2_t){0, 0})); - int16x4_t v261 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v231, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v243), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v249), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v255), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v261), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v54 = -1.2500000000000000e+00F; + float v58 = 5.5901699437494745e-01F; + float v61 = 1.5388417685876268e+00F; + float v62 = -1.5388417685876268e+00F; + float v68 = 5.8778525229247325e-01F; + float v69 = -5.8778525229247325e-01F; + float v75 = 3.6327126400268028e-01F; + float v76 = -3.6327126400268028e-01F; + float32x2_t v78 = (float32x2_t){v4, v4}; + float32x2_t v47 = v5[0]; + float32x2_t v55 = (float32x2_t){v54, v54}; + float32x2_t v59 = (float32x2_t){v58, v58}; + float32x2_t v63 = (float32x2_t){v61, v62}; + float32x2_t v70 = (float32x2_t){v68, v69}; + float32x2_t v77 = (float32x2_t){v75, v76}; + float32x2_t v25 = v5[istride * 4]; + float32x2_t v32 = v5[istride * 3]; + float32x2_t v37 = v5[istride * 2]; + float32x2_t v65 = vmul_f32(v78, v63); + float32x2_t v72 = vmul_f32(v78, v70); + float32x2_t v79 = vmul_f32(v78, v77); + float32x2_t v26 = vadd_f32(v20, v25); + 
float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v40 = vadd_f32(v26, v38); + float32x2_t v41 = vsub_f32(v26, v38); + float32x2_t v42 = vadd_f32(v27, v39); + float32x2_t v66 = vrev64_f32(v27); + float32x2_t v80 = vrev64_f32(v39); + float32x2_t v48 = vadd_f32(v40, v47); + float32x2_t v56 = vmul_f32(v40, v55); + float32x2_t v60 = vmul_f32(v41, v59); + float32x2_t v67 = vmul_f32(v66, v65); + float32x2_t v73 = vrev64_f32(v42); + float32x2_t v81 = vmul_f32(v80, v79); + float32x2_t v74 = vmul_f32(v73, v72); + float32x2_t v82 = vadd_f32(v48, v56); + int16x4_t v93 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v48, 15), (int32x2_t){0, 0})); + float32x2_t v83 = vadd_f32(v82, v60); + float32x2_t v84 = vsub_f32(v82, v60); + float32x2_t v85 = vsub_f32(v67, v74); + float32x2_t v86 = vadd_f32(v74, v81); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v93), 0); + float32x2_t v87 = vadd_f32(v83, v85); + float32x2_t v88 = vsub_f32(v83, v85); + float32x2_t v89 = vadd_f32(v84, v86); + float32x2_t v90 = vsub_f32(v84, v86); + int16x4_t v99 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v88, 15), (int32x2_t){0, 0})); + int16x4_t v105 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v90, 15), (int32x2_t){0, 0})); + int16x4_t v111 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v89, 15), (int32x2_t){0, 0})); + int16x4_t v117 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v87, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v99), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v105), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v111), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v117), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -637,31 +445,20 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu5(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v170)[0])); svfloat32_t v251 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double 
*)v179)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v245), "w"(v247)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v245), "w"(v247)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v249), "w"(v251)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v249), "w"(v251)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v33), "w"(v49)); - svfloat32_t zero82; - asm volatile("mov %0.s, #0" : "=w"(zero82)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v245, v247); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v245, v247); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v249, v251); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v249, v251); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t zero82 = svdup_n_f32(0); svfloat32_t v82 = svcmla_f32_x(pred_full, zero82, v195, v33, 90); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v50), "w"(v253)); - svfloat32_t zero89; - asm volatile("mov %0.s, #0" : "=w"(zero89)); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v50, v253); + svfloat32_t zero89 = svdup_n_f32(0); svfloat32_t v89 = svcmla_f32_x(pred_full, zero89, v196, v52, 90); svfloat32_t v97 = svmla_f32_x(pred_full, v60, v50, v193); - svfloat32_t v100; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v82), "w"(v89)); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v82, v89); svfloat32_t v101 = svcmla_f32_x(pred_full, v89, v197, v49, 90); svint16_t v108 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -671,14 +468,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu5(const armral_cmplx_f32_t *restrict x, svfloat32_t v98 = svmla_f32_x(pred_full, v97, 
v51, v194); svfloat32_t v99 = svmls_f32_x(pred_full, v97, v51, v194); svst1w_u64(pred_full, (unsigned *)(v205), svreinterpret_u64_s16(v108)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v98), "w"(v100)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v98), "w"(v100)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v99), "w"(v101)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v99), "w"(v101)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v99, v101); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v99, v101); svint16_t v116 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v103, (float)(1ULL << 31ULL)))), @@ -717,131 +510,62 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu6(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v165 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v102 = -1.4999999999999998e+00F; - float v106 = 8.6602540378443871e-01F; - float v107 = -8.6602540378443871e-01F; - float32x2_t v109 = (float32x2_t){v4, v4}; - const float32x2_t *v343 = &v5[istride]; - int32_t *v380 = &v6[ostride]; - float32x2_t v103 = (float32x2_t){v102, v102}; - float32x2_t v108 = (float32x2_t){v106, v107}; - const float32x2_t *v298 = &v5[0]; - int32_t *v353 = &v6[0]; - float32x4_t v412 = vld1q_f32((const float32_t *)v343); - float32x4_t v104 = vcombine_f32(v103, v103); - float32x2_t v110 = vmul_f32(v109, v108); - const float32x2_t *v307 = &v5[istride * 3]; - const float32x2_t *v316 = &v5[istride * 2]; - const float32x2_t *v325 = &v5[istride * 5]; - const float32x2_t *v334 = &v5[istride * 4]; - int32_t *v362 = &v6[ostride * 3]; - int32_t *v371 = &v6[ostride * 4]; - int32_t *v389 = &v6[ostride * 2]; - 
int32_t *v398 = &v6[ostride * 5]; - float32x4_t v402 = vld1q_f32((const float32_t *)v298); - float32x4_t v112 = vcombine_f32(v110, v110); - float32x4_t v404 = vld1q_f32((const float32_t *)v307); - float32x4_t v406 = vld1q_f32((const float32_t *)v316); - float32x4_t v408 = vld1q_f32((const float32_t *)v325); - float32x4_t v410 = vld1q_f32((const float32_t *)v334); - float32x4_t v35 = vaddq_f32(v402, v404); - float32x4_t v36 = vsubq_f32(v402, v404); - float32x4_t v51 = vaddq_f32(v406, v408); - float32x4_t v52 = vsubq_f32(v406, v408); - float32x4_t v67 = vaddq_f32(v410, v412); - float32x4_t v68 = vsubq_f32(v410, v412); - float32x4_t v69 = vaddq_f32(v51, v67); - float32x4_t v70 = vsubq_f32(v51, v67); - float32x4_t v93 = vaddq_f32(v52, v68); - float32x4_t v94 = vsubq_f32(v52, v68); - float32x4_t v71 = vaddq_f32(v69, v35); - float32x4_t v81 = vmulq_f32(v69, v104); - float32x4_t v87 = vrev64q_f32(v70); - float32x4_t v95 = vaddq_f32(v93, v36); - float32x4_t v105 = vmulq_f32(v93, v104); - float32x4_t v111 = vrev64q_f32(v94); - float32x4_t v89 = vmulq_f32(v87, v112); - float32x4_t v90 = vaddq_f32(v71, v81); - float32x4_t v113 = vmulq_f32(v111, v112); - float32x4_t v114 = vaddq_f32(v95, v105); - int16x4_t v119 = vqmovn_s32(vcvtq_n_s32_f32(v71, 15)); - int16x4_t v127 = vqmovn_s32(vcvtq_n_s32_f32(v95, 15)); - float32x4_t v91 = vaddq_f32(v90, v89); - float32x4_t v92 = vsubq_f32(v90, v89); - float32x4_t v115 = vaddq_f32(v114, v113); - float32x4_t v116 = vsubq_f32(v114, v113); - vst1_s16((int16_t *)v353, v119); - vst1_s16((int16_t *)v362, v127); - int16x4_t v135 = vqmovn_s32(vcvtq_n_s32_f32(v92, 15)); - int16x4_t v143 = vqmovn_s32(vcvtq_n_s32_f32(v116, 15)); - int16x4_t v151 = vqmovn_s32(vcvtq_n_s32_f32(v91, 15)); - int16x4_t v159 = vqmovn_s32(vcvtq_n_s32_f32(v115, 15)); - vst1_s16((int16_t *)v371, v135); - vst1_s16((int16_t *)v380, v143); - vst1_s16((int16_t *)v389, v151); - vst1_s16((int16_t *)v398, v159); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v165 * 2; j < howmany; j 
+= 1) { - float32x2_t v206 = v5[istride]; - float v238 = -1.4999999999999998e+00F; - float v241 = 8.6602540378443871e-01F; - float v242 = -8.6602540378443871e-01F; - float32x2_t v244 = (float32x2_t){v4, v4}; - float32x2_t v177 = v5[0]; - float32x2_t v239 = (float32x2_t){v238, v238}; - float32x2_t v243 = (float32x2_t){v241, v242}; - float32x2_t v182 = v5[istride * 3]; - float32x2_t v189 = v5[istride * 2]; - float32x2_t v194 = v5[istride * 5]; - float32x2_t v201 = v5[istride * 4]; - float32x2_t v245 = vmul_f32(v244, v243); - float32x2_t v183 = vadd_f32(v177, v182); - float32x2_t v184 = vsub_f32(v177, v182); - float32x2_t v195 = vadd_f32(v189, v194); - float32x2_t v196 = vsub_f32(v189, v194); - float32x2_t v207 = vadd_f32(v201, v206); - float32x2_t v208 = vsub_f32(v201, v206); - float32x2_t v209 = vadd_f32(v195, v207); - float32x2_t v210 = vsub_f32(v195, v207); - float32x2_t v230 = vadd_f32(v196, v208); - float32x2_t v231 = vsub_f32(v196, v208); - float32x2_t v211 = vadd_f32(v209, v183); - float32x2_t v219 = vmul_f32(v209, v239); - float32x2_t v225 = vrev64_f32(v210); - float32x2_t v232 = vadd_f32(v230, v184); - float32x2_t v240 = vmul_f32(v230, v239); - float32x2_t v246 = vrev64_f32(v231); - float32x2_t v226 = vmul_f32(v225, v245); - float32x2_t v227 = vadd_f32(v211, v219); - float32x2_t v247 = vmul_f32(v246, v245); - float32x2_t v248 = vadd_f32(v232, v240); - int16x4_t v253 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v211, 15), (int32x2_t){0, 0})); - int16x4_t v259 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v232, 15), (int32x2_t){0, 0})); - float32x2_t v228 = vadd_f32(v227, v226); - float32x2_t v229 = vsub_f32(v227, v226); - float32x2_t v249 = vadd_f32(v248, v247); - float32x2_t v250 = vsub_f32(v248, v247); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v253), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v259), 0); - int16x4_t v265 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v229, 15), (int32x2_t){0, 0})); - int16x4_t v271 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v250, 15), (int32x2_t){0, 0})); - int16x4_t v277 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v228, 15), (int32x2_t){0, 0})); - int16x4_t v283 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v249, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v265), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v271), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v277), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v283), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v49 = v5[istride]; + float v81 = -1.4999999999999998e+00F; + float v84 = 8.6602540378443871e-01F; + float v85 = -8.6602540378443871e-01F; + float32x2_t v87 = (float32x2_t){v4, v4}; + float32x2_t v20 = v5[0]; + float32x2_t v82 = (float32x2_t){v81, v81}; + float32x2_t v86 = (float32x2_t){v84, v85}; + float32x2_t v25 = v5[istride * 3]; + float32x2_t v32 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 5]; + float32x2_t v44 = v5[istride * 4]; + float32x2_t v88 = vmul_f32(v87, v86); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v52 = vadd_f32(v38, v50); + float32x2_t v53 = vsub_f32(v38, v50); + float32x2_t v73 = vadd_f32(v39, v51); + float32x2_t v74 = vsub_f32(v39, v51); + float32x2_t v54 = vadd_f32(v52, v26); + float32x2_t v62 = vmul_f32(v52, v82); + float32x2_t v68 = vrev64_f32(v53); + float32x2_t v75 = vadd_f32(v73, v27); + float32x2_t v83 = vmul_f32(v73, v82); + float32x2_t v89 = vrev64_f32(v74); + float32x2_t v69 = vmul_f32(v68, v88); + float32x2_t v70 = vadd_f32(v54, v62); + float32x2_t v90 = vmul_f32(v89, v88); + float32x2_t v91 = vadd_f32(v75, v83); + int16x4_t v96 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v54, 15), (int32x2_t){0, 0})); + int16x4_t v102 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v75, 15), (int32x2_t){0, 0})); + float32x2_t v71 = vadd_f32(v70, v69); + float32x2_t v72 = vsub_f32(v70, v69); + float32x2_t v92 = vadd_f32(v91, v90); + float32x2_t v93 = vsub_f32(v91, v90); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v96), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v102), 0); + int16x4_t v108 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v72, 15), (int32x2_t){0, 0})); + int16x4_t v114 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v93, 15), (int32x2_t){0, 0})); + int16x4_t v120 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v71, 15), (int32x2_t){0, 0})); + int16x4_t v126 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v92, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v108), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v114), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v120), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v126), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -901,35 +625,21 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu6(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v194)[0])); svfloat32_t v285 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v203)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v277), "w"(v279)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v277), "w"(v279)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v281), "w"(v283)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v281), "w"(v283)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v285), "w"(v287)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v285), "w"(v287)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v48), "w"(v64)); - svfloat32_t v67; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v48), "w"(v64)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v89) : "w"(v49), "w"(v65)); - svfloat32_t v90; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v49), "w"(v65)); - svfloat32_t v68; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v68) : "w"(v66), "w"(v32)); - svfloat32_t zero85; - asm volatile("mov %0.s, #0" : "=w"(zero85)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v281, v283); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v281, v283); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v285, v287); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v285, v287); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v48, v64); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v48, v64); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v49, v65); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v49, v65); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v66, v32); + svfloat32_t zero85 = svdup_n_f32(0); svfloat32_t v85 = svcmla_f32_x(pred_full, zero85, v220, v67, 90); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v89), "w"(v33)); - svfloat32_t zero108; - asm volatile("mov %0.s, #0" : "=w"(zero108)); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v89, v33); + svfloat32_t zero108 = svdup_n_f32(0); svfloat32_t v108 = svcmla_f32_x(pred_full, zero108, v220, v90, 90); svfloat32_t v86 = svmla_f32_x(pred_full, v68, v66, v219); svfloat32_t v109 = svmla_f32_x(pred_full, v91, v89, v219); @@ -943,14 +653,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu6(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v91, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v87; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v86), "w"(v85)); - svfloat32_t v88; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v86), "w"(v85)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v109), "w"(v108)); - svfloat32_t v111; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v109), "w"(v108)); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v86, v85); + svfloat32_t v88 = svsub_f32_x(svptrue_b32(), v86, v85); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v109, v108); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v109, v108); svst1w_u64(pred_full, (unsigned *)(v228), svreinterpret_u64_s16(v114)); svst1w_u64(pred_full, (unsigned *)(v237), svreinterpret_u64_s16(v122)); svint16_t v130 = svtbl_s16( @@ -991,230 +697,108 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu7(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v219 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v93 = -1.1666666666666665e+00F; - float v98 = 7.9015646852540022e-01F; - float v103 = 5.5854267289647742e-02F; - float v108 = 7.3430220123575241e-01F; - float v112 = 4.4095855184409838e-01F; - float v113 = -4.4095855184409838e-01F; - float v120 = 3.4087293062393137e-01F; - float v121 = -3.4087293062393137e-01F; - float v128 = -5.3396936033772524e-01F; - float v129 = 5.3396936033772524e-01F; - float v136 = 8.7484229096165667e-01F; - float v137 = -8.7484229096165667e-01F; - float32x2_t v139 = (float32x2_t){v4, v4}; - const float32x2_t *v398 = &v5[istride]; - int32_t *v472 = &v6[ostride]; - float32x2_t v94 = (float32x2_t){v93, v93}; - float32x2_t v99 = (float32x2_t){v98, v98}; - float32x2_t v104 = (float32x2_t){v103, v103}; - float32x2_t v109 = (float32x2_t){v108, v108}; - float32x2_t v114 = (float32x2_t){v112, v113}; - float32x2_t v122 = (float32x2_t){v120, v121}; - float32x2_t v130 = (float32x2_t){v128, v129}; - float32x2_t v138 = (float32x2_t){v136, v137}; - const float32x2_t *v453 = &v5[0]; - int32_t *v463 = &v6[0]; - float32x4_t v521 = vld1q_f32((const float32_t *)v398); - float32x4_t v95 = vcombine_f32(v94, v94); - float32x4_t v100 = vcombine_f32(v99, v99); - float32x4_t v105 = vcombine_f32(v104, 
v104); - float32x4_t v110 = vcombine_f32(v109, v109); - float32x2_t v116 = vmul_f32(v139, v114); - float32x2_t v124 = vmul_f32(v139, v122); - float32x2_t v132 = vmul_f32(v139, v130); - float32x2_t v140 = vmul_f32(v139, v138); - const float32x2_t *v407 = &v5[istride * 6]; - const float32x2_t *v416 = &v5[istride * 4]; - const float32x2_t *v425 = &v5[istride * 3]; - const float32x2_t *v434 = &v5[istride * 2]; - const float32x2_t *v443 = &v5[istride * 5]; - int32_t *v481 = &v6[ostride * 2]; - int32_t *v490 = &v6[ostride * 3]; - int32_t *v499 = &v6[ostride * 4]; - int32_t *v508 = &v6[ostride * 5]; - int32_t *v517 = &v6[ostride * 6]; - float32x4_t v533 = vld1q_f32((const float32_t *)v453); - float32x4_t v118 = vcombine_f32(v116, v116); - float32x4_t v126 = vcombine_f32(v124, v124); - float32x4_t v134 = vcombine_f32(v132, v132); - float32x4_t v142 = vcombine_f32(v140, v140); - float32x4_t v523 = vld1q_f32((const float32_t *)v407); - float32x4_t v525 = vld1q_f32((const float32_t *)v416); - float32x4_t v527 = vld1q_f32((const float32_t *)v425); - float32x4_t v529 = vld1q_f32((const float32_t *)v434); - float32x4_t v531 = vld1q_f32((const float32_t *)v443); - float32x4_t v35 = vaddq_f32(v521, v523); - float32x4_t v36 = vsubq_f32(v521, v523); - float32x4_t v51 = vaddq_f32(v525, v527); - float32x4_t v52 = vsubq_f32(v525, v527); - float32x4_t v67 = vaddq_f32(v529, v531); - float32x4_t v68 = vsubq_f32(v529, v531); - float32x4_t v69 = vaddq_f32(v35, v51); - float32x4_t v79 = vsubq_f32(v35, v51); - float32x4_t v80 = vsubq_f32(v51, v67); - float32x4_t v81 = vsubq_f32(v67, v35); - float32x4_t v82 = vaddq_f32(v36, v52); - float32x4_t v84 = vsubq_f32(v36, v52); - float32x4_t v85 = vsubq_f32(v52, v68); - float32x4_t v86 = vsubq_f32(v68, v36); - float32x4_t v70 = vaddq_f32(v69, v67); - float32x4_t v83 = vaddq_f32(v82, v68); - float32x4_t v101 = vmulq_f32(v79, v100); - float32x4_t v106 = vmulq_f32(v80, v105); - float32x4_t v111 = vmulq_f32(v81, v110); - float32x4_t v125 = 
vrev64q_f32(v84); - float32x4_t v133 = vrev64q_f32(v85); - float32x4_t v141 = vrev64q_f32(v86); - float32x4_t v78 = vaddq_f32(v70, v533); - float32x4_t v96 = vmulq_f32(v70, v95); - float32x4_t v117 = vrev64q_f32(v83); - float32x4_t v127 = vmulq_f32(v125, v126); - float32x4_t v135 = vmulq_f32(v133, v134); - float32x4_t v143 = vmulq_f32(v141, v142); - float32x4_t v119 = vmulq_f32(v117, v118); - float32x4_t v144 = vaddq_f32(v78, v96); - int16x4_t v165 = vqmovn_s32(vcvtq_n_s32_f32(v78, 15)); - float32x4_t v145 = vaddq_f32(v144, v101); - float32x4_t v147 = vsubq_f32(v144, v101); - float32x4_t v149 = vsubq_f32(v144, v106); - float32x4_t v151 = vaddq_f32(v119, v127); - float32x4_t v153 = vsubq_f32(v119, v127); - float32x4_t v155 = vsubq_f32(v119, v135); - vst1_s16((int16_t *)v463, v165); - float32x4_t v146 = vaddq_f32(v145, v106); - float32x4_t v148 = vsubq_f32(v147, v111); - float32x4_t v150 = vaddq_f32(v149, v111); - float32x4_t v152 = vaddq_f32(v151, v135); - float32x4_t v154 = vsubq_f32(v153, v143); - float32x4_t v156 = vaddq_f32(v155, v143); - float32x4_t v157 = vaddq_f32(v146, v152); - float32x4_t v158 = vsubq_f32(v146, v152); - float32x4_t v159 = vaddq_f32(v148, v154); - float32x4_t v160 = vsubq_f32(v148, v154); - float32x4_t v161 = vaddq_f32(v150, v156); - float32x4_t v162 = vsubq_f32(v150, v156); - int16x4_t v173 = vqmovn_s32(vcvtq_n_s32_f32(v158, 15)); - int16x4_t v181 = vqmovn_s32(vcvtq_n_s32_f32(v160, 15)); - int16x4_t v189 = vqmovn_s32(vcvtq_n_s32_f32(v161, 15)); - int16x4_t v197 = vqmovn_s32(vcvtq_n_s32_f32(v162, 15)); - int16x4_t v205 = vqmovn_s32(vcvtq_n_s32_f32(v159, 15)); - int16x4_t v213 = vqmovn_s32(vcvtq_n_s32_f32(v157, 15)); - vst1_s16((int16_t *)v472, v173); - vst1_s16((int16_t *)v481, v181); - vst1_s16((int16_t *)v490, v189); - vst1_s16((int16_t *)v499, v197); - vst1_s16((int16_t *)v508, v205); - vst1_s16((int16_t *)v517, v213); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v219 * 2; j < howmany; j += 1) { - float32x2_t v231 = v5[istride]; - 
float v284 = -1.1666666666666665e+00F; - float v288 = 7.9015646852540022e-01F; - float v292 = 5.5854267289647742e-02F; - float v296 = 7.3430220123575241e-01F; - float v299 = 4.4095855184409838e-01F; - float v300 = -4.4095855184409838e-01F; - float v306 = 3.4087293062393137e-01F; - float v307 = -3.4087293062393137e-01F; - float v313 = -5.3396936033772524e-01F; - float v314 = 5.3396936033772524e-01F; - float v320 = 8.7484229096165667e-01F; - float v321 = -8.7484229096165667e-01F; - float32x2_t v323 = (float32x2_t){v4, v4}; - float32x2_t v269 = v5[0]; - float32x2_t v285 = (float32x2_t){v284, v284}; - float32x2_t v289 = (float32x2_t){v288, v288}; - float32x2_t v293 = (float32x2_t){v292, v292}; - float32x2_t v297 = (float32x2_t){v296, v296}; - float32x2_t v301 = (float32x2_t){v299, v300}; - float32x2_t v308 = (float32x2_t){v306, v307}; - float32x2_t v315 = (float32x2_t){v313, v314}; - float32x2_t v322 = (float32x2_t){v320, v321}; - float32x2_t v236 = v5[istride * 6]; - float32x2_t v243 = v5[istride * 4]; - float32x2_t v248 = v5[istride * 3]; - float32x2_t v255 = v5[istride * 2]; - float32x2_t v260 = v5[istride * 5]; - float32x2_t v303 = vmul_f32(v323, v301); - float32x2_t v310 = vmul_f32(v323, v308); - float32x2_t v317 = vmul_f32(v323, v315); - float32x2_t v324 = vmul_f32(v323, v322); - float32x2_t v237 = vadd_f32(v231, v236); - float32x2_t v238 = vsub_f32(v231, v236); - float32x2_t v249 = vadd_f32(v243, v248); - float32x2_t v250 = vsub_f32(v243, v248); - float32x2_t v261 = vadd_f32(v255, v260); - float32x2_t v262 = vsub_f32(v255, v260); - float32x2_t v263 = vadd_f32(v237, v249); - float32x2_t v271 = vsub_f32(v237, v249); - float32x2_t v272 = vsub_f32(v249, v261); - float32x2_t v273 = vsub_f32(v261, v237); - float32x2_t v274 = vadd_f32(v238, v250); - float32x2_t v276 = vsub_f32(v238, v250); - float32x2_t v277 = vsub_f32(v250, v262); - float32x2_t v278 = vsub_f32(v262, v238); - float32x2_t v264 = vadd_f32(v263, v261); - float32x2_t v275 = vadd_f32(v274, v262); - 
float32x2_t v290 = vmul_f32(v271, v289); - float32x2_t v294 = vmul_f32(v272, v293); - float32x2_t v298 = vmul_f32(v273, v297); - float32x2_t v311 = vrev64_f32(v276); - float32x2_t v318 = vrev64_f32(v277); - float32x2_t v325 = vrev64_f32(v278); - float32x2_t v270 = vadd_f32(v264, v269); - float32x2_t v286 = vmul_f32(v264, v285); - float32x2_t v304 = vrev64_f32(v275); - float32x2_t v312 = vmul_f32(v311, v310); - float32x2_t v319 = vmul_f32(v318, v317); - float32x2_t v326 = vmul_f32(v325, v324); - float32x2_t v305 = vmul_f32(v304, v303); - float32x2_t v327 = vadd_f32(v270, v286); - int16x4_t v348 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v270, 15), (int32x2_t){0, 0})); - float32x2_t v328 = vadd_f32(v327, v290); - float32x2_t v330 = vsub_f32(v327, v290); - float32x2_t v332 = vsub_f32(v327, v294); - float32x2_t v334 = vadd_f32(v305, v312); - float32x2_t v336 = vsub_f32(v305, v312); - float32x2_t v338 = vsub_f32(v305, v319); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v348), 0); - float32x2_t v329 = vadd_f32(v328, v294); - float32x2_t v331 = vsub_f32(v330, v298); - float32x2_t v333 = vadd_f32(v332, v298); - float32x2_t v335 = vadd_f32(v334, v319); - float32x2_t v337 = vsub_f32(v336, v326); - float32x2_t v339 = vadd_f32(v338, v326); - float32x2_t v340 = vadd_f32(v329, v335); - float32x2_t v341 = vsub_f32(v329, v335); - float32x2_t v342 = vadd_f32(v331, v337); - float32x2_t v343 = vsub_f32(v331, v337); - float32x2_t v344 = vadd_f32(v333, v339); - float32x2_t v345 = vsub_f32(v333, v339); - int16x4_t v354 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v341, 15), (int32x2_t){0, 0})); - int16x4_t v360 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v343, 15), (int32x2_t){0, 0})); - int16x4_t v366 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v344, 15), (int32x2_t){0, 0})); - int16x4_t v372 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v345, 15), (int32x2_t){0, 0})); - int16x4_t v378 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v342, 15), (int32x2_t){0, 0})); - int16x4_t v384 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v340, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v354), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v360), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v366), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v372), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v378), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v384), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v73 = -1.1666666666666665e+00F; + float v77 = 7.9015646852540022e-01F; + float v81 = 5.5854267289647742e-02F; + float v85 = 7.3430220123575241e-01F; + float v88 = 4.4095855184409838e-01F; + float v89 = -4.4095855184409838e-01F; + float v95 = 3.4087293062393137e-01F; + float v96 = -3.4087293062393137e-01F; + float v102 = -5.3396936033772524e-01F; + float v103 = 5.3396936033772524e-01F; + float v109 = 8.7484229096165667e-01F; + float v110 = -8.7484229096165667e-01F; + float32x2_t v112 = (float32x2_t){v4, v4}; + float32x2_t v58 = v5[0]; + float32x2_t v74 = (float32x2_t){v73, v73}; + float32x2_t v78 = (float32x2_t){v77, v77}; + float32x2_t v82 = (float32x2_t){v81, v81}; + float32x2_t v86 = (float32x2_t){v85, v85}; + float32x2_t v90 = (float32x2_t){v88, v89}; + float32x2_t v97 = (float32x2_t){v95, v96}; + float32x2_t v104 = (float32x2_t){v102, v103}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v25 = v5[istride * 6]; + float32x2_t v32 = v5[istride * 4]; + float32x2_t v37 = v5[istride * 3]; + float32x2_t v44 = v5[istride * 2]; + float32x2_t v49 = v5[istride * 5]; + float32x2_t v92 = vmul_f32(v112, v90); + float32x2_t v99 = vmul_f32(v112, v97); + float32x2_t v106 = vmul_f32(v112, v104); + float32x2_t v113 = vmul_f32(v112, v111); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, 
v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v52 = vadd_f32(v26, v38); + float32x2_t v60 = vsub_f32(v26, v38); + float32x2_t v61 = vsub_f32(v38, v50); + float32x2_t v62 = vsub_f32(v50, v26); + float32x2_t v63 = vadd_f32(v27, v39); + float32x2_t v65 = vsub_f32(v27, v39); + float32x2_t v66 = vsub_f32(v39, v51); + float32x2_t v67 = vsub_f32(v51, v27); + float32x2_t v53 = vadd_f32(v52, v50); + float32x2_t v64 = vadd_f32(v63, v51); + float32x2_t v79 = vmul_f32(v60, v78); + float32x2_t v83 = vmul_f32(v61, v82); + float32x2_t v87 = vmul_f32(v62, v86); + float32x2_t v100 = vrev64_f32(v65); + float32x2_t v107 = vrev64_f32(v66); + float32x2_t v114 = vrev64_f32(v67); + float32x2_t v59 = vadd_f32(v53, v58); + float32x2_t v75 = vmul_f32(v53, v74); + float32x2_t v93 = vrev64_f32(v64); + float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v94 = vmul_f32(v93, v92); + float32x2_t v116 = vadd_f32(v59, v75); + int16x4_t v137 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v59, 15), (int32x2_t){0, 0})); + float32x2_t v117 = vadd_f32(v116, v79); + float32x2_t v119 = vsub_f32(v116, v79); + float32x2_t v121 = vsub_f32(v116, v83); + float32x2_t v123 = vadd_f32(v94, v101); + float32x2_t v125 = vsub_f32(v94, v101); + float32x2_t v127 = vsub_f32(v94, v108); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v137), 0); + float32x2_t v118 = vadd_f32(v117, v83); + float32x2_t v120 = vsub_f32(v119, v87); + float32x2_t v122 = vadd_f32(v121, v87); + float32x2_t v124 = vadd_f32(v123, v108); + float32x2_t v126 = vsub_f32(v125, v115); + float32x2_t v128 = vadd_f32(v127, v115); + float32x2_t v129 = vadd_f32(v118, v124); + float32x2_t v130 = vsub_f32(v118, v124); + float32x2_t v131 = vadd_f32(v120, v126); + float32x2_t v132 = vsub_f32(v120, v126); + float32x2_t v133 = vadd_f32(v122, v128); + float32x2_t v134 = vsub_f32(v122, v128); + int16x4_t v143 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v130, 15), 
(int32x2_t){0, 0})); + int16x4_t v149 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v132, 15), (int32x2_t){0, 0})); + int16x4_t v155 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v133, 15), (int32x2_t){0, 0})); + int16x4_t v161 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v134, 15), (int32x2_t){0, 0})); + int16x4_t v167 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v131, 15), (int32x2_t){0, 0})); + int16x4_t v173 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v129, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v143), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v149), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v155), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v161), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v167), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v173), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -1295,59 +879,35 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu7(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v254)[0])); svfloat32_t v360 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v263)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v350), "w"(v352)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v350), "w"(v352)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v354), "w"(v356)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v354), "w"(v356)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v358), "w"(v360)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v358), "w"(v360)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v32), "w"(v48)); - svfloat32_t v76; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v76) : "w"(v32), "w"(v48)); - svfloat32_t v77; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v48), "w"(v64)); - svfloat32_t v78; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v78) : 
"w"(v64), "w"(v32)); - svfloat32_t v79; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v79) : "w"(v33), "w"(v49)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v33), "w"(v49)); - svfloat32_t v82; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v49), "w"(v65)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v65), "w"(v33)); - svfloat32_t v67; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v66), "w"(v64)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v79), "w"(v65)); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v350, v352); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v350, v352); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v76 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v77 = svsub_f32_x(svptrue_b32(), v48, v64); + svfloat32_t v78 = svsub_f32_x(svptrue_b32(), v64, v32); + svfloat32_t v79 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v82 = svsub_f32_x(svptrue_b32(), v49, v65); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v65, v33); + svfloat32_t v67 = svadd_f32_x(svptrue_b32(), v66, v64); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v79, v65); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v282, v81, 90); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v283, v82, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v284, v83, 
90); - svfloat32_t v75; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v67), "w"(v362)); - svfloat32_t zero115; - asm volatile("mov %0.s, #0" : "=w"(zero115)); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v67, v362); + svfloat32_t zero115 = svdup_n_f32(0); svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v281, v80, 90); svfloat32_t v137 = svmla_f32_x(pred_full, v75, v67, v277); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v115), "w"(v122)); - svfloat32_t v146; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v115), "w"(v122)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v115), "w"(v129)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v115, v122); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v115, v129); svint16_t v158 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v75, (float)(1ULL << 31ULL)))), @@ -1356,28 +916,19 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu7(const armral_cmplx_f32_t *restrict x, svfloat32_t v138 = svmla_f32_x(pred_full, v137, v76, v278); svfloat32_t v140 = svmls_f32_x(pred_full, v137, v76, v278); svfloat32_t v142 = svmls_f32_x(pred_full, v137, v77, v279); - svfloat32_t v145; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v144), "w"(v129)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v146), "w"(v136)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v148), "w"(v136)); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v144, v129); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v146, v136); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v148, v136); svst1w_u64(pred_full, (unsigned *)(v292), svreinterpret_u64_s16(v158)); svfloat32_t v139 = svmla_f32_x(pred_full, v138, v77, v279); svfloat32_t v141 = svmls_f32_x(pred_full, v140, v78, v280); svfloat32_t v143 = svmla_f32_x(pred_full, v142, v78, v280); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v150) : "w"(v139), "w"(v145)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v139), "w"(v145)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v141), "w"(v147)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v141), "w"(v147)); - svfloat32_t v154; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v143), "w"(v149)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v143), "w"(v149)); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v139, v145); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v139, v145); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v141, v147); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v141, v147); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v143, v149); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v143, v149); svint16_t v166 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v151, (float)(1ULL << 31ULL)))), @@ -1428,174 +979,82 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu8(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v216 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v121 = 1.0000000000000000e+00F; - float v122 = -1.0000000000000000e+00F; - float v130 = -7.0710678118654746e-01F; - float32x2_t v132 = (float32x2_t){v4, v4}; - float v138 = 7.0710678118654757e-01F; - const float32x2_t *v426 = &v5[istride]; - int32_t *v472 = &v6[ostride]; - float32x2_t v123 = (float32x2_t){v121, v122}; - float32x2_t v131 = (float32x2_t){v138, v130}; - float32x2_t v139 = (float32x2_t){v138, v138}; - const float32x2_t *v390 = &v5[0]; - int32_t *v463 = &v6[0]; - float32x4_t v538 = vld1q_f32((const float32_t *)v426); - float32x2_t v125 = vmul_f32(v132, v123); - float32x2_t v133 = vmul_f32(v132, v131); - float32x4_t v140 = vcombine_f32(v139, v139); - const float32x2_t *v399 = &v5[istride * 4]; - const 
float32x2_t *v408 = &v5[istride * 2]; - const float32x2_t *v417 = &v5[istride * 6]; - const float32x2_t *v435 = &v5[istride * 5]; - const float32x2_t *v444 = &v5[istride * 3]; - const float32x2_t *v453 = &v5[istride * 7]; - int32_t *v481 = &v6[ostride * 2]; - int32_t *v490 = &v6[ostride * 3]; - int32_t *v499 = &v6[ostride * 4]; - int32_t *v508 = &v6[ostride * 5]; - int32_t *v517 = &v6[ostride * 6]; - int32_t *v526 = &v6[ostride * 7]; - float32x4_t v530 = vld1q_f32((const float32_t *)v390); - float32x4_t v127 = vcombine_f32(v125, v125); - float32x4_t v135 = vcombine_f32(v133, v133); - float32x4_t v532 = vld1q_f32((const float32_t *)v399); - float32x4_t v534 = vld1q_f32((const float32_t *)v408); - float32x4_t v536 = vld1q_f32((const float32_t *)v417); - float32x4_t v540 = vld1q_f32((const float32_t *)v435); - float32x4_t v542 = vld1q_f32((const float32_t *)v444); - float32x4_t v544 = vld1q_f32((const float32_t *)v453); - float32x4_t v35 = vaddq_f32(v530, v532); - float32x4_t v36 = vsubq_f32(v530, v532); - float32x4_t v51 = vaddq_f32(v534, v536); - float32x4_t v52 = vsubq_f32(v534, v536); - float32x4_t v67 = vaddq_f32(v538, v540); - float32x4_t v68 = vsubq_f32(v538, v540); - float32x4_t v83 = vaddq_f32(v542, v544); - float32x4_t v84 = vsubq_f32(v542, v544); - float32x4_t v85 = vaddq_f32(v35, v51); - float32x4_t v86 = vsubq_f32(v35, v51); - float32x4_t v87 = vaddq_f32(v67, v83); - float32x4_t v88 = vsubq_f32(v67, v83); - float32x4_t v91 = vaddq_f32(v68, v84); - float32x4_t v92 = vsubq_f32(v68, v84); - float32x4_t v126 = vrev64q_f32(v52); - float32x4_t v89 = vaddq_f32(v85, v87); - float32x4_t v90 = vsubq_f32(v85, v87); - float32x4_t v113 = vrev64q_f32(v88); - float32x4_t v128 = vmulq_f32(v126, v127); - float32x4_t v134 = vrev64q_f32(v91); - float32x4_t v141 = vmulq_f32(v92, v140); - float32x4_t v115 = vmulq_f32(v113, v127); - float32x4_t v136 = vmulq_f32(v134, v135); - float32x4_t v144 = vaddq_f32(v36, v141); - float32x4_t v145 = vsubq_f32(v36, v141); - int16x4_t v154 = 
vqmovn_s32(vcvtq_n_s32_f32(v89, 15)); - int16x4_t v186 = vqmovn_s32(vcvtq_n_s32_f32(v90, 15)); - float32x4_t v142 = vaddq_f32(v86, v115); - float32x4_t v143 = vsubq_f32(v86, v115); - float32x4_t v146 = vaddq_f32(v128, v136); - float32x4_t v147 = vsubq_f32(v128, v136); - vst1_s16((int16_t *)v463, v154); - vst1_s16((int16_t *)v499, v186); - float32x4_t v148 = vaddq_f32(v144, v146); - float32x4_t v149 = vsubq_f32(v144, v146); - float32x4_t v150 = vaddq_f32(v145, v147); - float32x4_t v151 = vsubq_f32(v145, v147); - int16x4_t v170 = vqmovn_s32(vcvtq_n_s32_f32(v143, 15)); - int16x4_t v202 = vqmovn_s32(vcvtq_n_s32_f32(v142, 15)); - int16x4_t v162 = vqmovn_s32(vcvtq_n_s32_f32(v149, 15)); - int16x4_t v178 = vqmovn_s32(vcvtq_n_s32_f32(v150, 15)); - int16x4_t v194 = vqmovn_s32(vcvtq_n_s32_f32(v151, 15)); - int16x4_t v210 = vqmovn_s32(vcvtq_n_s32_f32(v148, 15)); - vst1_s16((int16_t *)v481, v170); - vst1_s16((int16_t *)v517, v202); - vst1_s16((int16_t *)v472, v162); - vst1_s16((int16_t *)v490, v178); - vst1_s16((int16_t *)v508, v194); - vst1_s16((int16_t *)v526, v210); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v216 * 2; j < howmany; j += 1) { - float32x2_t v252 = v5[istride]; - float v303 = 1.0000000000000000e+00F; - float v304 = -1.0000000000000000e+00F; - float v311 = -7.0710678118654746e-01F; - float32x2_t v313 = (float32x2_t){v4, v4}; - float v318 = 7.0710678118654757e-01F; - float32x2_t v228 = v5[0]; - float32x2_t v305 = (float32x2_t){v303, v304}; - float32x2_t v312 = (float32x2_t){v318, v311}; - float32x2_t v319 = (float32x2_t){v318, v318}; - float32x2_t v233 = v5[istride * 4]; - float32x2_t v240 = v5[istride * 2]; - float32x2_t v245 = v5[istride * 6]; - float32x2_t v257 = v5[istride * 5]; - float32x2_t v264 = v5[istride * 3]; - float32x2_t v269 = v5[istride * 7]; - float32x2_t v307 = vmul_f32(v313, v305); - float32x2_t v314 = vmul_f32(v313, v312); - float32x2_t v234 = vadd_f32(v228, v233); - float32x2_t v235 = vsub_f32(v228, v233); - float32x2_t v246 = 
vadd_f32(v240, v245); - float32x2_t v247 = vsub_f32(v240, v245); - float32x2_t v258 = vadd_f32(v252, v257); - float32x2_t v259 = vsub_f32(v252, v257); - float32x2_t v270 = vadd_f32(v264, v269); - float32x2_t v271 = vsub_f32(v264, v269); - float32x2_t v272 = vadd_f32(v234, v246); - float32x2_t v273 = vsub_f32(v234, v246); - float32x2_t v274 = vadd_f32(v258, v270); - float32x2_t v275 = vsub_f32(v258, v270); - float32x2_t v278 = vadd_f32(v259, v271); - float32x2_t v279 = vsub_f32(v259, v271); - float32x2_t v308 = vrev64_f32(v247); - float32x2_t v276 = vadd_f32(v272, v274); - float32x2_t v277 = vsub_f32(v272, v274); - float32x2_t v297 = vrev64_f32(v275); - float32x2_t v309 = vmul_f32(v308, v307); - float32x2_t v315 = vrev64_f32(v278); - float32x2_t v320 = vmul_f32(v279, v319); - float32x2_t v298 = vmul_f32(v297, v307); - float32x2_t v316 = vmul_f32(v315, v314); - float32x2_t v323 = vadd_f32(v235, v320); - float32x2_t v324 = vsub_f32(v235, v320); - int16x4_t v333 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v276, 15), (int32x2_t){0, 0})); - int16x4_t v357 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v277, 15), (int32x2_t){0, 0})); - float32x2_t v321 = vadd_f32(v273, v298); - float32x2_t v322 = vsub_f32(v273, v298); - float32x2_t v325 = vadd_f32(v309, v316); - float32x2_t v326 = vsub_f32(v309, v316); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v333), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v357), 0); - float32x2_t v327 = vadd_f32(v323, v325); - float32x2_t v328 = vsub_f32(v323, v325); - float32x2_t v329 = vadd_f32(v324, v326); - float32x2_t v330 = vsub_f32(v324, v326); - int16x4_t v345 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v322, 15), (int32x2_t){0, 0})); - int16x4_t v369 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v321, 15), (int32x2_t){0, 0})); - int16x4_t v339 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v328, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v345), 0); - int16x4_t v351 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v329, 15), (int32x2_t){0, 0})); - int16x4_t v363 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v330, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v369), 0); - int16x4_t v375 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v327, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v339), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v351), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v363), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v375), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v44 = v5[istride]; + float v95 = 1.0000000000000000e+00F; + float v96 = -1.0000000000000000e+00F; + float v103 = -7.0710678118654746e-01F; + float32x2_t v105 = (float32x2_t){v4, v4}; + float v110 = 7.0710678118654757e-01F; + float32x2_t v20 = v5[0]; + float32x2_t v97 = (float32x2_t){v95, v96}; + float32x2_t v104 = (float32x2_t){v110, v103}; + float32x2_t v111 = (float32x2_t){v110, v110}; + float32x2_t v25 = v5[istride * 4]; + float32x2_t v32 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 6]; + float32x2_t v49 = v5[istride * 5]; + float32x2_t v56 = v5[istride * 3]; + float32x2_t v61 = v5[istride * 7]; + float32x2_t v99 = vmul_f32(v105, v97); + float32x2_t v106 = vmul_f32(v105, v104); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v64 = vadd_f32(v26, v38); + float32x2_t v65 = vsub_f32(v26, v38); + float32x2_t v66 = vadd_f32(v50, v62); + float32x2_t v67 = vsub_f32(v50, v62); + float32x2_t v70 = vadd_f32(v51, v63); + float32x2_t v71 = vsub_f32(v51, v63); + float32x2_t v100 = vrev64_f32(v39); + float32x2_t v68 = vadd_f32(v64, v66); + float32x2_t v69 = 
vsub_f32(v64, v66); + float32x2_t v89 = vrev64_f32(v67); + float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v107 = vrev64_f32(v70); + float32x2_t v112 = vmul_f32(v71, v111); + float32x2_t v90 = vmul_f32(v89, v99); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v115 = vadd_f32(v27, v112); + float32x2_t v116 = vsub_f32(v27, v112); + int16x4_t v125 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v68, 15), (int32x2_t){0, 0})); + int16x4_t v149 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v69, 15), (int32x2_t){0, 0})); + float32x2_t v113 = vadd_f32(v65, v90); + float32x2_t v114 = vsub_f32(v65, v90); + float32x2_t v117 = vadd_f32(v101, v108); + float32x2_t v118 = vsub_f32(v101, v108); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v125), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v149), 0); + float32x2_t v119 = vadd_f32(v115, v117); + float32x2_t v120 = vsub_f32(v115, v117); + float32x2_t v121 = vadd_f32(v116, v118); + float32x2_t v122 = vsub_f32(v116, v118); + int16x4_t v137 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v114, 15), (int32x2_t){0, 0})); + int16x4_t v161 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v113, 15), (int32x2_t){0, 0})); + int16x4_t v131 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v120, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v137), 0); + int16x4_t v143 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v121, 15), (int32x2_t){0, 0})); + int16x4_t v155 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v122, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v161), 0); + int16x4_t v167 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v119, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v131), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v143), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v155), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v167), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -1670,57 +1129,34 @@ void 
armral_fft_cf32_cf32_cs16_ac_n_uu8(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v271)[0])); svfloat32_t v379 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v280)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v365), "w"(v367)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v365), "w"(v367)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v369), "w"(v371)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v369), "w"(v371)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v373), "w"(v375)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v373), "w"(v375)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v377), "w"(v379)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v377), "w"(v379)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v32), "w"(v48)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v32), "w"(v48)); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v64), "w"(v80)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v64), "w"(v80)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v65), "w"(v81)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v65), "w"(v81)); - svfloat32_t zero123; - asm volatile("mov %0.s, #0" : "=w"(zero123)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v369, v371); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v369, v371); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v373, v375); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v373, v375); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v377, v379); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v377, v379); + svfloat32_t v82 
= svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v65, v81); + svfloat32_t zero123 = svdup_n_f32(0); svfloat32_t v123 = svcmla_f32_x(pred_full, zero123, v288, v49, 90); - svfloat32_t v86; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v86) : "w"(v82), "w"(v84)); - svfloat32_t v87; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v82), "w"(v84)); - svfloat32_t zero111; - asm volatile("mov %0.s, #0" : "=w"(zero111)); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v82, v84); + svfloat32_t v87 = svsub_f32_x(svptrue_b32(), v82, v84); + svfloat32_t zero111 = svdup_n_f32(0); svfloat32_t v111 = svcmla_f32_x(pred_full, zero111, v288, v85, 90); - svfloat32_t zero130; - asm volatile("mov %0.s, #0" : "=w"(zero130)); + svfloat32_t zero130 = svdup_n_f32(0); svfloat32_t v130 = svcmla_f32_x(pred_full, zero130, v289, v88, 90); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v83), "w"(v111)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v83), "w"(v111)); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v83, v111); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v83, v111); svfloat32_t v138 = svmla_f32_x(pred_full, v33, v89, v290); svfloat32_t v139 = svmls_f32_x(pred_full, v33, v89, v290); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v123), "w"(v130)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v123), "w"(v130)); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v123, v130); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v123, v130); svint16_t v148 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v86, (float)(1ULL << 31ULL)))), @@ -1731,14 +1167,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu8(const 
armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v87, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v138), "w"(v140)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v138), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v139), "w"(v141)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v139), "w"(v141)); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v139, v141); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v139, v141); svint16_t v164 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v137, (float)(1ULL << 31ULL)))), @@ -1791,277 +1223,130 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v271 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v110 = -5.0000000000000000e-01F; - float v123 = -1.4999999999999998e+00F; - float v127 = 8.6602540378443871e-01F; - float v128 = -8.6602540378443871e-01F; - float v136 = 7.6604444311897801e-01F; - float v141 = 9.3969262078590832e-01F; - float v146 = -1.7364817766693039e-01F; - float v150 = 6.4278760968653925e-01F; - float v151 = -6.4278760968653925e-01F; - float v158 = -3.4202014332566888e-01F; - float v159 = 3.4202014332566888e-01F; - float v166 = 9.8480775301220802e-01F; - float v167 = -9.8480775301220802e-01F; - float32x2_t v169 = (float32x2_t){v4, v4}; - const float32x2_t *v492 = &v5[istride]; - int32_t *v584 = &v6[ostride]; - float32x2_t v111 = (float32x2_t){v110, v110}; - float32x2_t v124 = (float32x2_t){v123, v123}; - float32x2_t v129 = (float32x2_t){v127, v128}; - 
float32x2_t v137 = (float32x2_t){v136, v136}; - float32x2_t v142 = (float32x2_t){v141, v141}; - float32x2_t v147 = (float32x2_t){v146, v146}; - float32x2_t v152 = (float32x2_t){v150, v151}; - float32x2_t v160 = (float32x2_t){v158, v159}; - float32x2_t v168 = (float32x2_t){v166, v167}; - const float32x2_t *v565 = &v5[0]; - int32_t *v575 = &v6[0]; - float32x4_t v651 = vld1q_f32((const float32_t *)v492); - float32x4_t v112 = vcombine_f32(v111, v111); - float32x4_t v125 = vcombine_f32(v124, v124); - float32x2_t v131 = vmul_f32(v169, v129); - float32x4_t v138 = vcombine_f32(v137, v137); - float32x4_t v143 = vcombine_f32(v142, v142); - float32x4_t v148 = vcombine_f32(v147, v147); - float32x2_t v154 = vmul_f32(v169, v152); - float32x2_t v162 = vmul_f32(v169, v160); - float32x2_t v170 = vmul_f32(v169, v168); - const float32x2_t *v501 = &v5[istride * 8]; - const float32x2_t *v510 = &v5[istride * 7]; - const float32x2_t *v519 = &v5[istride * 2]; - const float32x2_t *v528 = &v5[istride * 3]; - const float32x2_t *v537 = &v5[istride * 6]; - const float32x2_t *v546 = &v5[istride * 4]; - const float32x2_t *v555 = &v5[istride * 5]; - int32_t *v593 = &v6[ostride * 2]; - int32_t *v602 = &v6[ostride * 3]; - int32_t *v611 = &v6[ostride * 4]; - int32_t *v620 = &v6[ostride * 5]; - int32_t *v629 = &v6[ostride * 6]; - int32_t *v638 = &v6[ostride * 7]; - int32_t *v647 = &v6[ostride * 8]; - float32x4_t v667 = vld1q_f32((const float32_t *)v565); - float32x4_t v133 = vcombine_f32(v131, v131); - float32x4_t v156 = vcombine_f32(v154, v154); - float32x4_t v164 = vcombine_f32(v162, v162); - float32x4_t v172 = vcombine_f32(v170, v170); - float32x4_t v653 = vld1q_f32((const float32_t *)v501); - float32x4_t v655 = vld1q_f32((const float32_t *)v510); - float32x4_t v657 = vld1q_f32((const float32_t *)v519); - float32x4_t v659 = vld1q_f32((const float32_t *)v528); - float32x4_t v661 = vld1q_f32((const float32_t *)v537); - float32x4_t v663 = vld1q_f32((const float32_t *)v546); - float32x4_t v665 = 
vld1q_f32((const float32_t *)v555); - float32x4_t v35 = vaddq_f32(v651, v653); - float32x4_t v36 = vsubq_f32(v651, v653); - float32x4_t v51 = vaddq_f32(v655, v657); - float32x4_t v52 = vsubq_f32(v655, v657); - float32x4_t v67 = vaddq_f32(v659, v661); - float32x4_t v68 = vsubq_f32(v659, v661); - float32x4_t v83 = vaddq_f32(v663, v665); - float32x4_t v84 = vsubq_f32(v663, v665); - float32x4_t v85 = vaddq_f32(v35, v51); - float32x4_t v96 = vaddq_f32(v36, v52); - float32x4_t v98 = vsubq_f32(v35, v51); - float32x4_t v99 = vsubq_f32(v51, v83); - float32x4_t v100 = vsubq_f32(v83, v35); - float32x4_t v101 = vsubq_f32(v36, v52); - float32x4_t v102 = vsubq_f32(v52, v84); - float32x4_t v103 = vsubq_f32(v84, v36); - float32x4_t v126 = vmulq_f32(v67, v125); - float32x4_t v132 = vrev64q_f32(v68); - float32x4_t v86 = vaddq_f32(v85, v83); - float32x4_t v97 = vaddq_f32(v96, v84); - float32x4_t v134 = vmulq_f32(v132, v133); - float32x4_t v139 = vmulq_f32(v98, v138); - float32x4_t v144 = vmulq_f32(v99, v143); - float32x4_t v149 = vmulq_f32(v100, v148); - float32x4_t v155 = vrev64q_f32(v101); - float32x4_t v163 = vrev64q_f32(v102); - float32x4_t v171 = vrev64q_f32(v103); - float32x4_t v87 = vaddq_f32(v86, v67); - float32x4_t v113 = vmulq_f32(v86, v112); - float32x4_t v119 = vrev64q_f32(v97); - float32x4_t v157 = vmulq_f32(v155, v156); - float32x4_t v165 = vmulq_f32(v163, v164); - float32x4_t v173 = vmulq_f32(v171, v172); - float32x4_t v95 = vaddq_f32(v87, v667); - float32x4_t v121 = vmulq_f32(v119, v133); - float32x4_t v174 = vaddq_f32(v113, v113); - float32x4_t v187 = vaddq_f32(v134, v157); - float32x4_t v189 = vsubq_f32(v134, v165); - float32x4_t v191 = vsubq_f32(v134, v157); - float32x4_t v175 = vaddq_f32(v174, v113); - float32x4_t v179 = vaddq_f32(v95, v126); - float32x4_t v188 = vaddq_f32(v187, v165); - float32x4_t v190 = vaddq_f32(v189, v173); - float32x4_t v192 = vsubq_f32(v191, v173); - int16x4_t v201 = vqmovn_s32(vcvtq_n_s32_f32(v95, 15)); - float32x4_t v176 = vaddq_f32(v95, 
v175); - float32x4_t v180 = vaddq_f32(v179, v174); - vst1_s16((int16_t *)v575, v201); - float32x4_t v177 = vaddq_f32(v176, v121); - float32x4_t v178 = vsubq_f32(v176, v121); - float32x4_t v181 = vaddq_f32(v180, v139); - float32x4_t v183 = vsubq_f32(v180, v144); - float32x4_t v185 = vsubq_f32(v180, v139); - float32x4_t v182 = vaddq_f32(v181, v144); - float32x4_t v184 = vaddq_f32(v183, v149); - float32x4_t v186 = vsubq_f32(v185, v149); - int16x4_t v225 = vqmovn_s32(vcvtq_n_s32_f32(v178, 15)); - int16x4_t v249 = vqmovn_s32(vcvtq_n_s32_f32(v177, 15)); - float32x4_t v193 = vaddq_f32(v182, v188); - float32x4_t v194 = vsubq_f32(v182, v188); - float32x4_t v195 = vaddq_f32(v184, v190); - float32x4_t v196 = vsubq_f32(v184, v190); - float32x4_t v197 = vaddq_f32(v186, v192); - float32x4_t v198 = vsubq_f32(v186, v192); - vst1_s16((int16_t *)v602, v225); - vst1_s16((int16_t *)v629, v249); - int16x4_t v209 = vqmovn_s32(vcvtq_n_s32_f32(v194, 15)); - int16x4_t v217 = vqmovn_s32(vcvtq_n_s32_f32(v195, 15)); - int16x4_t v233 = vqmovn_s32(vcvtq_n_s32_f32(v198, 15)); - int16x4_t v241 = vqmovn_s32(vcvtq_n_s32_f32(v197, 15)); - int16x4_t v257 = vqmovn_s32(vcvtq_n_s32_f32(v196, 15)); - int16x4_t v265 = vqmovn_s32(vcvtq_n_s32_f32(v193, 15)); - vst1_s16((int16_t *)v584, v209); - vst1_s16((int16_t *)v593, v217); - vst1_s16((int16_t *)v611, v233); - vst1_s16((int16_t *)v620, v241); - vst1_s16((int16_t *)v638, v257); - vst1_s16((int16_t *)v647, v265); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v271 * 2; j < howmany; j += 1) { - float32x2_t v283 = v5[istride]; - float v349 = -5.0000000000000000e-01F; - float v360 = -1.4999999999999998e+00F; - float v363 = 8.6602540378443871e-01F; - float v364 = -8.6602540378443871e-01F; - float v371 = 7.6604444311897801e-01F; - float v375 = 9.3969262078590832e-01F; - float v379 = -1.7364817766693039e-01F; - float v382 = 6.4278760968653925e-01F; - float v383 = -6.4278760968653925e-01F; - float v389 = -3.4202014332566888e-01F; - float v390 = 
3.4202014332566888e-01F; - float v396 = 9.8480775301220802e-01F; - float v397 = -9.8480775301220802e-01F; - float32x2_t v399 = (float32x2_t){v4, v4}; - float32x2_t v334 = v5[0]; - float32x2_t v350 = (float32x2_t){v349, v349}; - float32x2_t v361 = (float32x2_t){v360, v360}; - float32x2_t v365 = (float32x2_t){v363, v364}; - float32x2_t v372 = (float32x2_t){v371, v371}; - float32x2_t v376 = (float32x2_t){v375, v375}; - float32x2_t v380 = (float32x2_t){v379, v379}; - float32x2_t v384 = (float32x2_t){v382, v383}; - float32x2_t v391 = (float32x2_t){v389, v390}; - float32x2_t v398 = (float32x2_t){v396, v397}; - float32x2_t v288 = v5[istride * 8]; - float32x2_t v295 = v5[istride * 7]; - float32x2_t v300 = v5[istride * 2]; - float32x2_t v307 = v5[istride * 3]; - float32x2_t v312 = v5[istride * 6]; - float32x2_t v319 = v5[istride * 4]; - float32x2_t v324 = v5[istride * 5]; - float32x2_t v367 = vmul_f32(v399, v365); - float32x2_t v386 = vmul_f32(v399, v384); - float32x2_t v393 = vmul_f32(v399, v391); - float32x2_t v400 = vmul_f32(v399, v398); - float32x2_t v289 = vadd_f32(v283, v288); - float32x2_t v290 = vsub_f32(v283, v288); - float32x2_t v301 = vadd_f32(v295, v300); - float32x2_t v302 = vsub_f32(v295, v300); - float32x2_t v313 = vadd_f32(v307, v312); - float32x2_t v314 = vsub_f32(v307, v312); - float32x2_t v325 = vadd_f32(v319, v324); - float32x2_t v326 = vsub_f32(v319, v324); - float32x2_t v327 = vadd_f32(v289, v301); - float32x2_t v336 = vadd_f32(v290, v302); - float32x2_t v338 = vsub_f32(v289, v301); - float32x2_t v339 = vsub_f32(v301, v325); - float32x2_t v340 = vsub_f32(v325, v289); - float32x2_t v341 = vsub_f32(v290, v302); - float32x2_t v342 = vsub_f32(v302, v326); - float32x2_t v343 = vsub_f32(v326, v290); - float32x2_t v362 = vmul_f32(v313, v361); - float32x2_t v368 = vrev64_f32(v314); - float32x2_t v328 = vadd_f32(v327, v325); - float32x2_t v337 = vadd_f32(v336, v326); - float32x2_t v369 = vmul_f32(v368, v367); - float32x2_t v373 = vmul_f32(v338, v372); - 
float32x2_t v377 = vmul_f32(v339, v376); - float32x2_t v381 = vmul_f32(v340, v380); - float32x2_t v387 = vrev64_f32(v341); - float32x2_t v394 = vrev64_f32(v342); - float32x2_t v401 = vrev64_f32(v343); - float32x2_t v329 = vadd_f32(v328, v313); - float32x2_t v351 = vmul_f32(v328, v350); - float32x2_t v357 = vrev64_f32(v337); - float32x2_t v388 = vmul_f32(v387, v386); - float32x2_t v395 = vmul_f32(v394, v393); - float32x2_t v402 = vmul_f32(v401, v400); - float32x2_t v335 = vadd_f32(v329, v334); - float32x2_t v358 = vmul_f32(v357, v367); - float32x2_t v403 = vadd_f32(v351, v351); - float32x2_t v416 = vadd_f32(v369, v388); - float32x2_t v418 = vsub_f32(v369, v395); - float32x2_t v420 = vsub_f32(v369, v388); - float32x2_t v404 = vadd_f32(v403, v351); - float32x2_t v408 = vadd_f32(v335, v362); - float32x2_t v417 = vadd_f32(v416, v395); - float32x2_t v419 = vadd_f32(v418, v402); - float32x2_t v421 = vsub_f32(v420, v402); - int16x4_t v430 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v335, 15), (int32x2_t){0, 0})); - float32x2_t v405 = vadd_f32(v335, v404); - float32x2_t v409 = vadd_f32(v408, v403); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v430), 0); - float32x2_t v406 = vadd_f32(v405, v358); - float32x2_t v407 = vsub_f32(v405, v358); - float32x2_t v410 = vadd_f32(v409, v373); - float32x2_t v412 = vsub_f32(v409, v377); - float32x2_t v414 = vsub_f32(v409, v373); - float32x2_t v411 = vadd_f32(v410, v377); - float32x2_t v413 = vadd_f32(v412, v381); - float32x2_t v415 = vsub_f32(v414, v381); - int16x4_t v448 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v407, 15), (int32x2_t){0, 0})); - int16x4_t v466 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v406, 15), (int32x2_t){0, 0})); - float32x2_t v422 = vadd_f32(v411, v417); - float32x2_t v423 = vsub_f32(v411, v417); - float32x2_t v424 = vadd_f32(v413, v419); - float32x2_t v425 = vsub_f32(v413, v419); - float32x2_t v426 = vadd_f32(v415, v421); - float32x2_t v427 = vsub_f32(v415, v421); - v6[ostride * 3] = 
vget_lane_s32(vreinterpret_s32_s16(v448), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v466), 0); - int16x4_t v436 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v423, 15), (int32x2_t){0, 0})); - int16x4_t v442 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v424, 15), (int32x2_t){0, 0})); - int16x4_t v454 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v427, 15), (int32x2_t){0, 0})); - int16x4_t v460 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v426, 15), (int32x2_t){0, 0})); - int16x4_t v472 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v425, 15), (int32x2_t){0, 0})); - int16x4_t v478 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v422, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v436), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v442), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v454), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v460), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v472), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v478), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v86 = -5.0000000000000000e-01F; + float v97 = -1.4999999999999998e+00F; + float v100 = 8.6602540378443871e-01F; + float v101 = -8.6602540378443871e-01F; + float v108 = 7.6604444311897801e-01F; + float v112 = 9.3969262078590832e-01F; + float v116 = -1.7364817766693039e-01F; + float v119 = 6.4278760968653925e-01F; + float v120 = -6.4278760968653925e-01F; + float v126 = -3.4202014332566888e-01F; + float v127 = 3.4202014332566888e-01F; + float v133 = 9.8480775301220802e-01F; + float v134 = -9.8480775301220802e-01F; + float32x2_t v136 = (float32x2_t){v4, v4}; + float32x2_t v71 = v5[0]; + float32x2_t v87 = (float32x2_t){v86, v86}; + float32x2_t v98 = (float32x2_t){v97, v97}; + float32x2_t v102 = (float32x2_t){v100, v101}; + float32x2_t v109 = (float32x2_t){v108, v108}; + float32x2_t v113 = (float32x2_t){v112, v112}; + float32x2_t v117 = 
(float32x2_t){v116, v116}; + float32x2_t v121 = (float32x2_t){v119, v120}; + float32x2_t v128 = (float32x2_t){v126, v127}; + float32x2_t v135 = (float32x2_t){v133, v134}; + float32x2_t v25 = v5[istride * 8]; + float32x2_t v32 = v5[istride * 7]; + float32x2_t v37 = v5[istride * 2]; + float32x2_t v44 = v5[istride * 3]; + float32x2_t v49 = v5[istride * 6]; + float32x2_t v56 = v5[istride * 4]; + float32x2_t v61 = v5[istride * 5]; + float32x2_t v104 = vmul_f32(v136, v102); + float32x2_t v123 = vmul_f32(v136, v121); + float32x2_t v130 = vmul_f32(v136, v128); + float32x2_t v137 = vmul_f32(v136, v135); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v64 = vadd_f32(v26, v38); + float32x2_t v73 = vadd_f32(v27, v39); + float32x2_t v75 = vsub_f32(v26, v38); + float32x2_t v76 = vsub_f32(v38, v62); + float32x2_t v77 = vsub_f32(v62, v26); + float32x2_t v78 = vsub_f32(v27, v39); + float32x2_t v79 = vsub_f32(v39, v63); + float32x2_t v80 = vsub_f32(v63, v27); + float32x2_t v99 = vmul_f32(v50, v98); + float32x2_t v105 = vrev64_f32(v51); + float32x2_t v65 = vadd_f32(v64, v62); + float32x2_t v74 = vadd_f32(v73, v63); + float32x2_t v106 = vmul_f32(v105, v104); + float32x2_t v110 = vmul_f32(v75, v109); + float32x2_t v114 = vmul_f32(v76, v113); + float32x2_t v118 = vmul_f32(v77, v117); + float32x2_t v124 = vrev64_f32(v78); + float32x2_t v131 = vrev64_f32(v79); + float32x2_t v138 = vrev64_f32(v80); + float32x2_t v66 = vadd_f32(v65, v50); + float32x2_t v88 = vmul_f32(v65, v87); + float32x2_t v94 = vrev64_f32(v74); + float32x2_t v125 = vmul_f32(v124, v123); + float32x2_t v132 = vmul_f32(v131, v130); + float32x2_t v139 = vmul_f32(v138, v137); + float32x2_t v72 = vadd_f32(v66, v71); + float32x2_t v95 = 
vmul_f32(v94, v104); + float32x2_t v140 = vadd_f32(v88, v88); + float32x2_t v153 = vadd_f32(v106, v125); + float32x2_t v155 = vsub_f32(v106, v132); + float32x2_t v157 = vsub_f32(v106, v125); + float32x2_t v141 = vadd_f32(v140, v88); + float32x2_t v145 = vadd_f32(v72, v99); + float32x2_t v154 = vadd_f32(v153, v132); + float32x2_t v156 = vadd_f32(v155, v139); + float32x2_t v158 = vsub_f32(v157, v139); + int16x4_t v167 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v72, 15), (int32x2_t){0, 0})); + float32x2_t v142 = vadd_f32(v72, v141); + float32x2_t v146 = vadd_f32(v145, v140); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v167), 0); + float32x2_t v143 = vadd_f32(v142, v95); + float32x2_t v144 = vsub_f32(v142, v95); + float32x2_t v147 = vadd_f32(v146, v110); + float32x2_t v149 = vsub_f32(v146, v114); + float32x2_t v151 = vsub_f32(v146, v110); + float32x2_t v148 = vadd_f32(v147, v114); + float32x2_t v150 = vadd_f32(v149, v118); + float32x2_t v152 = vsub_f32(v151, v118); + int16x4_t v185 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v144, 15), (int32x2_t){0, 0})); + int16x4_t v203 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v143, 15), (int32x2_t){0, 0})); + float32x2_t v159 = vadd_f32(v148, v154); + float32x2_t v160 = vsub_f32(v148, v154); + float32x2_t v161 = vadd_f32(v150, v156); + float32x2_t v162 = vsub_f32(v150, v156); + float32x2_t v163 = vadd_f32(v152, v158); + float32x2_t v164 = vsub_f32(v152, v158); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v185), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v203), 0); + int16x4_t v173 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v160, 15), (int32x2_t){0, 0})); + int16x4_t v179 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v161, 15), (int32x2_t){0, 0})); + int16x4_t v191 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v164, 15), (int32x2_t){0, 0})); + int16x4_t v197 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v163, 15), (int32x2_t){0, 0})); + int16x4_t v209 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v162, 15), 
(int32x2_t){0, 0})); + int16x4_t v215 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v159, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v173), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v179), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v191), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v197), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v209), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v215), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -2156,77 +1441,44 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v323)[0])); svfloat32_t v453 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v332)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v439), "w"(v441)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v439), "w"(v441)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v443), "w"(v445)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v443), "w"(v445)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v447), "w"(v449)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v447), "w"(v449)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v451), "w"(v453)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v451), "w"(v453)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v32), "w"(v48)); - svfloat32_t v93; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v33), "w"(v49)); - svfloat32_t v95; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v95) : "w"(v32), "w"(v48)); - svfloat32_t v96; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v48), "w"(v80)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v80), "w"(v32)); - svfloat32_t v98; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v33), "w"(v49)); - 
svfloat32_t v99; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v49), "w"(v81)); - svfloat32_t v100; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v81), "w"(v33)); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v439, v441); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v439, v441); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v451, v453); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v451, v453); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v48, v80); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v80, v32); + svfloat32_t v98 = svsub_f32_x(svptrue_b32(), v33, v49); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v81, v33); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v349, v65, 90); - svfloat32_t v83; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v82), "w"(v80)); - svfloat32_t v94; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v93), "w"(v81)); - svfloat32_t zero151; - asm volatile("mov %0.s, #0" : "=w"(zero151)); + svfloat32_t v83 = svadd_f32_x(svptrue_b32(), v82, v80); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v93, v81); + svfloat32_t zero151 = svdup_n_f32(0); svfloat32_t v151 = svcmla_f32_x(pred_full, zero151, v353, v98, 90); - svfloat32_t zero158; - asm volatile("mov %0.s, #0" : "=w"(zero158)); + svfloat32_t zero158 = svdup_n_f32(0); svfloat32_t v158 = svcmla_f32_x(pred_full, zero158, v354, v99, 90); - svfloat32_t zero165; - asm volatile("mov %0.s, #0" : 
"=w"(zero165)); + svfloat32_t zero165 = svdup_n_f32(0); svfloat32_t v165 = svcmla_f32_x(pred_full, zero165, v355, v100, 90); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v83), "w"(v64)); - svfloat32_t v110; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v83), "w"(v346)); - svfloat32_t zero117; - asm volatile("mov %0.s, #0" : "=w"(zero117)); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v83, v64); + svfloat32_t v110 = svmul_f32_x(svptrue_b32(), v83, v346); + svfloat32_t zero117 = svdup_n_f32(0); svfloat32_t v117 = svcmla_f32_x(pred_full, zero117, v349, v94, 90); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v129), "w"(v151)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v129), "w"(v158)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v129), "w"(v151)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v84), "w"(v455)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v110), "w"(v110)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v179), "w"(v158)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v181), "w"(v165)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v183), "w"(v165)); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v129, v151); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v129, v158); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v129, v151); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v84, v455); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v110, v110); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v179, v158); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v181, v165); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v183, v165); svfloat32_t v167 = svmla_f32_x(pred_full, v166, v83, v346); svfloat32_t v171 = svmla_f32_x(pred_full, v92, v64, v348); svint16_t v193 = svtbl_s16( @@ -2234,15 +1486,11 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const 
armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v92, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v92), "w"(v167)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v171), "w"(v166)); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v92, v167); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v171, v166); svst1w_u64(pred_full, (unsigned *)(v363), svreinterpret_u64_s16(v193)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v168), "w"(v117)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v168), "w"(v117)); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v117); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v168, v117); svfloat32_t v173 = svmla_f32_x(pred_full, v172, v95, v350); svfloat32_t v175 = svmls_f32_x(pred_full, v172, v96, v351); svfloat32_t v177 = svmls_f32_x(pred_full, v172, v95, v350); @@ -2259,18 +1507,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu9(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v169, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v174), "w"(v180)); - svfloat32_t v186; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v174), "w"(v180)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v176), "w"(v182)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v176), "w"(v182)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v178), "w"(v184)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v178), "w"(v184)); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v176, v182); + 
svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v176, v182); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v178, v184); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v178, v184); svst1w_u64(pred_full, (unsigned *)(v390), svreinterpret_u64_s16(v217)); svst1w_u64(pred_full, (unsigned *)(v417), svreinterpret_u64_s16(v241)); svint16_t v201 = svtbl_s16( @@ -2323,262 +1565,124 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu10(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v293 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v171 = -1.2500000000000000e+00F; - float v176 = 5.5901699437494745e-01F; - float v180 = 1.5388417685876268e+00F; - float v181 = -1.5388417685876268e+00F; - float v188 = 5.8778525229247325e-01F; - float v189 = -5.8778525229247325e-01F; - float v196 = 3.6327126400268028e-01F; - float v197 = -3.6327126400268028e-01F; - float32x2_t v199 = (float32x2_t){v4, v4}; - const float32x2_t *v595 = &v5[istride]; - int32_t *v650 = &v6[ostride]; - float32x2_t v172 = (float32x2_t){v171, v171}; - float32x2_t v177 = (float32x2_t){v176, v176}; - float32x2_t v182 = (float32x2_t){v180, v181}; - float32x2_t v190 = (float32x2_t){v188, v189}; - float32x2_t v198 = (float32x2_t){v196, v197}; - const float32x2_t *v532 = &v5[0]; - int32_t *v623 = &v6[0]; - float32x4_t v722 = vld1q_f32((const float32_t *)v595); - float32x4_t v173 = vcombine_f32(v172, v172); - float32x4_t v178 = vcombine_f32(v177, v177); - float32x2_t v184 = vmul_f32(v199, v182); - float32x2_t v192 = vmul_f32(v199, v190); - float32x2_t v200 = vmul_f32(v199, v198); - const float32x2_t *v541 = &v5[istride * 5]; - const float32x2_t *v550 = &v5[istride * 2]; - const float32x2_t *v559 = &v5[istride * 7]; - const float32x2_t *v568 = &v5[istride * 4]; - const float32x2_t *v577 = &v5[istride * 9]; - const float32x2_t *v586 = &v5[istride * 6]; - const float32x2_t *v604 = &v5[istride * 8]; - const 
float32x2_t *v613 = &v5[istride * 3]; - int32_t *v632 = &v6[ostride * 5]; - int32_t *v641 = &v6[ostride * 6]; - int32_t *v659 = &v6[ostride * 2]; - int32_t *v668 = &v6[ostride * 7]; - int32_t *v677 = &v6[ostride * 8]; - int32_t *v686 = &v6[ostride * 3]; - int32_t *v695 = &v6[ostride * 4]; - int32_t *v704 = &v6[ostride * 9]; - float32x4_t v708 = vld1q_f32((const float32_t *)v532); - float32x4_t v186 = vcombine_f32(v184, v184); - float32x4_t v194 = vcombine_f32(v192, v192); - float32x4_t v202 = vcombine_f32(v200, v200); - float32x4_t v710 = vld1q_f32((const float32_t *)v541); - float32x4_t v712 = vld1q_f32((const float32_t *)v550); - float32x4_t v714 = vld1q_f32((const float32_t *)v559); - float32x4_t v716 = vld1q_f32((const float32_t *)v568); - float32x4_t v718 = vld1q_f32((const float32_t *)v577); - float32x4_t v720 = vld1q_f32((const float32_t *)v586); - float32x4_t v724 = vld1q_f32((const float32_t *)v604); - float32x4_t v726 = vld1q_f32((const float32_t *)v613); - float32x4_t v35 = vaddq_f32(v708, v710); - float32x4_t v36 = vsubq_f32(v708, v710); - float32x4_t v51 = vaddq_f32(v712, v714); - float32x4_t v52 = vsubq_f32(v712, v714); - float32x4_t v67 = vaddq_f32(v716, v718); - float32x4_t v68 = vsubq_f32(v716, v718); - float32x4_t v83 = vaddq_f32(v720, v722); - float32x4_t v84 = vsubq_f32(v720, v722); - float32x4_t v99 = vaddq_f32(v724, v726); - float32x4_t v100 = vsubq_f32(v724, v726); - float32x4_t v101 = vaddq_f32(v51, v99); - float32x4_t v102 = vsubq_f32(v51, v99); - float32x4_t v103 = vaddq_f32(v83, v67); - float32x4_t v104 = vsubq_f32(v83, v67); - float32x4_t v157 = vaddq_f32(v52, v100); - float32x4_t v158 = vsubq_f32(v52, v100); - float32x4_t v159 = vaddq_f32(v84, v68); - float32x4_t v160 = vsubq_f32(v84, v68); - float32x4_t v105 = vaddq_f32(v101, v103); - float32x4_t v106 = vsubq_f32(v101, v103); - float32x4_t v107 = vaddq_f32(v102, v104); - float32x4_t v129 = vrev64q_f32(v102); - float32x4_t v145 = vrev64q_f32(v104); - float32x4_t v161 = vaddq_f32(v157, 
v159); - float32x4_t v162 = vsubq_f32(v157, v159); - float32x4_t v163 = vaddq_f32(v158, v160); - float32x4_t v185 = vrev64q_f32(v158); - float32x4_t v201 = vrev64q_f32(v160); - float32x4_t v108 = vaddq_f32(v105, v35); - float32x4_t v118 = vmulq_f32(v105, v173); - float32x4_t v123 = vmulq_f32(v106, v178); - float32x4_t v131 = vmulq_f32(v129, v186); - float32x4_t v137 = vrev64q_f32(v107); - float32x4_t v147 = vmulq_f32(v145, v202); - float32x4_t v164 = vaddq_f32(v161, v36); - float32x4_t v174 = vmulq_f32(v161, v173); - float32x4_t v179 = vmulq_f32(v162, v178); - float32x4_t v187 = vmulq_f32(v185, v186); - float32x4_t v193 = vrev64q_f32(v163); - float32x4_t v203 = vmulq_f32(v201, v202); - float32x4_t v139 = vmulq_f32(v137, v194); - float32x4_t v148 = vaddq_f32(v108, v118); - float32x4_t v195 = vmulq_f32(v193, v194); - float32x4_t v204 = vaddq_f32(v164, v174); - int16x4_t v215 = vqmovn_s32(vcvtq_n_s32_f32(v108, 15)); - int16x4_t v223 = vqmovn_s32(vcvtq_n_s32_f32(v164, 15)); - float32x4_t v149 = vaddq_f32(v148, v123); - float32x4_t v150 = vsubq_f32(v148, v123); - float32x4_t v151 = vsubq_f32(v131, v139); - float32x4_t v152 = vaddq_f32(v139, v147); - float32x4_t v205 = vaddq_f32(v204, v179); - float32x4_t v206 = vsubq_f32(v204, v179); - float32x4_t v207 = vsubq_f32(v187, v195); - float32x4_t v208 = vaddq_f32(v195, v203); - vst1_s16((int16_t *)v623, v215); - vst1_s16((int16_t *)v632, v223); - float32x4_t v153 = vaddq_f32(v149, v151); - float32x4_t v154 = vsubq_f32(v149, v151); - float32x4_t v155 = vaddq_f32(v150, v152); - float32x4_t v156 = vsubq_f32(v150, v152); - float32x4_t v209 = vaddq_f32(v205, v207); - float32x4_t v210 = vsubq_f32(v205, v207); - float32x4_t v211 = vaddq_f32(v206, v208); - float32x4_t v212 = vsubq_f32(v206, v208); - int16x4_t v231 = vqmovn_s32(vcvtq_n_s32_f32(v154, 15)); - int16x4_t v239 = vqmovn_s32(vcvtq_n_s32_f32(v210, 15)); - int16x4_t v247 = vqmovn_s32(vcvtq_n_s32_f32(v156, 15)); - int16x4_t v255 = vqmovn_s32(vcvtq_n_s32_f32(v212, 15)); - 
int16x4_t v263 = vqmovn_s32(vcvtq_n_s32_f32(v155, 15)); - int16x4_t v271 = vqmovn_s32(vcvtq_n_s32_f32(v211, 15)); - int16x4_t v279 = vqmovn_s32(vcvtq_n_s32_f32(v153, 15)); - int16x4_t v287 = vqmovn_s32(vcvtq_n_s32_f32(v209, 15)); - vst1_s16((int16_t *)v641, v231); - vst1_s16((int16_t *)v650, v239); - vst1_s16((int16_t *)v659, v247); - vst1_s16((int16_t *)v668, v255); - vst1_s16((int16_t *)v677, v263); - vst1_s16((int16_t *)v686, v271); - vst1_s16((int16_t *)v695, v279); - vst1_s16((int16_t *)v704, v287); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v293 * 2; j < howmany; j += 1) { - float32x2_t v346 = v5[istride]; - float v424 = -1.2500000000000000e+00F; - float v428 = 5.5901699437494745e-01F; - float v431 = 1.5388417685876268e+00F; - float v432 = -1.5388417685876268e+00F; - float v438 = 5.8778525229247325e-01F; - float v439 = -5.8778525229247325e-01F; - float v445 = 3.6327126400268028e-01F; - float v446 = -3.6327126400268028e-01F; - float32x2_t v448 = (float32x2_t){v4, v4}; - float32x2_t v305 = v5[0]; - float32x2_t v425 = (float32x2_t){v424, v424}; - float32x2_t v429 = (float32x2_t){v428, v428}; - float32x2_t v433 = (float32x2_t){v431, v432}; - float32x2_t v440 = (float32x2_t){v438, v439}; - float32x2_t v447 = (float32x2_t){v445, v446}; - float32x2_t v310 = v5[istride * 5]; - float32x2_t v317 = v5[istride * 2]; - float32x2_t v322 = v5[istride * 7]; - float32x2_t v329 = v5[istride * 4]; - float32x2_t v334 = v5[istride * 9]; - float32x2_t v341 = v5[istride * 6]; - float32x2_t v353 = v5[istride * 8]; - float32x2_t v358 = v5[istride * 3]; - float32x2_t v435 = vmul_f32(v448, v433); - float32x2_t v442 = vmul_f32(v448, v440); - float32x2_t v449 = vmul_f32(v448, v447); - float32x2_t v311 = vadd_f32(v305, v310); - float32x2_t v312 = vsub_f32(v305, v310); - float32x2_t v323 = vadd_f32(v317, v322); - float32x2_t v324 = vsub_f32(v317, v322); - float32x2_t v335 = vadd_f32(v329, v334); - float32x2_t v336 = vsub_f32(v329, v334); - float32x2_t v347 = vadd_f32(v341, v346); - 
float32x2_t v348 = vsub_f32(v341, v346); - float32x2_t v359 = vadd_f32(v353, v358); - float32x2_t v360 = vsub_f32(v353, v358); - float32x2_t v361 = vadd_f32(v323, v359); - float32x2_t v362 = vsub_f32(v323, v359); - float32x2_t v363 = vadd_f32(v347, v335); - float32x2_t v364 = vsub_f32(v347, v335); - float32x2_t v411 = vadd_f32(v324, v360); - float32x2_t v412 = vsub_f32(v324, v360); - float32x2_t v413 = vadd_f32(v348, v336); - float32x2_t v414 = vsub_f32(v348, v336); - float32x2_t v365 = vadd_f32(v361, v363); - float32x2_t v366 = vsub_f32(v361, v363); - float32x2_t v367 = vadd_f32(v362, v364); - float32x2_t v386 = vrev64_f32(v362); - float32x2_t v400 = vrev64_f32(v364); - float32x2_t v415 = vadd_f32(v411, v413); - float32x2_t v416 = vsub_f32(v411, v413); - float32x2_t v417 = vadd_f32(v412, v414); - float32x2_t v436 = vrev64_f32(v412); - float32x2_t v450 = vrev64_f32(v414); - float32x2_t v368 = vadd_f32(v365, v311); - float32x2_t v376 = vmul_f32(v365, v425); - float32x2_t v380 = vmul_f32(v366, v429); - float32x2_t v387 = vmul_f32(v386, v435); - float32x2_t v393 = vrev64_f32(v367); - float32x2_t v401 = vmul_f32(v400, v449); - float32x2_t v418 = vadd_f32(v415, v312); - float32x2_t v426 = vmul_f32(v415, v425); - float32x2_t v430 = vmul_f32(v416, v429); - float32x2_t v437 = vmul_f32(v436, v435); - float32x2_t v443 = vrev64_f32(v417); - float32x2_t v451 = vmul_f32(v450, v449); - float32x2_t v394 = vmul_f32(v393, v442); - float32x2_t v402 = vadd_f32(v368, v376); - float32x2_t v444 = vmul_f32(v443, v442); - float32x2_t v452 = vadd_f32(v418, v426); - int16x4_t v463 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v368, 15), (int32x2_t){0, 0})); - int16x4_t v469 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v418, 15), (int32x2_t){0, 0})); - float32x2_t v403 = vadd_f32(v402, v380); - float32x2_t v404 = vsub_f32(v402, v380); - float32x2_t v405 = vsub_f32(v387, v394); - float32x2_t v406 = vadd_f32(v394, v401); - float32x2_t v453 = vadd_f32(v452, v430); - float32x2_t v454 = vsub_f32(v452, 
v430); - float32x2_t v455 = vsub_f32(v437, v444); - float32x2_t v456 = vadd_f32(v444, v451); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v463), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v469), 0); - float32x2_t v407 = vadd_f32(v403, v405); - float32x2_t v408 = vsub_f32(v403, v405); - float32x2_t v409 = vadd_f32(v404, v406); - float32x2_t v410 = vsub_f32(v404, v406); - float32x2_t v457 = vadd_f32(v453, v455); - float32x2_t v458 = vsub_f32(v453, v455); - float32x2_t v459 = vadd_f32(v454, v456); - float32x2_t v460 = vsub_f32(v454, v456); - int16x4_t v475 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v408, 15), (int32x2_t){0, 0})); - int16x4_t v481 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v458, 15), (int32x2_t){0, 0})); - int16x4_t v487 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v410, 15), (int32x2_t){0, 0})); - int16x4_t v493 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v460, 15), (int32x2_t){0, 0})); - int16x4_t v499 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v409, 15), (int32x2_t){0, 0})); - int16x4_t v505 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v459, 15), (int32x2_t){0, 0})); - int16x4_t v511 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v407, 15), (int32x2_t){0, 0})); - int16x4_t v517 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v457, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v475), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v481), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v487), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v493), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v499), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v505), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v511), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v517), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v61 = v5[istride]; + float v139 = -1.2500000000000000e+00F; + float v143 = 5.5901699437494745e-01F; + float v146 = 
1.5388417685876268e+00F; + float v147 = -1.5388417685876268e+00F; + float v153 = 5.8778525229247325e-01F; + float v154 = -5.8778525229247325e-01F; + float v160 = 3.6327126400268028e-01F; + float v161 = -3.6327126400268028e-01F; + float32x2_t v163 = (float32x2_t){v4, v4}; + float32x2_t v20 = v5[0]; + float32x2_t v140 = (float32x2_t){v139, v139}; + float32x2_t v144 = (float32x2_t){v143, v143}; + float32x2_t v148 = (float32x2_t){v146, v147}; + float32x2_t v155 = (float32x2_t){v153, v154}; + float32x2_t v162 = (float32x2_t){v160, v161}; + float32x2_t v25 = v5[istride * 5]; + float32x2_t v32 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 7]; + float32x2_t v44 = v5[istride * 4]; + float32x2_t v49 = v5[istride * 9]; + float32x2_t v56 = v5[istride * 6]; + float32x2_t v68 = v5[istride * 8]; + float32x2_t v73 = v5[istride * 3]; + float32x2_t v150 = vmul_f32(v163, v148); + float32x2_t v157 = vmul_f32(v163, v155); + float32x2_t v164 = vmul_f32(v163, v162); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v68, v73); + float32x2_t v76 = vadd_f32(v38, v74); + float32x2_t v77 = vsub_f32(v38, v74); + float32x2_t v78 = vadd_f32(v62, v50); + float32x2_t v79 = vsub_f32(v62, v50); + float32x2_t v126 = vadd_f32(v39, v75); + float32x2_t v127 = vsub_f32(v39, v75); + float32x2_t v128 = vadd_f32(v63, v51); + float32x2_t v129 = vsub_f32(v63, v51); + float32x2_t v80 = vadd_f32(v76, v78); + float32x2_t v81 = vsub_f32(v76, v78); + float32x2_t v82 = vadd_f32(v77, v79); + float32x2_t v101 = vrev64_f32(v77); + float32x2_t v115 = vrev64_f32(v79); + float32x2_t v130 = vadd_f32(v126, v128); + float32x2_t v131 = vsub_f32(v126, v128); + float32x2_t v132 = 
vadd_f32(v127, v129); + float32x2_t v151 = vrev64_f32(v127); + float32x2_t v165 = vrev64_f32(v129); + float32x2_t v83 = vadd_f32(v80, v26); + float32x2_t v91 = vmul_f32(v80, v140); + float32x2_t v95 = vmul_f32(v81, v144); + float32x2_t v102 = vmul_f32(v101, v150); + float32x2_t v108 = vrev64_f32(v82); + float32x2_t v116 = vmul_f32(v115, v164); + float32x2_t v133 = vadd_f32(v130, v27); + float32x2_t v141 = vmul_f32(v130, v140); + float32x2_t v145 = vmul_f32(v131, v144); + float32x2_t v152 = vmul_f32(v151, v150); + float32x2_t v158 = vrev64_f32(v132); + float32x2_t v166 = vmul_f32(v165, v164); + float32x2_t v109 = vmul_f32(v108, v157); + float32x2_t v117 = vadd_f32(v83, v91); + float32x2_t v159 = vmul_f32(v158, v157); + float32x2_t v167 = vadd_f32(v133, v141); + int16x4_t v178 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v83, 15), (int32x2_t){0, 0})); + int16x4_t v184 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v133, 15), (int32x2_t){0, 0})); + float32x2_t v118 = vadd_f32(v117, v95); + float32x2_t v119 = vsub_f32(v117, v95); + float32x2_t v120 = vsub_f32(v102, v109); + float32x2_t v121 = vadd_f32(v109, v116); + float32x2_t v168 = vadd_f32(v167, v145); + float32x2_t v169 = vsub_f32(v167, v145); + float32x2_t v170 = vsub_f32(v152, v159); + float32x2_t v171 = vadd_f32(v159, v166); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v178), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v184), 0); + float32x2_t v122 = vadd_f32(v118, v120); + float32x2_t v123 = vsub_f32(v118, v120); + float32x2_t v124 = vadd_f32(v119, v121); + float32x2_t v125 = vsub_f32(v119, v121); + float32x2_t v172 = vadd_f32(v168, v170); + float32x2_t v173 = vsub_f32(v168, v170); + float32x2_t v174 = vadd_f32(v169, v171); + float32x2_t v175 = vsub_f32(v169, v171); + int16x4_t v190 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v123, 15), (int32x2_t){0, 0})); + int16x4_t v196 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v173, 15), (int32x2_t){0, 0})); + int16x4_t v202 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v125, 15), (int32x2_t){0, 0})); + int16x4_t v208 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v175, 15), (int32x2_t){0, 0})); + int16x4_t v214 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v124, 15), (int32x2_t){0, 0})); + int16x4_t v220 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v174, 15), (int32x2_t){0, 0})); + int16x4_t v226 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v122, 15), (int32x2_t){0, 0})); + int16x4_t v232 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v172, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v190), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v196), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v202), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v208), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v214), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v220), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v226), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v232), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -2670,77 +1774,45 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu10(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v363)[0])); svfloat32_t v497 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v372)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v479), "w"(v481)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v479), "w"(v481)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v483), "w"(v485)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v483), "w"(v485)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v487), "w"(v489)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v487), "w"(v489)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v491), "w"(v493)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v81) : "w"(v491), "w"(v493)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v495), "w"(v497)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v495), "w"(v497)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v48), "w"(v96)); - svfloat32_t v99; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v48), "w"(v96)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v80), "w"(v64)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v80), "w"(v64)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v49), "w"(v97)); - svfloat32_t v152; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v49), "w"(v97)); - svfloat32_t v153; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v81), "w"(v65)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v81), "w"(v65)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v98), "w"(v100)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v98), "w"(v100)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v99), "w"(v101)); - svfloat32_t zero127; - asm volatile("mov %0.s, #0" : "=w"(zero127)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v479, v481); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v479, v481); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v483, v485); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v487, v489); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v491, v493); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v495, v497); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v495, v497); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v48, v96); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v48, v96); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v80, 
v64); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v80, v64); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v49, v97); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v49, v97); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v81, v65); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v81, v65); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v98, v100); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v99, v101); + svfloat32_t zero127 = svdup_n_f32(0); svfloat32_t v127 = svcmla_f32_x(pred_full, zero127, v384, v99, 90); - svfloat32_t v155; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v151), "w"(v153)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v151), "w"(v153)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v152), "w"(v154)); - svfloat32_t zero180; - asm volatile("mov %0.s, #0" : "=w"(zero180)); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v152, v154); + svfloat32_t zero180 = svdup_n_f32(0); svfloat32_t v180 = svcmla_f32_x(pred_full, zero180, v384, v152, 90); - svfloat32_t v105; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v102), "w"(v32)); - svfloat32_t zero134; - asm volatile("mov %0.s, #0" : "=w"(zero134)); + svfloat32_t v105 = svadd_f32_x(svptrue_b32(), v102, v32); + svfloat32_t zero134 = svdup_n_f32(0); svfloat32_t v134 = svcmla_f32_x(pred_full, zero134, v385, v104, 90); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v155), "w"(v33)); - svfloat32_t zero187; - asm volatile("mov %0.s, #0" : "=w"(zero187)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v155, v33); + svfloat32_t zero187 = svdup_n_f32(0); svfloat32_t v187 = svcmla_f32_x(pred_full, zero187, v385, v157, 90); svfloat32_t v142 = svmla_f32_x(pred_full, v105, v102, v382); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : 
"w"(v127), "w"(v134)); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v127, v134); svfloat32_t v146 = svcmla_f32_x(pred_full, v134, v386, v101, 90); svfloat32_t v195 = svmla_f32_x(pred_full, v158, v155, v382); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v180), "w"(v187)); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v180, v187); svfloat32_t v199 = svcmla_f32_x(pred_full, v187, v386, v154, 90); svint16_t v206 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -2758,22 +1830,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu10(const armral_cmplx_f32_t *restrict x, svfloat32_t v197 = svmls_f32_x(pred_full, v195, v156, v383); svst1w_u64(pred_full, (unsigned *)(v394), svreinterpret_u64_s16(v206)); svst1w_u64(pred_full, (unsigned *)(v403), svreinterpret_u64_s16(v214)); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v143), "w"(v145)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v143), "w"(v145)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v144), "w"(v146)); - svfloat32_t v150; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v196), "w"(v198)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v196), "w"(v198)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v197), "w"(v199)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v197), "w"(v199)); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v197, v199); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v197, v199); 
svint16_t v222 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v148, (float)(1ULL << 31ULL)))), @@ -2836,482 +1900,226 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v405 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v141 = 1.1000000000000001e+00F; - float v145 = 3.3166247903554003e-01F; - float v146 = -3.3166247903554003e-01F; - float v154 = 5.1541501300188641e-01F; - float v159 = 9.4125353283118118e-01F; - float v164 = 1.4143537075597825e+00F; - float v169 = 8.5949297361449750e-01F; - float v174 = 4.2314838273285138e-02F; - float v179 = 3.8639279888589606e-01F; - float v184 = 5.1254589567200015e-01F; - float v189 = 1.0702757469471715e+00F; - float v194 = 5.5486073394528512e-01F; - float v198 = 1.2412944743900585e+00F; - float v199 = -1.2412944743900585e+00F; - float v206 = 2.0897833842005756e-01F; - float v207 = -2.0897833842005756e-01F; - float v214 = 3.7415717312460811e-01F; - float v215 = -3.7415717312460811e-01F; - float v222 = 4.9929922194110327e-02F; - float v223 = -4.9929922194110327e-02F; - float v230 = 6.5815896284539266e-01F; - float v231 = -6.5815896284539266e-01F; - float v238 = 6.3306543373877577e-01F; - float v239 = -6.3306543373877577e-01F; - float v246 = 1.0822460581641109e+00F; - float v247 = -1.0822460581641109e+00F; - float v254 = 8.1720737907134022e-01F; - float v255 = -8.1720737907134022e-01F; - float v262 = 4.2408709531871824e-01F; - float v263 = -4.2408709531871824e-01F; - float32x2_t v265 = (float32x2_t){v4, v4}; - const float32x2_t *v742 = &v5[istride]; - int32_t *v933 = &v6[ostride]; - float32x2_t v142 = (float32x2_t){v141, v141}; - float32x2_t v147 = (float32x2_t){v145, v146}; - float32x2_t v155 = (float32x2_t){v154, v154}; - float32x2_t v160 = (float32x2_t){v159, v159}; - float32x2_t v165 = (float32x2_t){v164, v164}; - 
float32x2_t v170 = (float32x2_t){v169, v169}; - float32x2_t v175 = (float32x2_t){v174, v174}; - float32x2_t v180 = (float32x2_t){v179, v179}; - float32x2_t v185 = (float32x2_t){v184, v184}; - float32x2_t v190 = (float32x2_t){v189, v189}; - float32x2_t v195 = (float32x2_t){v194, v194}; - float32x2_t v200 = (float32x2_t){v198, v199}; - float32x2_t v208 = (float32x2_t){v206, v207}; - float32x2_t v216 = (float32x2_t){v214, v215}; - float32x2_t v224 = (float32x2_t){v222, v223}; - float32x2_t v232 = (float32x2_t){v230, v231}; - float32x2_t v240 = (float32x2_t){v238, v239}; - float32x2_t v248 = (float32x2_t){v246, v247}; - float32x2_t v256 = (float32x2_t){v254, v255}; - float32x2_t v264 = (float32x2_t){v262, v263}; - const float32x2_t *v833 = &v5[0]; - int32_t *v843 = &v6[0]; - float32x4_t v937 = vld1q_f32((const float32_t *)v742); - float32x4_t v143 = vcombine_f32(v142, v142); - float32x2_t v149 = vmul_f32(v265, v147); - float32x4_t v156 = vcombine_f32(v155, v155); - float32x4_t v161 = vcombine_f32(v160, v160); - float32x4_t v166 = vcombine_f32(v165, v165); - float32x4_t v171 = vcombine_f32(v170, v170); - float32x4_t v176 = vcombine_f32(v175, v175); - float32x4_t v181 = vcombine_f32(v180, v180); - float32x4_t v186 = vcombine_f32(v185, v185); - float32x4_t v191 = vcombine_f32(v190, v190); - float32x4_t v196 = vcombine_f32(v195, v195); - float32x2_t v202 = vmul_f32(v265, v200); - float32x2_t v210 = vmul_f32(v265, v208); - float32x2_t v218 = vmul_f32(v265, v216); - float32x2_t v226 = vmul_f32(v265, v224); - float32x2_t v234 = vmul_f32(v265, v232); - float32x2_t v242 = vmul_f32(v265, v240); - float32x2_t v250 = vmul_f32(v265, v248); - float32x2_t v258 = vmul_f32(v265, v256); - float32x2_t v266 = vmul_f32(v265, v264); - const float32x2_t *v751 = &v5[istride * 10]; - const float32x2_t *v760 = &v5[istride * 2]; - const float32x2_t *v769 = &v5[istride * 9]; - const float32x2_t *v778 = &v5[istride * 3]; - const float32x2_t *v787 = &v5[istride * 8]; - const float32x2_t *v796 = 
&v5[istride * 4]; - const float32x2_t *v805 = &v5[istride * 7]; - const float32x2_t *v814 = &v5[istride * 5]; - const float32x2_t *v823 = &v5[istride * 6]; - int32_t *v852 = &v6[ostride * 10]; - int32_t *v861 = &v6[ostride * 9]; - int32_t *v870 = &v6[ostride * 8]; - int32_t *v879 = &v6[ostride * 7]; - int32_t *v888 = &v6[ostride * 6]; - int32_t *v897 = &v6[ostride * 5]; - int32_t *v906 = &v6[ostride * 4]; - int32_t *v915 = &v6[ostride * 3]; - int32_t *v924 = &v6[ostride * 2]; - float32x4_t v957 = vld1q_f32((const float32_t *)v833); - float32x4_t v151 = vcombine_f32(v149, v149); - float32x4_t v204 = vcombine_f32(v202, v202); - float32x4_t v212 = vcombine_f32(v210, v210); - float32x4_t v220 = vcombine_f32(v218, v218); - float32x4_t v228 = vcombine_f32(v226, v226); - float32x4_t v236 = vcombine_f32(v234, v234); - float32x4_t v244 = vcombine_f32(v242, v242); - float32x4_t v252 = vcombine_f32(v250, v250); - float32x4_t v260 = vcombine_f32(v258, v258); - float32x4_t v268 = vcombine_f32(v266, v266); - float32x4_t v939 = vld1q_f32((const float32_t *)v751); - float32x4_t v941 = vld1q_f32((const float32_t *)v760); - float32x4_t v943 = vld1q_f32((const float32_t *)v769); - float32x4_t v945 = vld1q_f32((const float32_t *)v778); - float32x4_t v947 = vld1q_f32((const float32_t *)v787); - float32x4_t v949 = vld1q_f32((const float32_t *)v796); - float32x4_t v951 = vld1q_f32((const float32_t *)v805); - float32x4_t v953 = vld1q_f32((const float32_t *)v814); - float32x4_t v955 = vld1q_f32((const float32_t *)v823); - float32x4_t v35 = vaddq_f32(v937, v939); - float32x4_t v50 = vaddq_f32(v941, v943); - float32x4_t v65 = vaddq_f32(v945, v947); - float32x4_t v80 = vaddq_f32(v949, v951); - float32x4_t v95 = vaddq_f32(v953, v955); - float32x4_t v96 = vsubq_f32(v937, v939); - float32x4_t v97 = vsubq_f32(v941, v943); - float32x4_t v98 = vsubq_f32(v945, v947); - float32x4_t v99 = vsubq_f32(v949, v951); - float32x4_t v100 = vsubq_f32(v953, v955); - float32x4_t v101 = vaddq_f32(v35, v50); - 
float32x4_t v102 = vaddq_f32(v65, v95); - float32x4_t v104 = vsubq_f32(v97, v98); - float32x4_t v105 = vaddq_f32(v96, v100); - float32x4_t v117 = vsubq_f32(v50, v80); - float32x4_t v118 = vsubq_f32(v35, v80); - float32x4_t v119 = vsubq_f32(v50, v35); - float32x4_t v120 = vsubq_f32(v95, v80); - float32x4_t v121 = vsubq_f32(v65, v80); - float32x4_t v122 = vsubq_f32(v95, v65); - float32x4_t v123 = vsubq_f32(v50, v95); - float32x4_t v124 = vsubq_f32(v35, v65); - float32x4_t v126 = vaddq_f32(v97, v99); - float32x4_t v127 = vsubq_f32(v96, v99); - float32x4_t v128 = vaddq_f32(v96, v97); - float32x4_t v129 = vsubq_f32(v99, v100); - float32x4_t v130 = vsubq_f32(v98, v99); - float32x4_t v131 = vsubq_f32(v98, v100); - float32x4_t v132 = vaddq_f32(v97, v100); - float32x4_t v133 = vsubq_f32(v96, v98); - float32x4_t v103 = vaddq_f32(v80, v101); - float32x4_t v115 = vsubq_f32(v104, v105); - float32x4_t v125 = vsubq_f32(v102, v101); - float32x4_t v134 = vaddq_f32(v104, v105); - float32x4_t v157 = vmulq_f32(v117, v156); - float32x4_t v162 = vmulq_f32(v118, v161); - float32x4_t v167 = vmulq_f32(v119, v166); - float32x4_t v172 = vmulq_f32(v120, v171); - float32x4_t v177 = vmulq_f32(v121, v176); - float32x4_t v182 = vmulq_f32(v122, v181); - float32x4_t v187 = vmulq_f32(v123, v186); - float32x4_t v192 = vmulq_f32(v124, v191); - float32x4_t v203 = vrev64q_f32(v126); - float32x4_t v211 = vrev64q_f32(v127); - float32x4_t v219 = vrev64q_f32(v128); - float32x4_t v227 = vrev64q_f32(v129); - float32x4_t v235 = vrev64q_f32(v130); - float32x4_t v243 = vrev64q_f32(v131); - float32x4_t v251 = vrev64q_f32(v132); - float32x4_t v259 = vrev64q_f32(v133); - float32x4_t v106 = vaddq_f32(v103, v102); - float32x4_t v116 = vsubq_f32(v115, v99); - float32x4_t v197 = vmulq_f32(v125, v196); - float32x4_t v205 = vmulq_f32(v203, v204); - float32x4_t v213 = vmulq_f32(v211, v212); - float32x4_t v221 = vmulq_f32(v219, v220); - float32x4_t v229 = vmulq_f32(v227, v228); - float32x4_t v237 = vmulq_f32(v235, v236); - 
float32x4_t v245 = vmulq_f32(v243, v244); - float32x4_t v253 = vmulq_f32(v251, v252); - float32x4_t v261 = vmulq_f32(v259, v260); - float32x4_t v267 = vrev64q_f32(v134); - float32x4_t v271 = vaddq_f32(v157, v162); - float32x4_t v272 = vaddq_f32(v162, v167); - float32x4_t v273 = vsubq_f32(v157, v167); - float32x4_t v274 = vaddq_f32(v172, v177); - float32x4_t v275 = vaddq_f32(v177, v182); - float32x4_t v276 = vsubq_f32(v172, v182); - float32x4_t v114 = vaddq_f32(v957, v106); - float32x4_t v144 = vmulq_f32(v106, v143); - float32x4_t v150 = vrev64q_f32(v116); - float32x4_t v269 = vmulq_f32(v267, v268); - float32x4_t v277 = vaddq_f32(v192, v197); - float32x4_t v278 = vaddq_f32(v187, v197); - float32x4_t v279 = vaddq_f32(v213, v221); - float32x4_t v280 = vsubq_f32(v205, v221); - float32x4_t v281 = vaddq_f32(v237, v245); - float32x4_t v282 = vsubq_f32(v229, v245); - float32x4_t v152 = vmulq_f32(v150, v151); - float32x4_t v270 = vsubq_f32(v114, v144); - float32x4_t v283 = vaddq_f32(v261, v269); - float32x4_t v284 = vsubq_f32(v253, v269); - float32x4_t v285 = vaddq_f32(v275, v277); - float32x4_t v303 = vaddq_f32(v279, v280); - int16x4_t v319 = vqmovn_s32(vcvtq_n_s32_f32(v114, 15)); - float32x4_t v286 = vaddq_f32(v285, v270); - float32x4_t v287 = vsubq_f32(v270, v272); - float32x4_t v289 = vaddq_f32(v270, v276); - float32x4_t v291 = vsubq_f32(v270, v273); - float32x4_t v293 = vaddq_f32(v270, v271); - float32x4_t v295 = vaddq_f32(v152, v281); - float32x4_t v297 = vsubq_f32(v283, v279); - float32x4_t v299 = vaddq_f32(v152, v284); - float32x4_t v301 = vsubq_f32(v284, v280); - float32x4_t v304 = vaddq_f32(v303, v281); - vst1_s16((int16_t *)v843, v319); - float32x4_t v288 = vsubq_f32(v287, v277); - float32x4_t v290 = vaddq_f32(v289, v278); - float32x4_t v292 = vsubq_f32(v291, v278); - float32x4_t v294 = vsubq_f32(v293, v274); - float32x4_t v296 = vaddq_f32(v295, v283); - float32x4_t v298 = vsubq_f32(v297, v152); - float32x4_t v300 = vaddq_f32(v299, v282); - float32x4_t v302 = 
vsubq_f32(v301, v152); - float32x4_t v305 = vaddq_f32(v304, v282); - float32x4_t v306 = vsubq_f32(v305, v152); - float32x4_t v308 = vaddq_f32(v286, v296); - float32x4_t v309 = vaddq_f32(v288, v298); - float32x4_t v310 = vsubq_f32(v290, v300); - float32x4_t v311 = vaddq_f32(v292, v302); - float32x4_t v312 = vsubq_f32(v292, v302); - float32x4_t v313 = vaddq_f32(v290, v300); - float32x4_t v314 = vsubq_f32(v288, v298); - float32x4_t v315 = vsubq_f32(v286, v296); - float32x4_t v307 = vaddq_f32(v294, v306); - float32x4_t v316 = vsubq_f32(v294, v306); - int16x4_t v335 = vqmovn_s32(vcvtq_n_s32_f32(v308, 15)); - int16x4_t v343 = vqmovn_s32(vcvtq_n_s32_f32(v309, 15)); - int16x4_t v351 = vqmovn_s32(vcvtq_n_s32_f32(v310, 15)); - int16x4_t v359 = vqmovn_s32(vcvtq_n_s32_f32(v311, 15)); - int16x4_t v367 = vqmovn_s32(vcvtq_n_s32_f32(v312, 15)); - int16x4_t v375 = vqmovn_s32(vcvtq_n_s32_f32(v313, 15)); - int16x4_t v383 = vqmovn_s32(vcvtq_n_s32_f32(v314, 15)); - int16x4_t v391 = vqmovn_s32(vcvtq_n_s32_f32(v315, 15)); - int16x4_t v327 = vqmovn_s32(vcvtq_n_s32_f32(v307, 15)); - int16x4_t v399 = vqmovn_s32(vcvtq_n_s32_f32(v316, 15)); - vst1_s16((int16_t *)v861, v335); - vst1_s16((int16_t *)v870, v343); - vst1_s16((int16_t *)v879, v351); - vst1_s16((int16_t *)v888, v359); - vst1_s16((int16_t *)v897, v367); - vst1_s16((int16_t *)v906, v375); - vst1_s16((int16_t *)v915, v383); - vst1_s16((int16_t *)v924, v391); - vst1_s16((int16_t *)v852, v327); - vst1_s16((int16_t *)v933, v399); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v405 * 2; j < howmany; j += 1) { - float32x2_t v417 = v5[istride]; - float v510 = 1.1000000000000001e+00F; - float v513 = 3.3166247903554003e-01F; - float v514 = -3.3166247903554003e-01F; - float v521 = 5.1541501300188641e-01F; - float v525 = 9.4125353283118118e-01F; - float v529 = 1.4143537075597825e+00F; - float v533 = 8.5949297361449750e-01F; - float v537 = 4.2314838273285138e-02F; - float v541 = 3.8639279888589606e-01F; - float v545 = 5.1254589567200015e-01F; - 
float v549 = 1.0702757469471715e+00F; - float v553 = 5.5486073394528512e-01F; - float v556 = 1.2412944743900585e+00F; - float v557 = -1.2412944743900585e+00F; - float v563 = 2.0897833842005756e-01F; - float v564 = -2.0897833842005756e-01F; - float v570 = 3.7415717312460811e-01F; - float v571 = -3.7415717312460811e-01F; - float v577 = 4.9929922194110327e-02F; - float v578 = -4.9929922194110327e-02F; - float v584 = 6.5815896284539266e-01F; - float v585 = -6.5815896284539266e-01F; - float v591 = 6.3306543373877577e-01F; - float v592 = -6.3306543373877577e-01F; - float v598 = 1.0822460581641109e+00F; - float v599 = -1.0822460581641109e+00F; - float v605 = 8.1720737907134022e-01F; - float v606 = -8.1720737907134022e-01F; - float v612 = 4.2408709531871824e-01F; - float v613 = -4.2408709531871824e-01F; - float32x2_t v615 = (float32x2_t){v4, v4}; - float32x2_t v483 = v5[0]; - float32x2_t v511 = (float32x2_t){v510, v510}; - float32x2_t v515 = (float32x2_t){v513, v514}; - float32x2_t v522 = (float32x2_t){v521, v521}; - float32x2_t v526 = (float32x2_t){v525, v525}; - float32x2_t v530 = (float32x2_t){v529, v529}; - float32x2_t v534 = (float32x2_t){v533, v533}; - float32x2_t v538 = (float32x2_t){v537, v537}; - float32x2_t v542 = (float32x2_t){v541, v541}; - float32x2_t v546 = (float32x2_t){v545, v545}; - float32x2_t v550 = (float32x2_t){v549, v549}; - float32x2_t v554 = (float32x2_t){v553, v553}; - float32x2_t v558 = (float32x2_t){v556, v557}; - float32x2_t v565 = (float32x2_t){v563, v564}; - float32x2_t v572 = (float32x2_t){v570, v571}; - float32x2_t v579 = (float32x2_t){v577, v578}; - float32x2_t v586 = (float32x2_t){v584, v585}; - float32x2_t v593 = (float32x2_t){v591, v592}; - float32x2_t v600 = (float32x2_t){v598, v599}; - float32x2_t v607 = (float32x2_t){v605, v606}; - float32x2_t v614 = (float32x2_t){v612, v613}; - float32x2_t v422 = v5[istride * 10]; - float32x2_t v428 = v5[istride * 2]; - float32x2_t v433 = v5[istride * 9]; - float32x2_t v439 = v5[istride * 3]; - 
float32x2_t v444 = v5[istride * 8]; - float32x2_t v450 = v5[istride * 4]; - float32x2_t v455 = v5[istride * 7]; - float32x2_t v461 = v5[istride * 5]; - float32x2_t v466 = v5[istride * 6]; - float32x2_t v517 = vmul_f32(v615, v515); - float32x2_t v560 = vmul_f32(v615, v558); - float32x2_t v567 = vmul_f32(v615, v565); - float32x2_t v574 = vmul_f32(v615, v572); - float32x2_t v581 = vmul_f32(v615, v579); - float32x2_t v588 = vmul_f32(v615, v586); - float32x2_t v595 = vmul_f32(v615, v593); - float32x2_t v602 = vmul_f32(v615, v600); - float32x2_t v609 = vmul_f32(v615, v607); - float32x2_t v616 = vmul_f32(v615, v614); - float32x2_t v423 = vadd_f32(v417, v422); - float32x2_t v434 = vadd_f32(v428, v433); - float32x2_t v445 = vadd_f32(v439, v444); - float32x2_t v456 = vadd_f32(v450, v455); - float32x2_t v467 = vadd_f32(v461, v466); - float32x2_t v468 = vsub_f32(v417, v422); - float32x2_t v469 = vsub_f32(v428, v433); - float32x2_t v470 = vsub_f32(v439, v444); - float32x2_t v471 = vsub_f32(v450, v455); - float32x2_t v472 = vsub_f32(v461, v466); - float32x2_t v473 = vadd_f32(v423, v434); - float32x2_t v474 = vadd_f32(v445, v467); - float32x2_t v476 = vsub_f32(v469, v470); - float32x2_t v477 = vadd_f32(v468, v472); - float32x2_t v487 = vsub_f32(v434, v456); - float32x2_t v488 = vsub_f32(v423, v456); - float32x2_t v489 = vsub_f32(v434, v423); - float32x2_t v490 = vsub_f32(v467, v456); - float32x2_t v491 = vsub_f32(v445, v456); - float32x2_t v492 = vsub_f32(v467, v445); - float32x2_t v493 = vsub_f32(v434, v467); - float32x2_t v494 = vsub_f32(v423, v445); - float32x2_t v496 = vadd_f32(v469, v471); - float32x2_t v497 = vsub_f32(v468, v471); - float32x2_t v498 = vadd_f32(v468, v469); - float32x2_t v499 = vsub_f32(v471, v472); - float32x2_t v500 = vsub_f32(v470, v471); - float32x2_t v501 = vsub_f32(v470, v472); - float32x2_t v502 = vadd_f32(v469, v472); - float32x2_t v503 = vsub_f32(v468, v470); - float32x2_t v475 = vadd_f32(v456, v473); - float32x2_t v485 = vsub_f32(v476, v477); - 
float32x2_t v495 = vsub_f32(v474, v473); - float32x2_t v504 = vadd_f32(v476, v477); - float32x2_t v523 = vmul_f32(v487, v522); - float32x2_t v527 = vmul_f32(v488, v526); - float32x2_t v531 = vmul_f32(v489, v530); - float32x2_t v535 = vmul_f32(v490, v534); - float32x2_t v539 = vmul_f32(v491, v538); - float32x2_t v543 = vmul_f32(v492, v542); - float32x2_t v547 = vmul_f32(v493, v546); - float32x2_t v551 = vmul_f32(v494, v550); - float32x2_t v561 = vrev64_f32(v496); - float32x2_t v568 = vrev64_f32(v497); - float32x2_t v575 = vrev64_f32(v498); - float32x2_t v582 = vrev64_f32(v499); - float32x2_t v589 = vrev64_f32(v500); - float32x2_t v596 = vrev64_f32(v501); - float32x2_t v603 = vrev64_f32(v502); - float32x2_t v610 = vrev64_f32(v503); - float32x2_t v478 = vadd_f32(v475, v474); - float32x2_t v486 = vsub_f32(v485, v471); - float32x2_t v555 = vmul_f32(v495, v554); - float32x2_t v562 = vmul_f32(v561, v560); - float32x2_t v569 = vmul_f32(v568, v567); - float32x2_t v576 = vmul_f32(v575, v574); - float32x2_t v583 = vmul_f32(v582, v581); - float32x2_t v590 = vmul_f32(v589, v588); - float32x2_t v597 = vmul_f32(v596, v595); - float32x2_t v604 = vmul_f32(v603, v602); - float32x2_t v611 = vmul_f32(v610, v609); - float32x2_t v617 = vrev64_f32(v504); - float32x2_t v620 = vadd_f32(v523, v527); - float32x2_t v621 = vadd_f32(v527, v531); - float32x2_t v622 = vsub_f32(v523, v531); - float32x2_t v623 = vadd_f32(v535, v539); - float32x2_t v624 = vadd_f32(v539, v543); - float32x2_t v625 = vsub_f32(v535, v543); - float32x2_t v484 = vadd_f32(v483, v478); - float32x2_t v512 = vmul_f32(v478, v511); - float32x2_t v518 = vrev64_f32(v486); - float32x2_t v618 = vmul_f32(v617, v616); - float32x2_t v626 = vadd_f32(v551, v555); - float32x2_t v627 = vadd_f32(v547, v555); - float32x2_t v628 = vadd_f32(v569, v576); - float32x2_t v629 = vsub_f32(v562, v576); - float32x2_t v630 = vadd_f32(v590, v597); - float32x2_t v631 = vsub_f32(v583, v597); - float32x2_t v519 = vmul_f32(v518, v517); - float32x2_t v619 = 
vsub_f32(v484, v512); - float32x2_t v632 = vadd_f32(v611, v618); - float32x2_t v633 = vsub_f32(v604, v618); - float32x2_t v634 = vadd_f32(v624, v626); - float32x2_t v652 = vadd_f32(v628, v629); - int16x4_t v668 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v484, 15), (int32x2_t){0, 0})); - float32x2_t v635 = vadd_f32(v634, v619); - float32x2_t v636 = vsub_f32(v619, v621); - float32x2_t v638 = vadd_f32(v619, v625); - float32x2_t v640 = vsub_f32(v619, v622); - float32x2_t v642 = vadd_f32(v619, v620); - float32x2_t v644 = vadd_f32(v519, v630); - float32x2_t v646 = vsub_f32(v632, v628); - float32x2_t v648 = vadd_f32(v519, v633); - float32x2_t v650 = vsub_f32(v633, v629); - float32x2_t v653 = vadd_f32(v652, v630); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v668), 0); - float32x2_t v637 = vsub_f32(v636, v626); - float32x2_t v639 = vadd_f32(v638, v627); - float32x2_t v641 = vsub_f32(v640, v627); - float32x2_t v643 = vsub_f32(v642, v623); - float32x2_t v645 = vadd_f32(v644, v632); - float32x2_t v647 = vsub_f32(v646, v519); - float32x2_t v649 = vadd_f32(v648, v631); - float32x2_t v651 = vsub_f32(v650, v519); - float32x2_t v654 = vadd_f32(v653, v631); - float32x2_t v655 = vsub_f32(v654, v519); - float32x2_t v657 = vadd_f32(v635, v645); - float32x2_t v658 = vadd_f32(v637, v647); - float32x2_t v659 = vsub_f32(v639, v649); - float32x2_t v660 = vadd_f32(v641, v651); - float32x2_t v661 = vsub_f32(v641, v651); - float32x2_t v662 = vadd_f32(v639, v649); - float32x2_t v663 = vsub_f32(v637, v647); - float32x2_t v664 = vsub_f32(v635, v645); - float32x2_t v656 = vadd_f32(v643, v655); - float32x2_t v665 = vsub_f32(v643, v655); - int16x4_t v680 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v657, 15), (int32x2_t){0, 0})); - int16x4_t v686 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v658, 15), (int32x2_t){0, 0})); - int16x4_t v692 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v659, 15), (int32x2_t){0, 0})); - int16x4_t v698 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v660, 15), (int32x2_t){0, 0})); 
- int16x4_t v704 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v661, 15), (int32x2_t){0, 0})); - int16x4_t v710 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v662, 15), (int32x2_t){0, 0})); - int16x4_t v716 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v663, 15), (int32x2_t){0, 0})); - int16x4_t v722 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v664, 15), (int32x2_t){0, 0})); - int16x4_t v674 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v656, 15), (int32x2_t){0, 0})); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v680), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v686), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v692), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v698), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v704), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v710), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v716), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v722), 0); - int16x4_t v728 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v665, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v674), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v728), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v113 = 1.1000000000000001e+00F; + float v116 = 3.3166247903554003e-01F; + float v117 = -3.3166247903554003e-01F; + float v124 = 5.1541501300188641e-01F; + float v128 = 9.4125353283118118e-01F; + float v132 = 1.4143537075597825e+00F; + float v136 = 8.5949297361449750e-01F; + float v140 = 4.2314838273285138e-02F; + float v144 = 3.8639279888589606e-01F; + float v148 = 5.1254589567200015e-01F; + float v152 = 1.0702757469471715e+00F; + float v156 = 5.5486073394528512e-01F; + float v159 = 1.2412944743900585e+00F; + float v160 = -1.2412944743900585e+00F; + float v166 = 2.0897833842005756e-01F; + float v167 = -2.0897833842005756e-01F; + float v173 = 3.7415717312460811e-01F; + float v174 = 
-3.7415717312460811e-01F; + float v180 = 4.9929922194110327e-02F; + float v181 = -4.9929922194110327e-02F; + float v187 = 6.5815896284539266e-01F; + float v188 = -6.5815896284539266e-01F; + float v194 = 6.3306543373877577e-01F; + float v195 = -6.3306543373877577e-01F; + float v201 = 1.0822460581641109e+00F; + float v202 = -1.0822460581641109e+00F; + float v208 = 8.1720737907134022e-01F; + float v209 = -8.1720737907134022e-01F; + float v215 = 4.2408709531871824e-01F; + float v216 = -4.2408709531871824e-01F; + float32x2_t v218 = (float32x2_t){v4, v4}; + float32x2_t v86 = v5[0]; + float32x2_t v114 = (float32x2_t){v113, v113}; + float32x2_t v118 = (float32x2_t){v116, v117}; + float32x2_t v125 = (float32x2_t){v124, v124}; + float32x2_t v129 = (float32x2_t){v128, v128}; + float32x2_t v133 = (float32x2_t){v132, v132}; + float32x2_t v137 = (float32x2_t){v136, v136}; + float32x2_t v141 = (float32x2_t){v140, v140}; + float32x2_t v145 = (float32x2_t){v144, v144}; + float32x2_t v149 = (float32x2_t){v148, v148}; + float32x2_t v153 = (float32x2_t){v152, v152}; + float32x2_t v157 = (float32x2_t){v156, v156}; + float32x2_t v161 = (float32x2_t){v159, v160}; + float32x2_t v168 = (float32x2_t){v166, v167}; + float32x2_t v175 = (float32x2_t){v173, v174}; + float32x2_t v182 = (float32x2_t){v180, v181}; + float32x2_t v189 = (float32x2_t){v187, v188}; + float32x2_t v196 = (float32x2_t){v194, v195}; + float32x2_t v203 = (float32x2_t){v201, v202}; + float32x2_t v210 = (float32x2_t){v208, v209}; + float32x2_t v217 = (float32x2_t){v215, v216}; + float32x2_t v25 = v5[istride * 10]; + float32x2_t v31 = v5[istride * 2]; + float32x2_t v36 = v5[istride * 9]; + float32x2_t v42 = v5[istride * 3]; + float32x2_t v47 = v5[istride * 8]; + float32x2_t v53 = v5[istride * 4]; + float32x2_t v58 = v5[istride * 7]; + float32x2_t v64 = v5[istride * 5]; + float32x2_t v69 = v5[istride * 6]; + float32x2_t v120 = vmul_f32(v218, v118); + float32x2_t v163 = vmul_f32(v218, v161); + float32x2_t v170 = vmul_f32(v218, 
v168); + float32x2_t v177 = vmul_f32(v218, v175); + float32x2_t v184 = vmul_f32(v218, v182); + float32x2_t v191 = vmul_f32(v218, v189); + float32x2_t v198 = vmul_f32(v218, v196); + float32x2_t v205 = vmul_f32(v218, v203); + float32x2_t v212 = vmul_f32(v218, v210); + float32x2_t v219 = vmul_f32(v218, v217); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v37 = vadd_f32(v31, v36); + float32x2_t v48 = vadd_f32(v42, v47); + float32x2_t v59 = vadd_f32(v53, v58); + float32x2_t v70 = vadd_f32(v64, v69); + float32x2_t v71 = vsub_f32(v20, v25); + float32x2_t v72 = vsub_f32(v31, v36); + float32x2_t v73 = vsub_f32(v42, v47); + float32x2_t v74 = vsub_f32(v53, v58); + float32x2_t v75 = vsub_f32(v64, v69); + float32x2_t v76 = vadd_f32(v26, v37); + float32x2_t v77 = vadd_f32(v48, v70); + float32x2_t v79 = vsub_f32(v72, v73); + float32x2_t v80 = vadd_f32(v71, v75); + float32x2_t v90 = vsub_f32(v37, v59); + float32x2_t v91 = vsub_f32(v26, v59); + float32x2_t v92 = vsub_f32(v37, v26); + float32x2_t v93 = vsub_f32(v70, v59); + float32x2_t v94 = vsub_f32(v48, v59); + float32x2_t v95 = vsub_f32(v70, v48); + float32x2_t v96 = vsub_f32(v37, v70); + float32x2_t v97 = vsub_f32(v26, v48); + float32x2_t v99 = vadd_f32(v72, v74); + float32x2_t v100 = vsub_f32(v71, v74); + float32x2_t v101 = vadd_f32(v71, v72); + float32x2_t v102 = vsub_f32(v74, v75); + float32x2_t v103 = vsub_f32(v73, v74); + float32x2_t v104 = vsub_f32(v73, v75); + float32x2_t v105 = vadd_f32(v72, v75); + float32x2_t v106 = vsub_f32(v71, v73); + float32x2_t v78 = vadd_f32(v59, v76); + float32x2_t v88 = vsub_f32(v79, v80); + float32x2_t v98 = vsub_f32(v77, v76); + float32x2_t v107 = vadd_f32(v79, v80); + float32x2_t v126 = vmul_f32(v90, v125); + float32x2_t v130 = vmul_f32(v91, v129); + float32x2_t v134 = vmul_f32(v92, v133); + float32x2_t v138 = vmul_f32(v93, v137); + float32x2_t v142 = vmul_f32(v94, v141); + float32x2_t v146 = vmul_f32(v95, v145); + float32x2_t v150 = vmul_f32(v96, v149); + float32x2_t v154 = 
vmul_f32(v97, v153); + float32x2_t v164 = vrev64_f32(v99); + float32x2_t v171 = vrev64_f32(v100); + float32x2_t v178 = vrev64_f32(v101); + float32x2_t v185 = vrev64_f32(v102); + float32x2_t v192 = vrev64_f32(v103); + float32x2_t v199 = vrev64_f32(v104); + float32x2_t v206 = vrev64_f32(v105); + float32x2_t v213 = vrev64_f32(v106); + float32x2_t v81 = vadd_f32(v78, v77); + float32x2_t v89 = vsub_f32(v88, v74); + float32x2_t v158 = vmul_f32(v98, v157); + float32x2_t v165 = vmul_f32(v164, v163); + float32x2_t v172 = vmul_f32(v171, v170); + float32x2_t v179 = vmul_f32(v178, v177); + float32x2_t v186 = vmul_f32(v185, v184); + float32x2_t v193 = vmul_f32(v192, v191); + float32x2_t v200 = vmul_f32(v199, v198); + float32x2_t v207 = vmul_f32(v206, v205); + float32x2_t v214 = vmul_f32(v213, v212); + float32x2_t v220 = vrev64_f32(v107); + float32x2_t v223 = vadd_f32(v126, v130); + float32x2_t v224 = vadd_f32(v130, v134); + float32x2_t v225 = vsub_f32(v126, v134); + float32x2_t v226 = vadd_f32(v138, v142); + float32x2_t v227 = vadd_f32(v142, v146); + float32x2_t v228 = vsub_f32(v138, v146); + float32x2_t v87 = vadd_f32(v86, v81); + float32x2_t v115 = vmul_f32(v81, v114); + float32x2_t v121 = vrev64_f32(v89); + float32x2_t v221 = vmul_f32(v220, v219); + float32x2_t v229 = vadd_f32(v154, v158); + float32x2_t v230 = vadd_f32(v150, v158); + float32x2_t v231 = vadd_f32(v172, v179); + float32x2_t v232 = vsub_f32(v165, v179); + float32x2_t v233 = vadd_f32(v193, v200); + float32x2_t v234 = vsub_f32(v186, v200); + float32x2_t v122 = vmul_f32(v121, v120); + float32x2_t v222 = vsub_f32(v87, v115); + float32x2_t v235 = vadd_f32(v214, v221); + float32x2_t v236 = vsub_f32(v207, v221); + float32x2_t v237 = vadd_f32(v227, v229); + float32x2_t v255 = vadd_f32(v231, v232); + int16x4_t v271 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v87, 15), (int32x2_t){0, 0})); + float32x2_t v238 = vadd_f32(v237, v222); + float32x2_t v239 = vsub_f32(v222, v224); + float32x2_t v241 = vadd_f32(v222, v228); + 
float32x2_t v243 = vsub_f32(v222, v225); + float32x2_t v245 = vadd_f32(v222, v223); + float32x2_t v247 = vadd_f32(v122, v233); + float32x2_t v249 = vsub_f32(v235, v231); + float32x2_t v251 = vadd_f32(v122, v236); + float32x2_t v253 = vsub_f32(v236, v232); + float32x2_t v256 = vadd_f32(v255, v233); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v271), 0); + float32x2_t v240 = vsub_f32(v239, v229); + float32x2_t v242 = vadd_f32(v241, v230); + float32x2_t v244 = vsub_f32(v243, v230); + float32x2_t v246 = vsub_f32(v245, v226); + float32x2_t v248 = vadd_f32(v247, v235); + float32x2_t v250 = vsub_f32(v249, v122); + float32x2_t v252 = vadd_f32(v251, v234); + float32x2_t v254 = vsub_f32(v253, v122); + float32x2_t v257 = vadd_f32(v256, v234); + float32x2_t v258 = vsub_f32(v257, v122); + float32x2_t v260 = vadd_f32(v238, v248); + float32x2_t v261 = vadd_f32(v240, v250); + float32x2_t v262 = vsub_f32(v242, v252); + float32x2_t v263 = vadd_f32(v244, v254); + float32x2_t v264 = vsub_f32(v244, v254); + float32x2_t v265 = vadd_f32(v242, v252); + float32x2_t v266 = vsub_f32(v240, v250); + float32x2_t v267 = vsub_f32(v238, v248); + float32x2_t v259 = vadd_f32(v246, v258); + float32x2_t v268 = vsub_f32(v246, v258); + int16x4_t v283 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v260, 15), (int32x2_t){0, 0})); + int16x4_t v289 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v261, 15), (int32x2_t){0, 0})); + int16x4_t v295 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v262, 15), (int32x2_t){0, 0})); + int16x4_t v301 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v263, 15), (int32x2_t){0, 0})); + int16x4_t v307 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v264, 15), (int32x2_t){0, 0})); + int16x4_t v313 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v265, 15), (int32x2_t){0, 0})); + int16x4_t v319 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v266, 15), (int32x2_t){0, 0})); + int16x4_t v325 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v267, 15), (int32x2_t){0, 0})); + int16x4_t v277 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v259, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v283), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v289), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v295), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v301), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v307), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v313), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v319), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v325), 0); + int16x4_t v331 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v268, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v277), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v331), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -3446,105 +2254,58 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v470)[0])); svfloat32_t v632 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v479)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v614), "w"(v616)); - svfloat32_t v47; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v618), "w"(v620)); - svfloat32_t v62; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v622), "w"(v624)); - svfloat32_t v77; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v626), "w"(v628)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v630), "w"(v632)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v614), "w"(v616)); - svfloat32_t v94; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v618), "w"(v620)); - svfloat32_t v95; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v95) : "w"(v622), "w"(v624)); - svfloat32_t v96; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v626), "w"(v628)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v630), "w"(v632)); - svfloat32_t v98; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v32), "w"(v47)); - svfloat32_t v99; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v62), "w"(v92)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v94), "w"(v95)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v93), "w"(v97)); - svfloat32_t v114; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v47), "w"(v77)); - svfloat32_t v115; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v32), "w"(v77)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v47), "w"(v32)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v92), "w"(v77)); - svfloat32_t v118; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v62), "w"(v77)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v92), "w"(v62)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v47), "w"(v92)); - svfloat32_t v121; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v32), "w"(v62)); - svfloat32_t v123; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v94), "w"(v96)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v93), "w"(v96)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v93), "w"(v94)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v96), "w"(v97)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v95), "w"(v96)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v95), "w"(v97)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v94), "w"(v97)); - svfloat32_t v130; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v93), "w"(v95)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v77), "w"(v98)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v101), "w"(v102)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v99), "w"(v98)); - svfloat32_t v131; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v131) : "w"(v101), "w"(v102)); - svfloat32_t v158; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v115), "w"(v496)); - svfloat32_t v163; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v116), "w"(v497)); - svfloat32_t v173; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v118), "w"(v499)); - svfloat32_t v178; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v119), "w"(v500)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v614, v616); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v618, v620); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v622, v624); + svfloat32_t v77 = svadd_f32_x(svptrue_b32(), v626, v628); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v630, v632); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v614, v616); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v618, v620); + svfloat32_t v95 = svsub_f32_x(svptrue_b32(), v622, v624); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v626, v628); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v630, v632); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v32, v47); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v62, v92); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v94, v95); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v97); + svfloat32_t v114 = svsub_f32_x(svptrue_b32(), v47, v77); + svfloat32_t v115 = svsub_f32_x(svptrue_b32(), v32, v77); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v47, v32); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v92, v77); + svfloat32_t v118 = svsub_f32_x(svptrue_b32(), v62, v77); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v92, v62); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v47, v92); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v32, v62); + svfloat32_t v123 = svadd_f32_x(svptrue_b32(), v94, v96); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v93, v96); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v93, v94); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), 
v96, v97); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v95, v96); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v95, v97); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v94, v97); + svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v93, v95); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v77, v98); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v101, v102); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v99, v98); + svfloat32_t v131 = svadd_f32_x(svptrue_b32(), v101, v102); + svfloat32_t v158 = svmul_f32_x(svptrue_b32(), v115, v496); + svfloat32_t v163 = svmul_f32_x(svptrue_b32(), v116, v497); + svfloat32_t v173 = svmul_f32_x(svptrue_b32(), v118, v499); + svfloat32_t v178 = svmul_f32_x(svptrue_b32(), v119, v500); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v504, v123, 90); - svfloat32_t zero214; - asm volatile("mov %0.s, #0" : "=w"(zero214)); + svfloat32_t zero214 = svdup_n_f32(0); svfloat32_t v214 = svcmla_f32_x(pred_full, zero214, v506, v125, 90); - svfloat32_t zero221; - asm volatile("mov %0.s, #0" : "=w"(zero221)); + svfloat32_t zero221 = svdup_n_f32(0); svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v507, v126, 90); - svfloat32_t zero235; - asm volatile("mov %0.s, #0" : "=w"(zero235)); + svfloat32_t zero235 = svdup_n_f32(0); svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v509, v128, 90); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v510, v129, 90); - svfloat32_t v103; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v100), "w"(v99)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v112), "w"(v96)); - svfloat32_t v193; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v122), "w"(v503)); - svfloat32_t zero256; - asm volatile("mov %0.s, #0" : "=w"(zero256)); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v100, v99); + svfloat32_t v113 = 
svsub_f32_x(svptrue_b32(), v112, v96); + svfloat32_t v193 = svmul_f32_x(svptrue_b32(), v122, v503); + svfloat32_t zero256 = svdup_n_f32(0); svfloat32_t v256 = svcmla_f32_x(pred_full, zero256, v512, v131, 90); svfloat32_t v258 = svmla_f32_x(pred_full, v158, v114, v495); svfloat32_t v259 = svmla_f32_x(pred_full, v163, v115, v496); @@ -3553,92 +2314,55 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu11(const armral_cmplx_f32_t *restrict x, svfloat32_t v262 = svmla_f32_x(pred_full, v178, v118, v499); svfloat32_t v263 = svnmls_f32_x(pred_full, v178, v117, v498); svfloat32_t v266 = svcmla_f32_x(pred_full, v214, v505, v124, 90); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v200), "w"(v214)); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v200, v214); svfloat32_t v268 = svcmla_f32_x(pred_full, v235, v508, v127, 90); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v221), "w"(v235)); - svfloat32_t v111; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v634), "w"(v103)); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v221, v235); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v634, v103); + svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v494, v113, 90); svfloat32_t v264 = svmla_f32_x(pred_full, v193, v121, v502); svfloat32_t v265 = svmla_f32_x(pred_full, v193, v120, v501); svfloat32_t v270 = svcmla_f32_x(pred_full, v256, v511, v130, 90); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v242), "w"(v256)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v266), "w"(v267)); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v242, v256); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v266, v267); svfloat32_t v257 = svmls_f32_x(pred_full, v111, v103, v493); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v262), "w"(v264)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v282) : "w"(v148), "w"(v268)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v270), "w"(v266)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v148), "w"(v271)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v271), "w"(v267)); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v290), "w"(v268)); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v262, v264); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v148, v268); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v270, v266); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v148, v271); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v271, v267); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v290, v268); svint16_t v306 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v111, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v272), "w"(v257)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v257), "w"(v259)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v257), "w"(v263)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v257), "w"(v260)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v257), "w"(v258)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v270)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v148)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v286), "w"(v269)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), "w"(v148)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v269)); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v272, v257); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v257, v259); + svfloat32_t 
v276 = svadd_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v257, v260); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v257, v258); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v270); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v284, v148); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v286, v269); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v288, v148); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v291, v269); svst1w_u64(pred_full, (unsigned *)(v520), svreinterpret_u64_s16(v306)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v274), "w"(v264)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v276), "w"(v265)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v278), "w"(v265)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v261)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v292), "w"(v148)); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v273), "w"(v283)); - svfloat32_t v302; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v273), "w"(v283)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v281), "w"(v293)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v275), "w"(v285)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v277), "w"(v287)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v279), "w"(v289)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v279), "w"(v289)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v277), "w"(v287)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v275), "w"(v285)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v281), "w"(v293)); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v274, v264); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v276, v265); + 
svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v278, v265); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v280, v261); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v292, v148); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v273, v283); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v281, v293); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v277, v287); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v279, v289); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v279, v289); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v277, v287); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v275, v285); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v281, v293); svint16_t v322 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v295, (float)(1ULL << 31ULL)))), @@ -3713,280 +2437,132 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v324 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v138 = 1.0000000000000000e+00F; - float v139 = -1.0000000000000000e+00F; - float v169 = -1.4999999999999998e+00F; - float v170 = 1.4999999999999998e+00F; - float v201 = 8.6602540378443871e-01F; - float32x2_t v204 = (float32x2_t){v4, v4}; - float v210 = -8.6602540378443871e-01F; - const float32x2_t *v667 = &v5[istride]; - int32_t *v731 = &v6[ostride]; - float32x2_t v140 = (float32x2_t){v138, v139}; - float32x2_t v166 = (float32x2_t){v169, v169}; - float32x2_t v171 = (float32x2_t){v169, v170}; - float32x2_t v203 = (float32x2_t){v201, v210}; - float32x2_t v211 = (float32x2_t){v210, v210}; - const float32x2_t *v604 = &v5[0]; - int32_t *v695 = &v6[0]; - float32x4_t v816 = vld1q_f32((const float32_t *)v667); - float32x2_t v142 = vmul_f32(v204, v140); - float32x4_t 
v167 = vcombine_f32(v166, v166); - float32x2_t v173 = vmul_f32(v204, v171); - float32x2_t v205 = vmul_f32(v204, v203); - float32x4_t v212 = vcombine_f32(v211, v211); - const float32x2_t *v585 = &v5[istride * 4]; - const float32x2_t *v594 = &v5[istride * 8]; - const float32x2_t *v613 = &v5[istride * 7]; - const float32x2_t *v622 = &v5[istride * 11]; - const float32x2_t *v631 = &v5[istride * 3]; - const float32x2_t *v640 = &v5[istride * 10]; - const float32x2_t *v649 = &v5[istride * 2]; - const float32x2_t *v658 = &v5[istride * 6]; - const float32x2_t *v676 = &v5[istride * 5]; - const float32x2_t *v685 = &v5[istride * 9]; - int32_t *v704 = &v6[ostride * 4]; - int32_t *v713 = &v6[ostride * 8]; - int32_t *v722 = &v6[ostride * 9]; - int32_t *v740 = &v6[ostride * 5]; - int32_t *v749 = &v6[ostride * 6]; - int32_t *v758 = &v6[ostride * 10]; - int32_t *v767 = &v6[ostride * 2]; - int32_t *v776 = &v6[ostride * 3]; - int32_t *v785 = &v6[ostride * 7]; - int32_t *v794 = &v6[ostride * 11]; - float32x4_t v802 = vld1q_f32((const float32_t *)v604); - float32x4_t v144 = vcombine_f32(v142, v142); - float32x4_t v175 = vcombine_f32(v173, v173); - float32x4_t v207 = vcombine_f32(v205, v205); - float32x4_t v798 = vld1q_f32((const float32_t *)v585); - float32x4_t v800 = vld1q_f32((const float32_t *)v594); - float32x4_t v804 = vld1q_f32((const float32_t *)v613); - float32x4_t v806 = vld1q_f32((const float32_t *)v622); - float32x4_t v808 = vld1q_f32((const float32_t *)v631); - float32x4_t v810 = vld1q_f32((const float32_t *)v640); - float32x4_t v812 = vld1q_f32((const float32_t *)v649); - float32x4_t v814 = vld1q_f32((const float32_t *)v658); - float32x4_t v818 = vld1q_f32((const float32_t *)v676); - float32x4_t v820 = vld1q_f32((const float32_t *)v685); - float32x4_t v35 = vaddq_f32(v798, v800); - float32x4_t v36 = vsubq_f32(v798, v800); - float32x4_t v59 = vaddq_f32(v804, v806); - float32x4_t v60 = vsubq_f32(v804, v806); - float32x4_t v83 = vaddq_f32(v810, v812); - float32x4_t v84 = 
vsubq_f32(v810, v812); - float32x4_t v107 = vaddq_f32(v816, v818); - float32x4_t v108 = vsubq_f32(v816, v818); - float32x4_t v44 = vaddq_f32(v35, v802); - float32x4_t v68 = vaddq_f32(v59, v808); - float32x4_t v92 = vaddq_f32(v83, v814); - float32x4_t v116 = vaddq_f32(v107, v820); - float32x4_t v148 = vaddq_f32(v35, v83); - float32x4_t v149 = vsubq_f32(v35, v83); - float32x4_t v150 = vaddq_f32(v59, v107); - float32x4_t v151 = vsubq_f32(v59, v107); - float32x4_t v179 = vaddq_f32(v36, v84); - float32x4_t v180 = vsubq_f32(v36, v84); - float32x4_t v181 = vaddq_f32(v60, v108); - float32x4_t v182 = vsubq_f32(v60, v108); - float32x4_t v117 = vaddq_f32(v44, v92); - float32x4_t v118 = vsubq_f32(v44, v92); - float32x4_t v119 = vaddq_f32(v68, v116); - float32x4_t v120 = vsubq_f32(v68, v116); - float32x4_t v152 = vaddq_f32(v148, v150); - float32x4_t v153 = vsubq_f32(v148, v150); - float32x4_t v168 = vmulq_f32(v149, v167); - float32x4_t v174 = vrev64q_f32(v151); - float32x4_t v183 = vaddq_f32(v179, v181); - float32x4_t v184 = vsubq_f32(v179, v181); - float32x4_t v206 = vrev64q_f32(v180); - float32x4_t v213 = vmulq_f32(v182, v212); - float32x4_t v121 = vaddq_f32(v117, v119); - float32x4_t v122 = vsubq_f32(v117, v119); - float32x4_t v143 = vrev64q_f32(v120); - float32x4_t v158 = vmulq_f32(v152, v167); - float32x4_t v163 = vmulq_f32(v153, v167); - float32x4_t v176 = vmulq_f32(v174, v175); - float32x4_t v190 = vrev64q_f32(v183); - float32x4_t v198 = vrev64q_f32(v184); - float32x4_t v208 = vmulq_f32(v206, v207); - float32x4_t v145 = vmulq_f32(v143, v144); - float32x4_t v177 = vaddq_f32(v168, v176); - float32x4_t v178 = vsubq_f32(v168, v176); - float32x4_t v192 = vmulq_f32(v190, v207); - float32x4_t v200 = vmulq_f32(v198, v207); - float32x4_t v214 = vaddq_f32(v208, v213); - float32x4_t v215 = vsubq_f32(v208, v213); - float32x4_t v216 = vaddq_f32(v121, v158); - int16x4_t v221 = vqmovn_s32(vcvtq_n_s32_f32(v121, 15)); - float32x4_t v270 = vaddq_f32(v122, v163); - int16x4_t v275 = 
vqmovn_s32(vcvtq_n_s32_f32(v122, 15)); - float32x4_t v146 = vaddq_f32(v118, v145); - float32x4_t v147 = vsubq_f32(v118, v145); - float32x4_t v217 = vaddq_f32(v216, v192); - float32x4_t v218 = vsubq_f32(v216, v192); - float32x4_t v271 = vaddq_f32(v270, v200); - float32x4_t v272 = vsubq_f32(v270, v200); - vst1_s16((int16_t *)v695, v221); - vst1_s16((int16_t *)v749, v275); - int16x4_t v229 = vqmovn_s32(vcvtq_n_s32_f32(v218, 15)); - int16x4_t v237 = vqmovn_s32(vcvtq_n_s32_f32(v217, 15)); - float32x4_t v243 = vaddq_f32(v147, v178); - int16x4_t v248 = vqmovn_s32(vcvtq_n_s32_f32(v147, 15)); - int16x4_t v283 = vqmovn_s32(vcvtq_n_s32_f32(v272, 15)); - int16x4_t v291 = vqmovn_s32(vcvtq_n_s32_f32(v271, 15)); - float32x4_t v297 = vaddq_f32(v146, v177); - int16x4_t v302 = vqmovn_s32(vcvtq_n_s32_f32(v146, 15)); - float32x4_t v244 = vaddq_f32(v243, v215); - float32x4_t v245 = vsubq_f32(v243, v215); - float32x4_t v298 = vaddq_f32(v297, v214); - float32x4_t v299 = vsubq_f32(v297, v214); - vst1_s16((int16_t *)v704, v229); - vst1_s16((int16_t *)v713, v237); - vst1_s16((int16_t *)v722, v248); - vst1_s16((int16_t *)v758, v283); - vst1_s16((int16_t *)v767, v291); - vst1_s16((int16_t *)v776, v302); - int16x4_t v256 = vqmovn_s32(vcvtq_n_s32_f32(v245, 15)); - int16x4_t v264 = vqmovn_s32(vcvtq_n_s32_f32(v244, 15)); - int16x4_t v310 = vqmovn_s32(vcvtq_n_s32_f32(v299, 15)); - int16x4_t v318 = vqmovn_s32(vcvtq_n_s32_f32(v298, 15)); - vst1_s16((int16_t *)v731, v256); - vst1_s16((int16_t *)v740, v264); - vst1_s16((int16_t *)v785, v310); - vst1_s16((int16_t *)v794, v318); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v324 * 2; j < howmany; j += 1) { - float32x2_t v390 = v5[istride]; - float v422 = 1.0000000000000000e+00F; - float v423 = -1.0000000000000000e+00F; - float v449 = -1.4999999999999998e+00F; - float v450 = 1.4999999999999998e+00F; - float v478 = 8.6602540378443871e-01F; - float32x2_t v481 = (float32x2_t){v4, v4}; - float v486 = -8.6602540378443871e-01F; - float32x2_t v348 = v5[0]; - 
float32x2_t v424 = (float32x2_t){v422, v423}; - float32x2_t v447 = (float32x2_t){v449, v449}; - float32x2_t v451 = (float32x2_t){v449, v450}; - float32x2_t v480 = (float32x2_t){v478, v486}; - float32x2_t v487 = (float32x2_t){v486, v486}; - float32x2_t v336 = v5[istride * 4]; - float32x2_t v341 = v5[istride * 8]; - float32x2_t v354 = v5[istride * 7]; - float32x2_t v359 = v5[istride * 11]; - float32x2_t v366 = v5[istride * 3]; - float32x2_t v372 = v5[istride * 10]; - float32x2_t v377 = v5[istride * 2]; - float32x2_t v384 = v5[istride * 6]; - float32x2_t v395 = v5[istride * 5]; - float32x2_t v402 = v5[istride * 9]; - float32x2_t v426 = vmul_f32(v481, v424); - float32x2_t v453 = vmul_f32(v481, v451); - float32x2_t v482 = vmul_f32(v481, v480); - float32x2_t v342 = vadd_f32(v336, v341); - float32x2_t v343 = vsub_f32(v336, v341); - float32x2_t v360 = vadd_f32(v354, v359); - float32x2_t v361 = vsub_f32(v354, v359); - float32x2_t v378 = vadd_f32(v372, v377); - float32x2_t v379 = vsub_f32(v372, v377); - float32x2_t v396 = vadd_f32(v390, v395); - float32x2_t v397 = vsub_f32(v390, v395); - float32x2_t v349 = vadd_f32(v342, v348); - float32x2_t v367 = vadd_f32(v360, v366); - float32x2_t v385 = vadd_f32(v378, v384); - float32x2_t v403 = vadd_f32(v396, v402); - float32x2_t v431 = vadd_f32(v342, v378); - float32x2_t v432 = vsub_f32(v342, v378); - float32x2_t v433 = vadd_f32(v360, v396); - float32x2_t v434 = vsub_f32(v360, v396); - float32x2_t v458 = vadd_f32(v343, v379); - float32x2_t v459 = vsub_f32(v343, v379); - float32x2_t v460 = vadd_f32(v361, v397); - float32x2_t v461 = vsub_f32(v361, v397); - float32x2_t v404 = vadd_f32(v349, v385); - float32x2_t v405 = vsub_f32(v349, v385); - float32x2_t v406 = vadd_f32(v367, v403); - float32x2_t v407 = vsub_f32(v367, v403); - float32x2_t v435 = vadd_f32(v431, v433); - float32x2_t v436 = vsub_f32(v431, v433); - float32x2_t v448 = vmul_f32(v432, v447); - float32x2_t v454 = vrev64_f32(v434); - float32x2_t v462 = vadd_f32(v458, v460); - 
float32x2_t v463 = vsub_f32(v458, v460); - float32x2_t v483 = vrev64_f32(v459); - float32x2_t v488 = vmul_f32(v461, v487); - float32x2_t v408 = vadd_f32(v404, v406); - float32x2_t v409 = vsub_f32(v404, v406); - float32x2_t v427 = vrev64_f32(v407); - float32x2_t v440 = vmul_f32(v435, v447); - float32x2_t v444 = vmul_f32(v436, v447); - float32x2_t v455 = vmul_f32(v454, v453); - float32x2_t v469 = vrev64_f32(v462); - float32x2_t v476 = vrev64_f32(v463); - float32x2_t v484 = vmul_f32(v483, v482); - float32x2_t v428 = vmul_f32(v427, v426); - float32x2_t v456 = vadd_f32(v448, v455); - float32x2_t v457 = vsub_f32(v448, v455); - float32x2_t v470 = vmul_f32(v469, v482); - float32x2_t v477 = vmul_f32(v476, v482); - float32x2_t v489 = vadd_f32(v484, v488); - float32x2_t v490 = vsub_f32(v484, v488); - float32x2_t v491 = vadd_f32(v408, v440); - int16x4_t v496 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v408, 15), (int32x2_t){0, 0})); - float32x2_t v533 = vadd_f32(v409, v444); - int16x4_t v538 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v409, 15), (int32x2_t){0, 0})); - float32x2_t v429 = vadd_f32(v405, v428); - float32x2_t v430 = vsub_f32(v405, v428); - float32x2_t v492 = vadd_f32(v491, v470); - float32x2_t v493 = vsub_f32(v491, v470); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v496), 0); - float32x2_t v534 = vadd_f32(v533, v477); - float32x2_t v535 = vsub_f32(v533, v477); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v538), 0); - int16x4_t v502 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v493, 15), (int32x2_t){0, 0})); - int16x4_t v508 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v492, 15), (int32x2_t){0, 0})); - float32x2_t v512 = vadd_f32(v430, v457); - int16x4_t v517 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v430, 15), (int32x2_t){0, 0})); - int16x4_t v544 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v535, 15), (int32x2_t){0, 0})); - int16x4_t v550 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v534, 15), (int32x2_t){0, 0})); - float32x2_t v554 = vadd_f32(v429, v456); - 
int16x4_t v559 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v429, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v502), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v508), 0); - float32x2_t v513 = vadd_f32(v512, v490); - float32x2_t v514 = vsub_f32(v512, v490); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v517), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v544), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v550), 0); - float32x2_t v555 = vadd_f32(v554, v489); - float32x2_t v556 = vsub_f32(v554, v489); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v559), 0); - int16x4_t v523 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v514, 15), (int32x2_t){0, 0})); - int16x4_t v529 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v513, 15), (int32x2_t){0, 0})); - int16x4_t v565 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v556, 15), (int32x2_t){0, 0})); - int16x4_t v571 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v555, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v523), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v529), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v565), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v571), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v74 = v5[istride]; + float v106 = 1.0000000000000000e+00F; + float v107 = -1.0000000000000000e+00F; + float v133 = -1.4999999999999998e+00F; + float v134 = 1.4999999999999998e+00F; + float v162 = 8.6602540378443871e-01F; + float32x2_t v165 = (float32x2_t){v4, v4}; + float v170 = -8.6602540378443871e-01F; + float32x2_t v32 = v5[0]; + float32x2_t v108 = (float32x2_t){v106, v107}; + float32x2_t v131 = (float32x2_t){v133, v133}; + float32x2_t v135 = (float32x2_t){v133, v134}; + float32x2_t v164 = (float32x2_t){v162, v170}; + float32x2_t v171 = (float32x2_t){v170, v170}; + float32x2_t v20 = v5[istride * 4]; + float32x2_t v25 = v5[istride * 8]; + 
float32x2_t v38 = v5[istride * 7]; + float32x2_t v43 = v5[istride * 11]; + float32x2_t v50 = v5[istride * 3]; + float32x2_t v56 = v5[istride * 10]; + float32x2_t v61 = v5[istride * 2]; + float32x2_t v68 = v5[istride * 6]; + float32x2_t v79 = v5[istride * 5]; + float32x2_t v86 = v5[istride * 9]; + float32x2_t v110 = vmul_f32(v165, v108); + float32x2_t v137 = vmul_f32(v165, v135); + float32x2_t v166 = vmul_f32(v165, v164); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v44 = vadd_f32(v38, v43); + float32x2_t v45 = vsub_f32(v38, v43); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v80 = vadd_f32(v74, v79); + float32x2_t v81 = vsub_f32(v74, v79); + float32x2_t v33 = vadd_f32(v26, v32); + float32x2_t v51 = vadd_f32(v44, v50); + float32x2_t v69 = vadd_f32(v62, v68); + float32x2_t v87 = vadd_f32(v80, v86); + float32x2_t v115 = vadd_f32(v26, v62); + float32x2_t v116 = vsub_f32(v26, v62); + float32x2_t v117 = vadd_f32(v44, v80); + float32x2_t v118 = vsub_f32(v44, v80); + float32x2_t v142 = vadd_f32(v27, v63); + float32x2_t v143 = vsub_f32(v27, v63); + float32x2_t v144 = vadd_f32(v45, v81); + float32x2_t v145 = vsub_f32(v45, v81); + float32x2_t v88 = vadd_f32(v33, v69); + float32x2_t v89 = vsub_f32(v33, v69); + float32x2_t v90 = vadd_f32(v51, v87); + float32x2_t v91 = vsub_f32(v51, v87); + float32x2_t v119 = vadd_f32(v115, v117); + float32x2_t v120 = vsub_f32(v115, v117); + float32x2_t v132 = vmul_f32(v116, v131); + float32x2_t v138 = vrev64_f32(v118); + float32x2_t v146 = vadd_f32(v142, v144); + float32x2_t v147 = vsub_f32(v142, v144); + float32x2_t v167 = vrev64_f32(v143); + float32x2_t v172 = vmul_f32(v145, v171); + float32x2_t v92 = vadd_f32(v88, v90); + float32x2_t v93 = vsub_f32(v88, v90); + float32x2_t v111 = vrev64_f32(v91); + float32x2_t v124 = vmul_f32(v119, v131); + float32x2_t v128 = vmul_f32(v120, v131); + float32x2_t v139 = vmul_f32(v138, v137); + float32x2_t v153 = 
vrev64_f32(v146); + float32x2_t v160 = vrev64_f32(v147); + float32x2_t v168 = vmul_f32(v167, v166); + float32x2_t v112 = vmul_f32(v111, v110); + float32x2_t v140 = vadd_f32(v132, v139); + float32x2_t v141 = vsub_f32(v132, v139); + float32x2_t v154 = vmul_f32(v153, v166); + float32x2_t v161 = vmul_f32(v160, v166); + float32x2_t v173 = vadd_f32(v168, v172); + float32x2_t v174 = vsub_f32(v168, v172); + float32x2_t v175 = vadd_f32(v92, v124); + int16x4_t v180 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v92, 15), (int32x2_t){0, 0})); + float32x2_t v217 = vadd_f32(v93, v128); + int16x4_t v222 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v93, 15), (int32x2_t){0, 0})); + float32x2_t v113 = vadd_f32(v89, v112); + float32x2_t v114 = vsub_f32(v89, v112); + float32x2_t v176 = vadd_f32(v175, v154); + float32x2_t v177 = vsub_f32(v175, v154); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v180), 0); + float32x2_t v218 = vadd_f32(v217, v161); + float32x2_t v219 = vsub_f32(v217, v161); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v222), 0); + int16x4_t v186 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v177, 15), (int32x2_t){0, 0})); + int16x4_t v192 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v176, 15), (int32x2_t){0, 0})); + float32x2_t v196 = vadd_f32(v114, v141); + int16x4_t v201 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v114, 15), (int32x2_t){0, 0})); + int16x4_t v228 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v219, 15), (int32x2_t){0, 0})); + int16x4_t v234 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v218, 15), (int32x2_t){0, 0})); + float32x2_t v238 = vadd_f32(v113, v140); + int16x4_t v243 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v113, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v186), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v192), 0); + float32x2_t v197 = vadd_f32(v196, v174); + float32x2_t v198 = vsub_f32(v196, v174); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v201), 0); + v6[ostride * 10] = 
vget_lane_s32(vreinterpret_s32_s16(v228), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v234), 0); + float32x2_t v239 = vadd_f32(v238, v173); + float32x2_t v240 = vsub_f32(v238, v173); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v243), 0); + int16x4_t v207 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v198, 15), (int32x2_t){0, 0})); + int16x4_t v213 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v197, 15), (int32x2_t){0, 0})); + int16x4_t v249 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v240, 15), (int32x2_t){0, 0})); + int16x4_t v255 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v239, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v207), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v213), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v249), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v255), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -4089,89 +2665,52 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v413)[0])); svfloat32_t v569 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v422)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v547), "w"(v549)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v547), "w"(v549)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v553), "w"(v555)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v553), "w"(v555)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v559), "w"(v561)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v559), "w"(v561)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v565), "w"(v567)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v565), "w"(v567)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v551)); - svfloat32_t v65; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v557)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v563)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v569)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v32), "w"(v80)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v32), "w"(v80)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v56), "w"(v104)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v56), "w"(v104)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v33), "w"(v81)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v33), "w"(v81)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v57), "w"(v105)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v57), "w"(v105)); - svfloat32_t v114; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v41), "w"(v89)); - svfloat32_t v115; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v41), "w"(v89)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v65), "w"(v113)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v65), "w"(v113)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v144), "w"(v146)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v144), "w"(v146)); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v547, v549); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v553, v555); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v559, v561); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v565, v567); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v565, v567); + 
svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v551); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v557); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v563); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v569); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v32, v80); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v32, v80); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v56, v104); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v56, v104); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v33, v81); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v33, v81); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v57, v105); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v57, v105); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v41, v89); + svfloat32_t v115 = svsub_f32_x(svptrue_b32(), v41, v89); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x(pred_full, zero171, v432, v147, 90); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v174), "w"(v176)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v174), "w"(v176)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v174, v176); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v174, v176); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v435, v175, 90); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v114), "w"(v116)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v114), "w"(v116)); - svfloat32_t zero141; - asm volatile("mov %0.s, #0" : "=w"(zero141)); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v114, v116); + svfloat32_t v119 = 
svsub_f32_x(svptrue_b32(), v114, v116); + svfloat32_t zero141 = svdup_n_f32(0); svfloat32_t v141 = svcmla_f32_x(pred_full, zero141, v428, v117, 90); svfloat32_t v172 = svmla_f32_x(pred_full, v171, v145, v431); svfloat32_t v173 = svnmls_f32_x(pred_full, v171, v145, v431); - svfloat32_t zero186; - asm volatile("mov %0.s, #0" : "=w"(zero186)); + svfloat32_t zero186 = svdup_n_f32(0); svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v435, v178, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); + svfloat32_t zero193 = svdup_n_f32(0); svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v435, v179, 90); svfloat32_t v206 = svmla_f32_x(pred_full, v200, v177, v436); svfloat32_t v207 = svmls_f32_x(pred_full, v200, v177, v436); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v115), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v115), "w"(v141)); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v115, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v115, v141); svfloat32_t v208 = svmla_f32_x(pred_full, v118, v148, v431); svint16_t v213 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -4184,23 +2723,17 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v119, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v186)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v208), "w"(v186)); - svfloat32_t v235; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v143), "w"(v173)); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v186); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v208, v186); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v143, v173); svint16_t v240 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v143, 
(float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v262), "w"(v193)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v262), "w"(v193)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v142), "w"(v172)); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v262, v193); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v262, v193); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v142, v172); svint16_t v294 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v142, (float)(1ULL << 31ULL)))), @@ -4218,10 +2751,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v209, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v235), "w"(v207)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v235), "w"(v207)); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v235, v207); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v235, v207); svint16_t v275 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v264, (float)(1ULL << 31ULL)))), @@ -4232,10 +2763,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu12(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v263, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v289), "w"(v206)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v289), "w"(v206)); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v289, v206); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v289, v206); svst1w_u64(pred_full, (unsigned *)(v471), 
svreinterpret_u64_s16(v240)); svst1w_u64(pred_full, (unsigned *)(v525), svreinterpret_u64_s16(v294)); svint16_t v248 = svtbl_s16( @@ -4280,532 +2809,250 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v451 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v161 = 1.0833333333333333e+00F; - float v166 = -3.0046260628866578e-01F; - float v170 = 7.4927933062613905e-01F; - float v171 = -7.4927933062613905e-01F; - float v178 = 4.0100212832186721e-01F; - float v179 = -4.0100212832186721e-01F; - float v186 = 5.7514072947400308e-01F; - float v187 = -5.7514072947400308e-01F; - float v195 = 5.2422663952658211e-01F; - float v200 = 5.1652078062348972e-01F; - float v205 = 7.7058589030924258e-03F; - float v210 = 4.2763404682656941e-01F; - float v215 = 1.5180597207438440e-01F; - float v220 = 5.7944001890096386e-01F; - float v224 = 1.1543953381323635e+00F; - float v225 = -1.1543953381323635e+00F; - float v232 = 9.0655220171271012e-01F; - float v233 = -9.0655220171271012e-01F; - float v240 = 8.1857027294591811e-01F; - float v241 = -8.1857027294591811e-01F; - float v248 = 1.1971367726043427e+00F; - float v249 = -1.1971367726043427e+00F; - float v256 = 8.6131170741789742e-01F; - float v257 = -8.6131170741789742e-01F; - float v264 = 1.1091548438375507e+00F; - float v265 = -1.1091548438375507e+00F; - float v272 = 4.2741434471979367e-02F; - float v273 = -4.2741434471979367e-02F; - float v280 = -4.5240494294812715e-02F; - float v281 = 4.5240494294812715e-02F; - float v288 = 2.9058457089163264e-01F; - float v289 = -2.9058457089163264e-01F; - float32x2_t v291 = (float32x2_t){v4, v4}; - const float32x2_t *v826 = &v5[istride]; - int32_t *v1053 = &v6[ostride]; - float32x2_t v162 = (float32x2_t){v161, v161}; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v129 = 1.0833333333333333e+00F; + 
float v133 = -3.0046260628866578e-01F; + float v136 = 7.4927933062613905e-01F; + float v137 = -7.4927933062613905e-01F; + float v143 = 4.0100212832186721e-01F; + float v144 = -4.0100212832186721e-01F; + float v150 = 5.7514072947400308e-01F; + float v151 = -5.7514072947400308e-01F; + float v158 = 5.2422663952658211e-01F; + float v162 = 5.1652078062348972e-01F; + float v166 = 7.7058589030924258e-03F; + float v170 = 4.2763404682656941e-01F; + float v174 = 1.5180597207438440e-01F; + float v178 = 5.7944001890096386e-01F; + float v181 = 1.1543953381323635e+00F; + float v182 = -1.1543953381323635e+00F; + float v188 = 9.0655220171271012e-01F; + float v189 = -9.0655220171271012e-01F; + float v195 = 8.1857027294591811e-01F; + float v196 = -8.1857027294591811e-01F; + float v202 = 1.1971367726043427e+00F; + float v203 = -1.1971367726043427e+00F; + float v209 = 8.6131170741789742e-01F; + float v210 = -8.6131170741789742e-01F; + float v216 = 1.1091548438375507e+00F; + float v217 = -1.1091548438375507e+00F; + float v223 = 4.2741434471979367e-02F; + float v224 = -4.2741434471979367e-02F; + float v230 = -4.5240494294812715e-02F; + float v231 = 4.5240494294812715e-02F; + float v237 = 2.9058457089163264e-01F; + float v238 = -2.9058457089163264e-01F; + float32x2_t v240 = (float32x2_t){v4, v4}; + float32x2_t v115 = v5[0]; + float32x2_t v130 = (float32x2_t){v129, v129}; + float32x2_t v134 = (float32x2_t){v133, v133}; + float32x2_t v138 = (float32x2_t){v136, v137}; + float32x2_t v145 = (float32x2_t){v143, v144}; + float32x2_t v152 = (float32x2_t){v150, v151}; + float32x2_t v159 = (float32x2_t){v158, v158}; + float32x2_t v163 = (float32x2_t){v162, v162}; float32x2_t v167 = (float32x2_t){v166, v166}; - float32x2_t v172 = (float32x2_t){v170, v171}; - float32x2_t v180 = (float32x2_t){v178, v179}; - float32x2_t v188 = (float32x2_t){v186, v187}; - float32x2_t v196 = (float32x2_t){v195, v195}; - float32x2_t v201 = (float32x2_t){v200, v200}; - float32x2_t v206 = (float32x2_t){v205, v205}; - 
float32x2_t v211 = (float32x2_t){v210, v210}; - float32x2_t v216 = (float32x2_t){v215, v215}; - float32x2_t v221 = (float32x2_t){v220, v220}; - float32x2_t v226 = (float32x2_t){v224, v225}; - float32x2_t v234 = (float32x2_t){v232, v233}; - float32x2_t v242 = (float32x2_t){v240, v241}; - float32x2_t v250 = (float32x2_t){v248, v249}; - float32x2_t v258 = (float32x2_t){v256, v257}; - float32x2_t v266 = (float32x2_t){v264, v265}; - float32x2_t v274 = (float32x2_t){v272, v273}; - float32x2_t v282 = (float32x2_t){v280, v281}; - float32x2_t v290 = (float32x2_t){v288, v289}; - const float32x2_t *v935 = &v5[0]; - int32_t *v945 = &v6[0]; - float32x4_t v1057 = vld1q_f32((const float32_t *)v826); - float32x4_t v163 = vcombine_f32(v162, v162); - float32x4_t v168 = vcombine_f32(v167, v167); - float32x2_t v174 = vmul_f32(v291, v172); - float32x2_t v182 = vmul_f32(v291, v180); - float32x2_t v190 = vmul_f32(v291, v188); - float32x4_t v197 = vcombine_f32(v196, v196); - float32x4_t v202 = vcombine_f32(v201, v201); - float32x4_t v207 = vcombine_f32(v206, v206); - float32x4_t v212 = vcombine_f32(v211, v211); - float32x4_t v217 = vcombine_f32(v216, v216); - float32x4_t v222 = vcombine_f32(v221, v221); - float32x2_t v228 = vmul_f32(v291, v226); - float32x2_t v236 = vmul_f32(v291, v234); - float32x2_t v244 = vmul_f32(v291, v242); - float32x2_t v252 = vmul_f32(v291, v250); - float32x2_t v260 = vmul_f32(v291, v258); - float32x2_t v268 = vmul_f32(v291, v266); - float32x2_t v276 = vmul_f32(v291, v274); - float32x2_t v284 = vmul_f32(v291, v282); - float32x2_t v292 = vmul_f32(v291, v290); - const float32x2_t *v835 = &v5[istride * 12]; - const float32x2_t *v844 = &v5[istride * 2]; - const float32x2_t *v853 = &v5[istride * 11]; - const float32x2_t *v862 = &v5[istride * 3]; - const float32x2_t *v871 = &v5[istride * 10]; - const float32x2_t *v880 = &v5[istride * 4]; - const float32x2_t *v889 = &v5[istride * 9]; - const float32x2_t *v898 = &v5[istride * 5]; - const float32x2_t *v907 = &v5[istride * 
8]; - const float32x2_t *v916 = &v5[istride * 6]; - const float32x2_t *v925 = &v5[istride * 7]; - int32_t *v954 = &v6[ostride * 12]; - int32_t *v963 = &v6[ostride * 11]; - int32_t *v972 = &v6[ostride * 10]; - int32_t *v981 = &v6[ostride * 9]; - int32_t *v990 = &v6[ostride * 8]; - int32_t *v999 = &v6[ostride * 7]; - int32_t *v1008 = &v6[ostride * 6]; - int32_t *v1017 = &v6[ostride * 5]; - int32_t *v1026 = &v6[ostride * 4]; - int32_t *v1035 = &v6[ostride * 3]; - int32_t *v1044 = &v6[ostride * 2]; - float32x4_t v1081 = vld1q_f32((const float32_t *)v935); - float32x4_t v176 = vcombine_f32(v174, v174); - float32x4_t v184 = vcombine_f32(v182, v182); - float32x4_t v192 = vcombine_f32(v190, v190); - float32x4_t v230 = vcombine_f32(v228, v228); - float32x4_t v238 = vcombine_f32(v236, v236); - float32x4_t v246 = vcombine_f32(v244, v244); - float32x4_t v254 = vcombine_f32(v252, v252); - float32x4_t v262 = vcombine_f32(v260, v260); - float32x4_t v270 = vcombine_f32(v268, v268); - float32x4_t v278 = vcombine_f32(v276, v276); - float32x4_t v286 = vcombine_f32(v284, v284); - float32x4_t v294 = vcombine_f32(v292, v292); - float32x4_t v1059 = vld1q_f32((const float32_t *)v835); - float32x4_t v1061 = vld1q_f32((const float32_t *)v844); - float32x4_t v1063 = vld1q_f32((const float32_t *)v853); - float32x4_t v1065 = vld1q_f32((const float32_t *)v862); - float32x4_t v1067 = vld1q_f32((const float32_t *)v871); - float32x4_t v1069 = vld1q_f32((const float32_t *)v880); - float32x4_t v1071 = vld1q_f32((const float32_t *)v889); - float32x4_t v1073 = vld1q_f32((const float32_t *)v898); - float32x4_t v1075 = vld1q_f32((const float32_t *)v907); - float32x4_t v1077 = vld1q_f32((const float32_t *)v916); - float32x4_t v1079 = vld1q_f32((const float32_t *)v925); - float32x4_t v35 = vaddq_f32(v1057, v1059); - float32x4_t v50 = vaddq_f32(v1061, v1063); - float32x4_t v65 = vaddq_f32(v1065, v1067); - float32x4_t v80 = vaddq_f32(v1069, v1071); - float32x4_t v95 = vaddq_f32(v1073, v1075); - float32x4_t 
v110 = vaddq_f32(v1077, v1079); - float32x4_t v111 = vsubq_f32(v1057, v1059); - float32x4_t v112 = vsubq_f32(v1061, v1063); - float32x4_t v113 = vsubq_f32(v1065, v1067); - float32x4_t v114 = vsubq_f32(v1069, v1071); - float32x4_t v115 = vsubq_f32(v1073, v1075); - float32x4_t v116 = vsubq_f32(v1077, v1079); - float32x4_t v117 = vaddq_f32(v50, v95); - float32x4_t v119 = vaddq_f32(v35, v65); - float32x4_t v122 = vaddq_f32(v112, v115); - float32x4_t v124 = vaddq_f32(v111, v113); - float32x4_t v126 = vsubq_f32(v50, v110); - float32x4_t v127 = vsubq_f32(v65, v80); - float32x4_t v128 = vsubq_f32(v35, v80); - float32x4_t v129 = vsubq_f32(v95, v110); - float32x4_t v134 = vsubq_f32(v112, v116); - float32x4_t v135 = vsubq_f32(v111, v113); - float32x4_t v136 = vsubq_f32(v112, v115); - float32x4_t v137 = vaddq_f32(v111, v114); - float32x4_t v138 = vsubq_f32(v115, v116); - float32x4_t v139 = vaddq_f32(v113, v114); - float32x4_t v118 = vaddq_f32(v117, v110); - float32x4_t v120 = vaddq_f32(v119, v80); - float32x4_t v123 = vaddq_f32(v122, v116); - float32x4_t v125 = vsubq_f32(v124, v114); - float32x4_t v130 = vsubq_f32(v126, v127); - float32x4_t v131 = vsubq_f32(v128, v129); - float32x4_t v132 = vaddq_f32(v126, v127); - float32x4_t v133 = vaddq_f32(v128, v129); - float32x4_t v152 = vaddq_f32(v134, v135); - float32x4_t v153 = vaddq_f32(v136, v137); - float32x4_t v154 = vsubq_f32(v138, v139); - float32x4_t v229 = vrev64q_f32(v134); - float32x4_t v237 = vrev64q_f32(v135); - float32x4_t v253 = vrev64q_f32(v136); - float32x4_t v261 = vrev64q_f32(v137); - float32x4_t v277 = vrev64q_f32(v138); - float32x4_t v285 = vrev64q_f32(v139); - float32x4_t v121 = vaddq_f32(v118, v120); - float32x4_t v148 = vsubq_f32(v120, v118); - float32x4_t v149 = vaddq_f32(v123, v125); - float32x4_t v150 = vaddq_f32(v130, v131); - float32x4_t v151 = vsubq_f32(v132, v133); - float32x4_t v175 = vrev64q_f32(v123); - float32x4_t v183 = vrev64q_f32(v125); - float32x4_t v198 = vmulq_f32(v130, v197); - float32x4_t v203 
= vmulq_f32(v131, v202); - float32x4_t v213 = vmulq_f32(v132, v212); - float32x4_t v218 = vmulq_f32(v133, v217); - float32x4_t v231 = vmulq_f32(v229, v230); - float32x4_t v239 = vmulq_f32(v237, v238); - float32x4_t v245 = vrev64q_f32(v152); - float32x4_t v255 = vmulq_f32(v253, v254); - float32x4_t v263 = vmulq_f32(v261, v262); - float32x4_t v269 = vrev64q_f32(v153); - float32x4_t v279 = vmulq_f32(v277, v278); - float32x4_t v287 = vmulq_f32(v285, v286); - float32x4_t v293 = vrev64q_f32(v154); - float32x4_t v147 = vaddq_f32(v1081, v121); - float32x4_t v164 = vmulq_f32(v121, v163); - float32x4_t v169 = vmulq_f32(v148, v168); - float32x4_t v177 = vmulq_f32(v175, v176); - float32x4_t v185 = vmulq_f32(v183, v184); - float32x4_t v191 = vrev64q_f32(v149); - float32x4_t v208 = vmulq_f32(v150, v207); - float32x4_t v223 = vmulq_f32(v151, v222); - float32x4_t v247 = vmulq_f32(v245, v246); - float32x4_t v271 = vmulq_f32(v269, v270); - float32x4_t v295 = vmulq_f32(v293, v294); - float32x4_t v297 = vaddq_f32(v203, v198); - float32x4_t v193 = vmulq_f32(v191, v192); - float32x4_t v296 = vsubq_f32(v147, v164); - float32x4_t v298 = vsubq_f32(v297, v169); - float32x4_t v299 = vaddq_f32(v203, v208); - float32x4_t v301 = vsubq_f32(v208, v198); - float32x4_t v309 = vsubq_f32(v231, v247); - float32x4_t v310 = vsubq_f32(v239, v247); - float32x4_t v311 = vsubq_f32(v255, v271); - float32x4_t v312 = vsubq_f32(v263, v271); - float32x4_t v313 = vsubq_f32(v279, v295); - float32x4_t v314 = vaddq_f32(v287, v295); - int16x4_t v349 = vqmovn_s32(vcvtq_n_s32_f32(v147, 15)); - float32x4_t v300 = vaddq_f32(v299, v169); - float32x4_t v302 = vsubq_f32(v301, v169); - float32x4_t v303 = vaddq_f32(v296, v213); - float32x4_t v305 = vsubq_f32(v296, v218); - float32x4_t v307 = vsubq_f32(v296, v213); - float32x4_t v315 = vsubq_f32(v177, v193); - float32x4_t v316 = vsubq_f32(v185, v193); - float32x4_t v327 = vaddq_f32(v309, v313); - float32x4_t v329 = vaddq_f32(v311, v313); - float32x4_t v331 = vsubq_f32(v310, 
v314); - vst1_s16((int16_t *)v945, v349); - float32x4_t v304 = vaddq_f32(v303, v218); - float32x4_t v306 = vsubq_f32(v305, v223); - float32x4_t v308 = vaddq_f32(v307, v223); - float32x4_t v323 = vsubq_f32(v316, v309); - float32x4_t v325 = vsubq_f32(v314, v315); - float32x4_t v328 = vaddq_f32(v327, v316); - float32x4_t v330 = vsubq_f32(v329, v316); - float32x4_t v332 = vsubq_f32(v331, v315); - float32x4_t v333 = vaddq_f32(v315, v310); - float32x4_t v317 = vaddq_f32(v298, v304); - float32x4_t v318 = vaddq_f32(v300, v306); - float32x4_t v319 = vsubq_f32(v306, v300); - float32x4_t v320 = vaddq_f32(v302, v308); - float32x4_t v321 = vsubq_f32(v304, v298); - float32x4_t v322 = vsubq_f32(v308, v302); - float32x4_t v324 = vaddq_f32(v323, v311); - float32x4_t v326 = vsubq_f32(v325, v312); - float32x4_t v334 = vsubq_f32(v333, v312); - float32x4_t v335 = vsubq_f32(v317, v324); - float32x4_t v336 = vaddq_f32(v318, v326); - float32x4_t v337 = vsubq_f32(v319, v328); - float32x4_t v338 = vsubq_f32(v320, v330); - float32x4_t v339 = vaddq_f32(v321, v332); - float32x4_t v340 = vsubq_f32(v322, v334); - float32x4_t v341 = vaddq_f32(v322, v334); - float32x4_t v342 = vsubq_f32(v321, v332); - float32x4_t v343 = vaddq_f32(v320, v330); - float32x4_t v344 = vaddq_f32(v319, v328); - float32x4_t v345 = vsubq_f32(v318, v326); - float32x4_t v346 = vaddq_f32(v317, v324); - int16x4_t v357 = vqmovn_s32(vcvtq_n_s32_f32(v335, 15)); - int16x4_t v365 = vqmovn_s32(vcvtq_n_s32_f32(v336, 15)); - int16x4_t v373 = vqmovn_s32(vcvtq_n_s32_f32(v337, 15)); - int16x4_t v381 = vqmovn_s32(vcvtq_n_s32_f32(v338, 15)); - int16x4_t v389 = vqmovn_s32(vcvtq_n_s32_f32(v339, 15)); - int16x4_t v397 = vqmovn_s32(vcvtq_n_s32_f32(v340, 15)); - int16x4_t v405 = vqmovn_s32(vcvtq_n_s32_f32(v341, 15)); - int16x4_t v413 = vqmovn_s32(vcvtq_n_s32_f32(v342, 15)); - int16x4_t v421 = vqmovn_s32(vcvtq_n_s32_f32(v343, 15)); - int16x4_t v429 = vqmovn_s32(vcvtq_n_s32_f32(v344, 15)); - int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v345, 15)); 
- int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v346, 15)); - vst1_s16((int16_t *)v954, v357); - vst1_s16((int16_t *)v963, v365); - vst1_s16((int16_t *)v972, v373); - vst1_s16((int16_t *)v981, v381); - vst1_s16((int16_t *)v990, v389); - vst1_s16((int16_t *)v999, v397); - vst1_s16((int16_t *)v1008, v405); - vst1_s16((int16_t *)v1017, v413); - vst1_s16((int16_t *)v1026, v421); - vst1_s16((int16_t *)v1035, v429); - vst1_s16((int16_t *)v1044, v437); - vst1_s16((int16_t *)v1053, v445); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v451 * 2; j < howmany; j += 1) { - float32x2_t v463 = v5[istride]; - float v572 = 1.0833333333333333e+00F; - float v576 = -3.0046260628866578e-01F; - float v579 = 7.4927933062613905e-01F; - float v580 = -7.4927933062613905e-01F; - float v586 = 4.0100212832186721e-01F; - float v587 = -4.0100212832186721e-01F; - float v593 = 5.7514072947400308e-01F; - float v594 = -5.7514072947400308e-01F; - float v601 = 5.2422663952658211e-01F; - float v605 = 5.1652078062348972e-01F; - float v609 = 7.7058589030924258e-03F; - float v613 = 4.2763404682656941e-01F; - float v617 = 1.5180597207438440e-01F; - float v621 = 5.7944001890096386e-01F; - float v624 = 1.1543953381323635e+00F; - float v625 = -1.1543953381323635e+00F; - float v631 = 9.0655220171271012e-01F; - float v632 = -9.0655220171271012e-01F; - float v638 = 8.1857027294591811e-01F; - float v639 = -8.1857027294591811e-01F; - float v645 = 1.1971367726043427e+00F; - float v646 = -1.1971367726043427e+00F; - float v652 = 8.6131170741789742e-01F; - float v653 = -8.6131170741789742e-01F; - float v659 = 1.1091548438375507e+00F; - float v660 = -1.1091548438375507e+00F; - float v666 = 4.2741434471979367e-02F; - float v667 = -4.2741434471979367e-02F; - float v673 = -4.5240494294812715e-02F; - float v674 = 4.5240494294812715e-02F; - float v680 = 2.9058457089163264e-01F; - float v681 = -2.9058457089163264e-01F; - float32x2_t v683 = (float32x2_t){v4, v4}; - float32x2_t v558 = v5[0]; - float32x2_t v573 = 
(float32x2_t){v572, v572}; - float32x2_t v577 = (float32x2_t){v576, v576}; - float32x2_t v581 = (float32x2_t){v579, v580}; - float32x2_t v588 = (float32x2_t){v586, v587}; - float32x2_t v595 = (float32x2_t){v593, v594}; - float32x2_t v602 = (float32x2_t){v601, v601}; - float32x2_t v606 = (float32x2_t){v605, v605}; - float32x2_t v610 = (float32x2_t){v609, v609}; - float32x2_t v614 = (float32x2_t){v613, v613}; - float32x2_t v618 = (float32x2_t){v617, v617}; - float32x2_t v622 = (float32x2_t){v621, v621}; - float32x2_t v626 = (float32x2_t){v624, v625}; - float32x2_t v633 = (float32x2_t){v631, v632}; - float32x2_t v640 = (float32x2_t){v638, v639}; - float32x2_t v647 = (float32x2_t){v645, v646}; - float32x2_t v654 = (float32x2_t){v652, v653}; - float32x2_t v661 = (float32x2_t){v659, v660}; - float32x2_t v668 = (float32x2_t){v666, v667}; - float32x2_t v675 = (float32x2_t){v673, v674}; - float32x2_t v682 = (float32x2_t){v680, v681}; - float32x2_t v468 = v5[istride * 12]; - float32x2_t v474 = v5[istride * 2]; - float32x2_t v479 = v5[istride * 11]; - float32x2_t v485 = v5[istride * 3]; - float32x2_t v490 = v5[istride * 10]; - float32x2_t v496 = v5[istride * 4]; - float32x2_t v501 = v5[istride * 9]; - float32x2_t v507 = v5[istride * 5]; - float32x2_t v512 = v5[istride * 8]; - float32x2_t v518 = v5[istride * 6]; - float32x2_t v523 = v5[istride * 7]; - float32x2_t v583 = vmul_f32(v683, v581); - float32x2_t v590 = vmul_f32(v683, v588); - float32x2_t v597 = vmul_f32(v683, v595); - float32x2_t v628 = vmul_f32(v683, v626); - float32x2_t v635 = vmul_f32(v683, v633); - float32x2_t v642 = vmul_f32(v683, v640); - float32x2_t v649 = vmul_f32(v683, v647); - float32x2_t v656 = vmul_f32(v683, v654); - float32x2_t v663 = vmul_f32(v683, v661); - float32x2_t v670 = vmul_f32(v683, v668); - float32x2_t v677 = vmul_f32(v683, v675); - float32x2_t v684 = vmul_f32(v683, v682); - float32x2_t v469 = vadd_f32(v463, v468); - float32x2_t v480 = vadd_f32(v474, v479); - float32x2_t v491 = vadd_f32(v485, 
v490); - float32x2_t v502 = vadd_f32(v496, v501); - float32x2_t v513 = vadd_f32(v507, v512); - float32x2_t v524 = vadd_f32(v518, v523); - float32x2_t v525 = vsub_f32(v463, v468); - float32x2_t v526 = vsub_f32(v474, v479); - float32x2_t v527 = vsub_f32(v485, v490); - float32x2_t v528 = vsub_f32(v496, v501); - float32x2_t v529 = vsub_f32(v507, v512); - float32x2_t v530 = vsub_f32(v518, v523); - float32x2_t v531 = vadd_f32(v480, v513); - float32x2_t v533 = vadd_f32(v469, v491); - float32x2_t v536 = vadd_f32(v526, v529); - float32x2_t v538 = vadd_f32(v525, v527); - float32x2_t v540 = vsub_f32(v480, v524); - float32x2_t v541 = vsub_f32(v491, v502); - float32x2_t v542 = vsub_f32(v469, v502); - float32x2_t v543 = vsub_f32(v513, v524); - float32x2_t v548 = vsub_f32(v526, v530); - float32x2_t v549 = vsub_f32(v525, v527); - float32x2_t v550 = vsub_f32(v526, v529); - float32x2_t v551 = vadd_f32(v525, v528); - float32x2_t v552 = vsub_f32(v529, v530); - float32x2_t v553 = vadd_f32(v527, v528); - float32x2_t v532 = vadd_f32(v531, v524); - float32x2_t v534 = vadd_f32(v533, v502); - float32x2_t v537 = vadd_f32(v536, v530); - float32x2_t v539 = vsub_f32(v538, v528); - float32x2_t v544 = vsub_f32(v540, v541); - float32x2_t v545 = vsub_f32(v542, v543); - float32x2_t v546 = vadd_f32(v540, v541); - float32x2_t v547 = vadd_f32(v542, v543); - float32x2_t v564 = vadd_f32(v548, v549); - float32x2_t v565 = vadd_f32(v550, v551); - float32x2_t v566 = vsub_f32(v552, v553); - float32x2_t v629 = vrev64_f32(v548); - float32x2_t v636 = vrev64_f32(v549); - float32x2_t v650 = vrev64_f32(v550); - float32x2_t v657 = vrev64_f32(v551); - float32x2_t v671 = vrev64_f32(v552); - float32x2_t v678 = vrev64_f32(v553); - float32x2_t v535 = vadd_f32(v532, v534); - float32x2_t v560 = vsub_f32(v534, v532); - float32x2_t v561 = vadd_f32(v537, v539); - float32x2_t v562 = vadd_f32(v544, v545); - float32x2_t v563 = vsub_f32(v546, v547); - float32x2_t v584 = vrev64_f32(v537); - float32x2_t v591 = vrev64_f32(v539); - 
float32x2_t v603 = vmul_f32(v544, v602); - float32x2_t v607 = vmul_f32(v545, v606); - float32x2_t v615 = vmul_f32(v546, v614); - float32x2_t v619 = vmul_f32(v547, v618); - float32x2_t v630 = vmul_f32(v629, v628); - float32x2_t v637 = vmul_f32(v636, v635); - float32x2_t v643 = vrev64_f32(v564); - float32x2_t v651 = vmul_f32(v650, v649); - float32x2_t v658 = vmul_f32(v657, v656); - float32x2_t v664 = vrev64_f32(v565); - float32x2_t v672 = vmul_f32(v671, v670); - float32x2_t v679 = vmul_f32(v678, v677); - float32x2_t v685 = vrev64_f32(v566); - float32x2_t v559 = vadd_f32(v558, v535); - float32x2_t v574 = vmul_f32(v535, v573); - float32x2_t v578 = vmul_f32(v560, v577); - float32x2_t v585 = vmul_f32(v584, v583); - float32x2_t v592 = vmul_f32(v591, v590); - float32x2_t v598 = vrev64_f32(v561); - float32x2_t v611 = vmul_f32(v562, v610); - float32x2_t v623 = vmul_f32(v563, v622); - float32x2_t v644 = vmul_f32(v643, v642); - float32x2_t v665 = vmul_f32(v664, v663); - float32x2_t v686 = vmul_f32(v685, v684); - float32x2_t v688 = vadd_f32(v607, v603); - float32x2_t v599 = vmul_f32(v598, v597); - float32x2_t v687 = vsub_f32(v559, v574); - float32x2_t v689 = vsub_f32(v688, v578); - float32x2_t v690 = vadd_f32(v607, v611); - float32x2_t v692 = vsub_f32(v611, v603); - float32x2_t v700 = vsub_f32(v630, v644); - float32x2_t v701 = vsub_f32(v637, v644); - float32x2_t v702 = vsub_f32(v651, v665); - float32x2_t v703 = vsub_f32(v658, v665); - float32x2_t v704 = vsub_f32(v672, v686); - float32x2_t v705 = vadd_f32(v679, v686); - int16x4_t v740 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v559, 15), (int32x2_t){0, 0})); - float32x2_t v691 = vadd_f32(v690, v578); - float32x2_t v693 = vsub_f32(v692, v578); - float32x2_t v694 = vadd_f32(v687, v615); - float32x2_t v696 = vsub_f32(v687, v619); - float32x2_t v698 = vsub_f32(v687, v615); - float32x2_t v706 = vsub_f32(v585, v599); - float32x2_t v707 = vsub_f32(v592, v599); - float32x2_t v718 = vadd_f32(v700, v704); - float32x2_t v720 = 
vadd_f32(v702, v704); - float32x2_t v722 = vsub_f32(v701, v705); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v740), 0); - float32x2_t v695 = vadd_f32(v694, v619); - float32x2_t v697 = vsub_f32(v696, v623); - float32x2_t v699 = vadd_f32(v698, v623); - float32x2_t v714 = vsub_f32(v707, v700); - float32x2_t v716 = vsub_f32(v705, v706); - float32x2_t v719 = vadd_f32(v718, v707); - float32x2_t v721 = vsub_f32(v720, v707); - float32x2_t v723 = vsub_f32(v722, v706); - float32x2_t v724 = vadd_f32(v706, v701); - float32x2_t v708 = vadd_f32(v689, v695); - float32x2_t v709 = vadd_f32(v691, v697); - float32x2_t v710 = vsub_f32(v697, v691); - float32x2_t v711 = vadd_f32(v693, v699); - float32x2_t v712 = vsub_f32(v695, v689); - float32x2_t v713 = vsub_f32(v699, v693); - float32x2_t v715 = vadd_f32(v714, v702); - float32x2_t v717 = vsub_f32(v716, v703); - float32x2_t v725 = vsub_f32(v724, v703); - float32x2_t v726 = vsub_f32(v708, v715); - float32x2_t v727 = vadd_f32(v709, v717); - float32x2_t v728 = vsub_f32(v710, v719); - float32x2_t v729 = vsub_f32(v711, v721); - float32x2_t v730 = vadd_f32(v712, v723); - float32x2_t v731 = vsub_f32(v713, v725); - float32x2_t v732 = vadd_f32(v713, v725); - float32x2_t v733 = vsub_f32(v712, v723); - float32x2_t v734 = vadd_f32(v711, v721); - float32x2_t v735 = vadd_f32(v710, v719); - float32x2_t v736 = vsub_f32(v709, v717); - float32x2_t v737 = vadd_f32(v708, v715); - int16x4_t v746 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v726, 15), (int32x2_t){0, 0})); - int16x4_t v752 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v727, 15), (int32x2_t){0, 0})); - int16x4_t v758 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v728, 15), (int32x2_t){0, 0})); - int16x4_t v764 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v729, 15), (int32x2_t){0, 0})); - int16x4_t v770 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v730, 15), (int32x2_t){0, 0})); - int16x4_t v776 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v731, 15), (int32x2_t){0, 0})); - int16x4_t v782 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v732, 15), (int32x2_t){0, 0})); - int16x4_t v788 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v733, 15), (int32x2_t){0, 0})); - int16x4_t v794 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v734, 15), (int32x2_t){0, 0})); - int16x4_t v800 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v735, 15), (int32x2_t){0, 0})); - int16x4_t v806 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v736, 15), (int32x2_t){0, 0})); - int16x4_t v812 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v737, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v746), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v752), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v758), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v764), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v770), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v776), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v782), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v788), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v794), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v800), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v806), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v812), 0); + float32x2_t v171 = (float32x2_t){v170, v170}; + float32x2_t v175 = (float32x2_t){v174, v174}; + float32x2_t v179 = (float32x2_t){v178, v178}; + float32x2_t v183 = (float32x2_t){v181, v182}; + float32x2_t v190 = (float32x2_t){v188, v189}; + float32x2_t v197 = (float32x2_t){v195, v196}; + float32x2_t v204 = (float32x2_t){v202, v203}; + float32x2_t v211 = (float32x2_t){v209, v210}; + float32x2_t v218 = (float32x2_t){v216, v217}; + float32x2_t v225 = (float32x2_t){v223, v224}; + float32x2_t v232 = (float32x2_t){v230, v231}; + float32x2_t v239 = (float32x2_t){v237, v238}; + float32x2_t v25 = v5[istride * 12]; + float32x2_t v31 = v5[istride * 2]; + float32x2_t v36 = 
v5[istride * 11]; + float32x2_t v42 = v5[istride * 3]; + float32x2_t v47 = v5[istride * 10]; + float32x2_t v53 = v5[istride * 4]; + float32x2_t v58 = v5[istride * 9]; + float32x2_t v64 = v5[istride * 5]; + float32x2_t v69 = v5[istride * 8]; + float32x2_t v75 = v5[istride * 6]; + float32x2_t v80 = v5[istride * 7]; + float32x2_t v140 = vmul_f32(v240, v138); + float32x2_t v147 = vmul_f32(v240, v145); + float32x2_t v154 = vmul_f32(v240, v152); + float32x2_t v185 = vmul_f32(v240, v183); + float32x2_t v192 = vmul_f32(v240, v190); + float32x2_t v199 = vmul_f32(v240, v197); + float32x2_t v206 = vmul_f32(v240, v204); + float32x2_t v213 = vmul_f32(v240, v211); + float32x2_t v220 = vmul_f32(v240, v218); + float32x2_t v227 = vmul_f32(v240, v225); + float32x2_t v234 = vmul_f32(v240, v232); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v37 = vadd_f32(v31, v36); + float32x2_t v48 = vadd_f32(v42, v47); + float32x2_t v59 = vadd_f32(v53, v58); + float32x2_t v70 = vadd_f32(v64, v69); + float32x2_t v81 = vadd_f32(v75, v80); + float32x2_t v82 = vsub_f32(v20, v25); + float32x2_t v83 = vsub_f32(v31, v36); + float32x2_t v84 = vsub_f32(v42, v47); + float32x2_t v85 = vsub_f32(v53, v58); + float32x2_t v86 = vsub_f32(v64, v69); + float32x2_t v87 = vsub_f32(v75, v80); + float32x2_t v88 = vadd_f32(v37, v70); + float32x2_t v90 = vadd_f32(v26, v48); + float32x2_t v93 = vadd_f32(v83, v86); + float32x2_t v95 = vadd_f32(v82, v84); + float32x2_t v97 = vsub_f32(v37, v81); + float32x2_t v98 = vsub_f32(v48, v59); + float32x2_t v99 = vsub_f32(v26, v59); + float32x2_t v100 = vsub_f32(v70, v81); + float32x2_t v105 = vsub_f32(v83, v87); + float32x2_t v106 = vsub_f32(v82, v84); + float32x2_t v107 = vsub_f32(v83, v86); + float32x2_t v108 = vadd_f32(v82, v85); + float32x2_t v109 = vsub_f32(v86, v87); + float32x2_t v110 = vadd_f32(v84, v85); + float32x2_t v89 = vadd_f32(v88, v81); + float32x2_t v91 = vadd_f32(v90, v59); + float32x2_t v94 = vadd_f32(v93, v87); + 
float32x2_t v96 = vsub_f32(v95, v85); + float32x2_t v101 = vsub_f32(v97, v98); + float32x2_t v102 = vsub_f32(v99, v100); + float32x2_t v103 = vadd_f32(v97, v98); + float32x2_t v104 = vadd_f32(v99, v100); + float32x2_t v121 = vadd_f32(v105, v106); + float32x2_t v122 = vadd_f32(v107, v108); + float32x2_t v123 = vsub_f32(v109, v110); + float32x2_t v186 = vrev64_f32(v105); + float32x2_t v193 = vrev64_f32(v106); + float32x2_t v207 = vrev64_f32(v107); + float32x2_t v214 = vrev64_f32(v108); + float32x2_t v228 = vrev64_f32(v109); + float32x2_t v235 = vrev64_f32(v110); + float32x2_t v92 = vadd_f32(v89, v91); + float32x2_t v117 = vsub_f32(v91, v89); + float32x2_t v118 = vadd_f32(v94, v96); + float32x2_t v119 = vadd_f32(v101, v102); + float32x2_t v120 = vsub_f32(v103, v104); + float32x2_t v141 = vrev64_f32(v94); + float32x2_t v148 = vrev64_f32(v96); + float32x2_t v160 = vmul_f32(v101, v159); + float32x2_t v164 = vmul_f32(v102, v163); + float32x2_t v172 = vmul_f32(v103, v171); + float32x2_t v176 = vmul_f32(v104, v175); + float32x2_t v187 = vmul_f32(v186, v185); + float32x2_t v194 = vmul_f32(v193, v192); + float32x2_t v200 = vrev64_f32(v121); + float32x2_t v208 = vmul_f32(v207, v206); + float32x2_t v215 = vmul_f32(v214, v213); + float32x2_t v221 = vrev64_f32(v122); + float32x2_t v229 = vmul_f32(v228, v227); + float32x2_t v236 = vmul_f32(v235, v234); + float32x2_t v242 = vrev64_f32(v123); + float32x2_t v116 = vadd_f32(v115, v92); + float32x2_t v131 = vmul_f32(v92, v130); + float32x2_t v135 = vmul_f32(v117, v134); + float32x2_t v142 = vmul_f32(v141, v140); + float32x2_t v149 = vmul_f32(v148, v147); + float32x2_t v155 = vrev64_f32(v118); + float32x2_t v168 = vmul_f32(v119, v167); + float32x2_t v180 = vmul_f32(v120, v179); + float32x2_t v201 = vmul_f32(v200, v199); + float32x2_t v222 = vmul_f32(v221, v220); + float32x2_t v243 = vmul_f32(v242, v241); + float32x2_t v245 = vadd_f32(v164, v160); + float32x2_t v156 = vmul_f32(v155, v154); + float32x2_t v244 = vsub_f32(v116, v131); + 
float32x2_t v246 = vsub_f32(v245, v135); + float32x2_t v247 = vadd_f32(v164, v168); + float32x2_t v249 = vsub_f32(v168, v160); + float32x2_t v257 = vsub_f32(v187, v201); + float32x2_t v258 = vsub_f32(v194, v201); + float32x2_t v259 = vsub_f32(v208, v222); + float32x2_t v260 = vsub_f32(v215, v222); + float32x2_t v261 = vsub_f32(v229, v243); + float32x2_t v262 = vadd_f32(v236, v243); + int16x4_t v297 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v116, 15), (int32x2_t){0, 0})); + float32x2_t v248 = vadd_f32(v247, v135); + float32x2_t v250 = vsub_f32(v249, v135); + float32x2_t v251 = vadd_f32(v244, v172); + float32x2_t v253 = vsub_f32(v244, v176); + float32x2_t v255 = vsub_f32(v244, v172); + float32x2_t v263 = vsub_f32(v142, v156); + float32x2_t v264 = vsub_f32(v149, v156); + float32x2_t v275 = vadd_f32(v257, v261); + float32x2_t v277 = vadd_f32(v259, v261); + float32x2_t v279 = vsub_f32(v258, v262); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v297), 0); + float32x2_t v252 = vadd_f32(v251, v176); + float32x2_t v254 = vsub_f32(v253, v180); + float32x2_t v256 = vadd_f32(v255, v180); + float32x2_t v271 = vsub_f32(v264, v257); + float32x2_t v273 = vsub_f32(v262, v263); + float32x2_t v276 = vadd_f32(v275, v264); + float32x2_t v278 = vsub_f32(v277, v264); + float32x2_t v280 = vsub_f32(v279, v263); + float32x2_t v281 = vadd_f32(v263, v258); + float32x2_t v265 = vadd_f32(v246, v252); + float32x2_t v266 = vadd_f32(v248, v254); + float32x2_t v267 = vsub_f32(v254, v248); + float32x2_t v268 = vadd_f32(v250, v256); + float32x2_t v269 = vsub_f32(v252, v246); + float32x2_t v270 = vsub_f32(v256, v250); + float32x2_t v272 = vadd_f32(v271, v259); + float32x2_t v274 = vsub_f32(v273, v260); + float32x2_t v282 = vsub_f32(v281, v260); + float32x2_t v283 = vsub_f32(v265, v272); + float32x2_t v284 = vadd_f32(v266, v274); + float32x2_t v285 = vsub_f32(v267, v276); + float32x2_t v286 = vsub_f32(v268, v278); + float32x2_t v287 = vadd_f32(v269, v280); + float32x2_t v288 = vsub_f32(v270, v282); + 
float32x2_t v289 = vadd_f32(v270, v282); + float32x2_t v290 = vsub_f32(v269, v280); + float32x2_t v291 = vadd_f32(v268, v278); + float32x2_t v292 = vadd_f32(v267, v276); + float32x2_t v293 = vsub_f32(v266, v274); + float32x2_t v294 = vadd_f32(v265, v272); + int16x4_t v303 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v283, 15), (int32x2_t){0, 0})); + int16x4_t v309 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v284, 15), (int32x2_t){0, 0})); + int16x4_t v315 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v285, 15), (int32x2_t){0, 0})); + int16x4_t v321 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v286, 15), (int32x2_t){0, 0})); + int16x4_t v327 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v287, 15), (int32x2_t){0, 0})); + int16x4_t v333 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v288, 15), (int32x2_t){0, 0})); + int16x4_t v339 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v289, 15), (int32x2_t){0, 0})); + int16x4_t v345 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v290, 15), (int32x2_t){0, 0})); + int16x4_t v351 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v291, 15), (int32x2_t){0, 0})); + int16x4_t v357 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v292, 15), (int32x2_t){0, 0})); + int16x4_t v363 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v293, 15), (int32x2_t){0, 0})); + int16x4_t v369 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v294, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v303), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v309), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v315), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v321), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v327), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v333), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v339), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v345), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v351), 0); + v6[ostride * 3] = 
vget_lane_s32(vreinterpret_s32_s16(v357), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v363), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v369), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -4954,155 +3201,89 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v532)[0])); svfloat32_t v716 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v541)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v694), "w"(v696)); - svfloat32_t v47; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v47) : "w"(v698), "w"(v700)); - svfloat32_t v62; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v702), "w"(v704)); - svfloat32_t v77; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v77) : "w"(v706), "w"(v708)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v710), "w"(v712)); - svfloat32_t v107; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v714), "w"(v716)); - svfloat32_t v108; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v694), "w"(v696)); - svfloat32_t v109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v698), "w"(v700)); - svfloat32_t v110; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v702), "w"(v704)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v706), "w"(v708)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v710), "w"(v712)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v714), "w"(v716)); - svfloat32_t v114; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v47), "w"(v92)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v32), "w"(v62)); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v109), "w"(v112)); - svfloat32_t v121; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v108), "w"(v110)); - svfloat32_t v123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v47), "w"(v107)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v124) : "w"(v62), "w"(v77)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v32), "w"(v77)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v92), "w"(v107)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v109), "w"(v113)); - svfloat32_t v132; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v108), "w"(v110)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v109), "w"(v112)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v108), "w"(v111)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v112), "w"(v113)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v110), "w"(v111)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v114), "w"(v107)); - svfloat32_t v117; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v116), "w"(v77)); - svfloat32_t v120; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v119), "w"(v113)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v121), "w"(v111)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v123), "w"(v124)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v125), "w"(v126)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v123), "w"(v124)); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v125), "w"(v126)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v131), "w"(v132)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v133), "w"(v134)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v135), "w"(v136)); - svfloat32_t zero224; - asm volatile("mov %0.s, #0" : "=w"(zero224)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v694, v696); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v62 = svadd_f32_x(svptrue_b32(), v702, v704); + svfloat32_t v77 = 
svadd_f32_x(svptrue_b32(), v706, v708); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v710, v712); + svfloat32_t v107 = svadd_f32_x(svptrue_b32(), v714, v716); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v694, v696); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v698, v700); + svfloat32_t v110 = svsub_f32_x(svptrue_b32(), v702, v704); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v706, v708); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v710, v712); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v714, v716); + svfloat32_t v114 = svadd_f32_x(svptrue_b32(), v47, v92); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v32, v62); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v109, v112); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v47, v107); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v62, v77); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v32, v77); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v92, v107); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v109, v113); + svfloat32_t v132 = svsub_f32_x(svptrue_b32(), v108, v110); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v109, v112); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v108, v111); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v112, v113); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v110, v111); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v114, v107); + svfloat32_t v117 = svadd_f32_x(svptrue_b32(), v116, v77); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v119, v113); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v121, v111); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v123, v124); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v125, v126); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v123, v124); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v125, v126); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v131, v132); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v133, v134); + svfloat32_t v151 
= svsub_f32_x(svptrue_b32(), v135, v136); + svfloat32_t zero224 = svdup_n_f32(0); svfloat32_t v224 = svcmla_f32_x(pred_full, zero224, v566, v131, 90); - svfloat32_t zero231; - asm volatile("mov %0.s, #0" : "=w"(zero231)); + svfloat32_t zero231 = svdup_n_f32(0); svfloat32_t v231 = svcmla_f32_x(pred_full, zero231, v567, v132, 90); - svfloat32_t zero245; - asm volatile("mov %0.s, #0" : "=w"(zero245)); + svfloat32_t zero245 = svdup_n_f32(0); svfloat32_t v245 = svcmla_f32_x(pred_full, zero245, v569, v133, 90); - svfloat32_t zero252; - asm volatile("mov %0.s, #0" : "=w"(zero252)); + svfloat32_t zero252 = svdup_n_f32(0); svfloat32_t v252 = svcmla_f32_x(pred_full, zero252, v570, v134, 90); - svfloat32_t zero266; - asm volatile("mov %0.s, #0" : "=w"(zero266)); + svfloat32_t zero266 = svdup_n_f32(0); svfloat32_t v266 = svcmla_f32_x(pred_full, zero266, v572, v135, 90); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v115), "w"(v117)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v117), "w"(v115)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v120), "w"(v122)); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v127), "w"(v128)); - svfloat32_t v148; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v129), "w"(v130)); - svfloat32_t zero173; - asm volatile("mov %0.s, #0" : "=w"(zero173)); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v115, v117); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v117, v115); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v120, v122); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v127, v128); + svfloat32_t v148 = svsub_f32_x(svptrue_b32(), v129, v130); + svfloat32_t zero173 = svdup_n_f32(0); svfloat32_t v173 = svcmla_f32_x(pred_full, zero173, v557, v120, 90); - svfloat32_t zero180; - asm volatile("mov %0.s, #0" : "=w"(zero180)); + svfloat32_t zero180 = svdup_n_f32(0); svfloat32_t v180 = svcmla_f32_x(pred_full, zero180, v558, v122, 90); - svfloat32_t v192; - 
asm("fmul %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v127), "w"(v560)); - svfloat32_t zero238; - asm volatile("mov %0.s, #0" : "=w"(zero238)); + svfloat32_t v192 = svmul_f32_x(svptrue_b32(), v127, v560); + svfloat32_t zero238 = svdup_n_f32(0); svfloat32_t v238 = svcmla_f32_x(pred_full, zero238, v568, v149, 90); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x(pred_full, zero259, v571, v150, 90); - svfloat32_t zero280; - asm volatile("mov %0.s, #0" : "=w"(zero280)); + svfloat32_t zero280 = svdup_n_f32(0); svfloat32_t v280 = svcmla_f32_x(pred_full, zero280, v574, v151, 90); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v718), "w"(v118)); - svfloat32_t zero187; - asm volatile("mov %0.s, #0" : "=w"(zero187)); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v718, v118); + svfloat32_t zero187 = svdup_n_f32(0); svfloat32_t v187 = svcmla_f32_x(pred_full, zero187, v559, v146, 90); - svfloat32_t v202; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v147), "w"(v562)); + svfloat32_t v202 = svmul_f32_x(svptrue_b32(), v147, v562); svfloat32_t v282 = svmla_f32_x(pred_full, v192, v128, v561); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v224), "w"(v238)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v231), "w"(v238)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v245), "w"(v259)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v252), "w"(v259)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v266), "w"(v280)); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v224, v238); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v245, v259); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v252, v259); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v266, v280); svfloat32_t v299 = svcmla_f32_x(pred_full, v280, 
v573, v136, 90); svfloat32_t v281 = svmls_f32_x(pred_full, v144, v118, v555); svfloat32_t v283 = svmls_f32_x(pred_full, v282, v145, v556); svfloat32_t v284 = svmla_f32_x(pred_full, v202, v128, v561); svfloat32_t v286 = svnmls_f32_x(pred_full, v192, v147, v562); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v173), "w"(v187)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v180), "w"(v187)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v294), "w"(v298)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v296), "w"(v298)); - svfloat32_t v316; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v295), "w"(v299)); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v173, v187); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v180, v187); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v294, v298); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v295, v299); svint16_t v334 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v144, (float)(1ULL << 31ULL)))), @@ -5113,64 +3294,37 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu13(const armral_cmplx_f32_t *restrict x, svfloat32_t v288 = svmla_f32_x(pred_full, v281, v129, v563); svfloat32_t v290 = svmls_f32_x(pred_full, v281, v130, v564); svfloat32_t v292 = svmls_f32_x(pred_full, v281, v129, v563); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v301), "w"(v294)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v299), "w"(v300)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v301)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v314), "w"(v301)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v316), "w"(v300)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v300), "w"(v295)); + svfloat32_t v308 = 
svsub_f32_x(svptrue_b32(), v301, v294); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v299, v300); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v312, v301); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v314, v301); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v316, v300); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v300, v295); svst1w_u64(pred_full, (unsigned *)(v582), svreinterpret_u64_s16(v334)); svfloat32_t v289 = svmla_f32_x(pred_full, v288, v130, v564); svfloat32_t v291 = svmls_f32_x(pred_full, v290, v148, v565); svfloat32_t v293 = svmla_f32_x(pred_full, v292, v148, v565); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v308), "w"(v296)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v310), "w"(v297)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v318), "w"(v297)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v283), "w"(v289)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v285), "w"(v291)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v291), "w"(v285)); - svfloat32_t v305; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v287), "w"(v293)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v289), "w"(v283)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v293), "w"(v287)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v302), "w"(v309)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v303), "w"(v311)); - svfloat32_t v322; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v304), "w"(v313)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v305), "w"(v315)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v306), "w"(v317)); - svfloat32_t v325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v307), "w"(v319)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v307), 
"w"(v319)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v306), "w"(v317)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v305), "w"(v315)); - svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v304), "w"(v313)); - svfloat32_t v330; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v303), "w"(v311)); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v302), "w"(v309)); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v296); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v310, v297); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v318, v297); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v285, v291); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v291, v285); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v287, v293); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v289, v283); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v293, v287); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v302, v309); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v304, v313); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v305, v315); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v306, v317); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v307, v319); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v307, v319); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v306, v317); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v305, v315); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v304, v313); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v302, v309); svint16_t v342 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v320, (float)(1ULL << 31ULL)))), @@ -5257,417 +3411,198 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, float v4 = dir; const 
float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v431 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v249 = -1.1666666666666665e+00F; - float v254 = 7.9015646852540022e-01F; - float v259 = 5.5854267289647742e-02F; - float v264 = 7.3430220123575241e-01F; - float v268 = 4.4095855184409838e-01F; - float v269 = -4.4095855184409838e-01F; - float v276 = 3.4087293062393137e-01F; - float v277 = -3.4087293062393137e-01F; - float v284 = -5.3396936033772524e-01F; - float v285 = 5.3396936033772524e-01F; - float v292 = 8.7484229096165667e-01F; - float v293 = -8.7484229096165667e-01F; - float32x2_t v295 = (float32x2_t){v4, v4}; - const float32x2_t *v867 = &v5[istride]; - int32_t *v940 = &v6[ostride]; - float32x2_t v250 = (float32x2_t){v249, v249}; - float32x2_t v255 = (float32x2_t){v254, v254}; - float32x2_t v260 = (float32x2_t){v259, v259}; - float32x2_t v265 = (float32x2_t){v264, v264}; - float32x2_t v270 = (float32x2_t){v268, v269}; - float32x2_t v278 = (float32x2_t){v276, v277}; - float32x2_t v286 = (float32x2_t){v284, v285}; - float32x2_t v294 = (float32x2_t){v292, v293}; - const float32x2_t *v786 = &v5[0]; - int32_t *v913 = &v6[0]; - float32x4_t v1052 = vld1q_f32((const float32_t *)v867); - float32x4_t v251 = vcombine_f32(v250, v250); - float32x4_t v256 = vcombine_f32(v255, v255); - float32x4_t v261 = vcombine_f32(v260, v260); - float32x4_t v266 = vcombine_f32(v265, v265); - float32x2_t v272 = vmul_f32(v295, v270); - float32x2_t v280 = vmul_f32(v295, v278); - float32x2_t v288 = vmul_f32(v295, v286); - float32x2_t v296 = vmul_f32(v295, v294); - const float32x2_t *v795 = &v5[istride * 7]; - const float32x2_t *v804 = &v5[istride * 2]; - const float32x2_t *v813 = &v5[istride * 9]; - const float32x2_t *v822 = &v5[istride * 4]; - const float32x2_t *v831 = &v5[istride * 11]; - const float32x2_t *v840 = &v5[istride * 6]; - const float32x2_t *v849 = &v5[istride * 13]; - const float32x2_t *v858 = &v5[istride * 
8]; - const float32x2_t *v876 = &v5[istride * 10]; - const float32x2_t *v885 = &v5[istride * 3]; - const float32x2_t *v894 = &v5[istride * 12]; - const float32x2_t *v903 = &v5[istride * 5]; - int32_t *v922 = &v6[ostride * 7]; - int32_t *v931 = &v6[ostride * 8]; - int32_t *v949 = &v6[ostride * 2]; - int32_t *v958 = &v6[ostride * 9]; - int32_t *v967 = &v6[ostride * 10]; - int32_t *v976 = &v6[ostride * 3]; - int32_t *v985 = &v6[ostride * 4]; - int32_t *v994 = &v6[ostride * 11]; - int32_t *v1003 = &v6[ostride * 12]; - int32_t *v1012 = &v6[ostride * 5]; - int32_t *v1021 = &v6[ostride * 6]; - int32_t *v1030 = &v6[ostride * 13]; - float32x4_t v1034 = vld1q_f32((const float32_t *)v786); - float32x4_t v274 = vcombine_f32(v272, v272); - float32x4_t v282 = vcombine_f32(v280, v280); - float32x4_t v290 = vcombine_f32(v288, v288); - float32x4_t v298 = vcombine_f32(v296, v296); - float32x4_t v1036 = vld1q_f32((const float32_t *)v795); - float32x4_t v1038 = vld1q_f32((const float32_t *)v804); - float32x4_t v1040 = vld1q_f32((const float32_t *)v813); - float32x4_t v1042 = vld1q_f32((const float32_t *)v822); - float32x4_t v1044 = vld1q_f32((const float32_t *)v831); - float32x4_t v1046 = vld1q_f32((const float32_t *)v840); - float32x4_t v1048 = vld1q_f32((const float32_t *)v849); - float32x4_t v1050 = vld1q_f32((const float32_t *)v858); - float32x4_t v1054 = vld1q_f32((const float32_t *)v876); - float32x4_t v1056 = vld1q_f32((const float32_t *)v885); - float32x4_t v1058 = vld1q_f32((const float32_t *)v894); - float32x4_t v1060 = vld1q_f32((const float32_t *)v903); - float32x4_t v35 = vaddq_f32(v1034, v1036); - float32x4_t v36 = vsubq_f32(v1034, v1036); - float32x4_t v51 = vaddq_f32(v1038, v1040); - float32x4_t v52 = vsubq_f32(v1038, v1040); - float32x4_t v67 = vaddq_f32(v1042, v1044); - float32x4_t v68 = vsubq_f32(v1042, v1044); - float32x4_t v83 = vaddq_f32(v1046, v1048); - float32x4_t v84 = vsubq_f32(v1046, v1048); - float32x4_t v99 = vaddq_f32(v1050, v1052); - float32x4_t v100 = 
vsubq_f32(v1050, v1052); - float32x4_t v115 = vaddq_f32(v1054, v1056); - float32x4_t v116 = vsubq_f32(v1054, v1056); - float32x4_t v131 = vaddq_f32(v1058, v1060); - float32x4_t v132 = vsubq_f32(v1058, v1060); - float32x4_t v133 = vaddq_f32(v51, v131); - float32x4_t v134 = vsubq_f32(v51, v131); - float32x4_t v135 = vaddq_f32(v99, v83); - float32x4_t v136 = vsubq_f32(v99, v83); - float32x4_t v137 = vaddq_f32(v67, v115); - float32x4_t v138 = vsubq_f32(v67, v115); - float32x4_t v226 = vaddq_f32(v52, v132); - float32x4_t v227 = vsubq_f32(v52, v132); - float32x4_t v228 = vaddq_f32(v100, v84); - float32x4_t v229 = vsubq_f32(v100, v84); - float32x4_t v230 = vaddq_f32(v68, v116); - float32x4_t v231 = vsubq_f32(v68, v116); - float32x4_t v139 = vaddq_f32(v133, v135); - float32x4_t v142 = vsubq_f32(v133, v135); - float32x4_t v143 = vsubq_f32(v135, v137); - float32x4_t v144 = vsubq_f32(v137, v133); - float32x4_t v145 = vaddq_f32(v134, v136); - float32x4_t v147 = vsubq_f32(v134, v136); - float32x4_t v148 = vsubq_f32(v136, v138); - float32x4_t v149 = vsubq_f32(v138, v134); - float32x4_t v232 = vaddq_f32(v226, v228); - float32x4_t v235 = vsubq_f32(v226, v228); - float32x4_t v236 = vsubq_f32(v228, v230); - float32x4_t v237 = vsubq_f32(v230, v226); - float32x4_t v238 = vaddq_f32(v227, v229); - float32x4_t v240 = vsubq_f32(v227, v229); - float32x4_t v241 = vsubq_f32(v229, v231); - float32x4_t v242 = vsubq_f32(v231, v227); - float32x4_t v140 = vaddq_f32(v139, v137); - float32x4_t v146 = vaddq_f32(v145, v138); - float32x4_t v164 = vmulq_f32(v142, v256); - float32x4_t v169 = vmulq_f32(v143, v261); - float32x4_t v174 = vmulq_f32(v144, v266); - float32x4_t v188 = vrev64q_f32(v147); - float32x4_t v196 = vrev64q_f32(v148); - float32x4_t v204 = vrev64q_f32(v149); - float32x4_t v233 = vaddq_f32(v232, v230); - float32x4_t v239 = vaddq_f32(v238, v231); - float32x4_t v257 = vmulq_f32(v235, v256); - float32x4_t v262 = vmulq_f32(v236, v261); - float32x4_t v267 = vmulq_f32(v237, v266); - 
float32x4_t v281 = vrev64q_f32(v240); - float32x4_t v289 = vrev64q_f32(v241); - float32x4_t v297 = vrev64q_f32(v242); - float32x4_t v141 = vaddq_f32(v140, v35); - float32x4_t v159 = vmulq_f32(v140, v251); - float32x4_t v180 = vrev64q_f32(v146); - float32x4_t v190 = vmulq_f32(v188, v282); - float32x4_t v198 = vmulq_f32(v196, v290); - float32x4_t v206 = vmulq_f32(v204, v298); - float32x4_t v234 = vaddq_f32(v233, v36); - float32x4_t v252 = vmulq_f32(v233, v251); - float32x4_t v273 = vrev64q_f32(v239); - float32x4_t v283 = vmulq_f32(v281, v282); - float32x4_t v291 = vmulq_f32(v289, v290); - float32x4_t v299 = vmulq_f32(v297, v298); - float32x4_t v182 = vmulq_f32(v180, v274); - float32x4_t v207 = vaddq_f32(v141, v159); - float32x4_t v275 = vmulq_f32(v273, v274); - float32x4_t v300 = vaddq_f32(v234, v252); - int16x4_t v321 = vqmovn_s32(vcvtq_n_s32_f32(v141, 15)); - int16x4_t v329 = vqmovn_s32(vcvtq_n_s32_f32(v234, 15)); - float32x4_t v208 = vaddq_f32(v207, v164); - float32x4_t v210 = vsubq_f32(v207, v164); - float32x4_t v212 = vsubq_f32(v207, v169); - float32x4_t v214 = vaddq_f32(v182, v190); - float32x4_t v216 = vsubq_f32(v182, v190); - float32x4_t v218 = vsubq_f32(v182, v198); - float32x4_t v301 = vaddq_f32(v300, v257); - float32x4_t v303 = vsubq_f32(v300, v257); - float32x4_t v305 = vsubq_f32(v300, v262); - float32x4_t v307 = vaddq_f32(v275, v283); - float32x4_t v309 = vsubq_f32(v275, v283); - float32x4_t v311 = vsubq_f32(v275, v291); - vst1_s16((int16_t *)v913, v321); - vst1_s16((int16_t *)v922, v329); - float32x4_t v209 = vaddq_f32(v208, v169); - float32x4_t v211 = vsubq_f32(v210, v174); - float32x4_t v213 = vaddq_f32(v212, v174); - float32x4_t v215 = vaddq_f32(v214, v198); - float32x4_t v217 = vsubq_f32(v216, v206); - float32x4_t v219 = vaddq_f32(v218, v206); - float32x4_t v302 = vaddq_f32(v301, v262); - float32x4_t v304 = vsubq_f32(v303, v267); - float32x4_t v306 = vaddq_f32(v305, v267); - float32x4_t v308 = vaddq_f32(v307, v291); - float32x4_t v310 = 
vsubq_f32(v309, v299); - float32x4_t v312 = vaddq_f32(v311, v299); - float32x4_t v220 = vaddq_f32(v209, v215); - float32x4_t v221 = vsubq_f32(v209, v215); - float32x4_t v222 = vaddq_f32(v211, v217); - float32x4_t v223 = vsubq_f32(v211, v217); - float32x4_t v224 = vaddq_f32(v213, v219); - float32x4_t v225 = vsubq_f32(v213, v219); - float32x4_t v313 = vaddq_f32(v302, v308); - float32x4_t v314 = vsubq_f32(v302, v308); - float32x4_t v315 = vaddq_f32(v304, v310); - float32x4_t v316 = vsubq_f32(v304, v310); - float32x4_t v317 = vaddq_f32(v306, v312); - float32x4_t v318 = vsubq_f32(v306, v312); - int16x4_t v337 = vqmovn_s32(vcvtq_n_s32_f32(v221, 15)); - int16x4_t v345 = vqmovn_s32(vcvtq_n_s32_f32(v314, 15)); - int16x4_t v353 = vqmovn_s32(vcvtq_n_s32_f32(v223, 15)); - int16x4_t v361 = vqmovn_s32(vcvtq_n_s32_f32(v316, 15)); - int16x4_t v369 = vqmovn_s32(vcvtq_n_s32_f32(v224, 15)); - int16x4_t v377 = vqmovn_s32(vcvtq_n_s32_f32(v317, 15)); - int16x4_t v385 = vqmovn_s32(vcvtq_n_s32_f32(v225, 15)); - int16x4_t v393 = vqmovn_s32(vcvtq_n_s32_f32(v318, 15)); - int16x4_t v401 = vqmovn_s32(vcvtq_n_s32_f32(v222, 15)); - int16x4_t v409 = vqmovn_s32(vcvtq_n_s32_f32(v315, 15)); - int16x4_t v417 = vqmovn_s32(vcvtq_n_s32_f32(v220, 15)); - int16x4_t v425 = vqmovn_s32(vcvtq_n_s32_f32(v313, 15)); - vst1_s16((int16_t *)v931, v337); - vst1_s16((int16_t *)v940, v345); - vst1_s16((int16_t *)v949, v353); - vst1_s16((int16_t *)v958, v361); - vst1_s16((int16_t *)v967, v369); - vst1_s16((int16_t *)v976, v377); - vst1_s16((int16_t *)v985, v385); - vst1_s16((int16_t *)v994, v393); - vst1_s16((int16_t *)v1003, v401); - vst1_s16((int16_t *)v1012, v409); - vst1_s16((int16_t *)v1021, v417); - vst1_s16((int16_t *)v1030, v425); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v431 * 2; j < howmany; j += 1) { - float32x2_t v496 = v5[istride]; - float v629 = -1.1666666666666665e+00F; - float v633 = 7.9015646852540022e-01F; - float v637 = 5.5854267289647742e-02F; - float v641 = 7.3430220123575241e-01F; - float 
v644 = 4.4095855184409838e-01F; - float v645 = -4.4095855184409838e-01F; - float v651 = 3.4087293062393137e-01F; - float v652 = -3.4087293062393137e-01F; - float v658 = -5.3396936033772524e-01F; - float v659 = 5.3396936033772524e-01F; - float v665 = 8.7484229096165667e-01F; - float v666 = -8.7484229096165667e-01F; - float32x2_t v668 = (float32x2_t){v4, v4}; - float32x2_t v443 = v5[0]; - float32x2_t v630 = (float32x2_t){v629, v629}; - float32x2_t v634 = (float32x2_t){v633, v633}; - float32x2_t v638 = (float32x2_t){v637, v637}; - float32x2_t v642 = (float32x2_t){v641, v641}; - float32x2_t v646 = (float32x2_t){v644, v645}; - float32x2_t v653 = (float32x2_t){v651, v652}; - float32x2_t v660 = (float32x2_t){v658, v659}; - float32x2_t v667 = (float32x2_t){v665, v666}; - float32x2_t v448 = v5[istride * 7]; - float32x2_t v455 = v5[istride * 2]; - float32x2_t v460 = v5[istride * 9]; - float32x2_t v467 = v5[istride * 4]; - float32x2_t v472 = v5[istride * 11]; - float32x2_t v479 = v5[istride * 6]; - float32x2_t v484 = v5[istride * 13]; - float32x2_t v491 = v5[istride * 8]; - float32x2_t v503 = v5[istride * 10]; - float32x2_t v508 = v5[istride * 3]; - float32x2_t v515 = v5[istride * 12]; - float32x2_t v520 = v5[istride * 5]; - float32x2_t v648 = vmul_f32(v668, v646); - float32x2_t v655 = vmul_f32(v668, v653); - float32x2_t v662 = vmul_f32(v668, v660); - float32x2_t v669 = vmul_f32(v668, v667); - float32x2_t v449 = vadd_f32(v443, v448); - float32x2_t v450 = vsub_f32(v443, v448); - float32x2_t v461 = vadd_f32(v455, v460); - float32x2_t v462 = vsub_f32(v455, v460); - float32x2_t v473 = vadd_f32(v467, v472); - float32x2_t v474 = vsub_f32(v467, v472); - float32x2_t v485 = vadd_f32(v479, v484); - float32x2_t v486 = vsub_f32(v479, v484); - float32x2_t v497 = vadd_f32(v491, v496); - float32x2_t v498 = vsub_f32(v491, v496); - float32x2_t v509 = vadd_f32(v503, v508); - float32x2_t v510 = vsub_f32(v503, v508); - float32x2_t v521 = vadd_f32(v515, v520); - float32x2_t v522 = vsub_f32(v515, 
v520); - float32x2_t v523 = vadd_f32(v461, v521); - float32x2_t v524 = vsub_f32(v461, v521); - float32x2_t v525 = vadd_f32(v497, v485); - float32x2_t v526 = vsub_f32(v497, v485); - float32x2_t v527 = vadd_f32(v473, v509); - float32x2_t v528 = vsub_f32(v473, v509); - float32x2_t v607 = vadd_f32(v462, v522); - float32x2_t v608 = vsub_f32(v462, v522); - float32x2_t v609 = vadd_f32(v498, v486); - float32x2_t v610 = vsub_f32(v498, v486); - float32x2_t v611 = vadd_f32(v474, v510); - float32x2_t v612 = vsub_f32(v474, v510); - float32x2_t v529 = vadd_f32(v523, v525); - float32x2_t v532 = vsub_f32(v523, v525); - float32x2_t v533 = vsub_f32(v525, v527); - float32x2_t v534 = vsub_f32(v527, v523); - float32x2_t v535 = vadd_f32(v524, v526); - float32x2_t v537 = vsub_f32(v524, v526); - float32x2_t v538 = vsub_f32(v526, v528); - float32x2_t v539 = vsub_f32(v528, v524); - float32x2_t v613 = vadd_f32(v607, v609); - float32x2_t v616 = vsub_f32(v607, v609); - float32x2_t v617 = vsub_f32(v609, v611); - float32x2_t v618 = vsub_f32(v611, v607); - float32x2_t v619 = vadd_f32(v608, v610); - float32x2_t v621 = vsub_f32(v608, v610); - float32x2_t v622 = vsub_f32(v610, v612); - float32x2_t v623 = vsub_f32(v612, v608); - float32x2_t v530 = vadd_f32(v529, v527); - float32x2_t v536 = vadd_f32(v535, v528); - float32x2_t v551 = vmul_f32(v532, v634); - float32x2_t v555 = vmul_f32(v533, v638); - float32x2_t v559 = vmul_f32(v534, v642); - float32x2_t v572 = vrev64_f32(v537); - float32x2_t v579 = vrev64_f32(v538); - float32x2_t v586 = vrev64_f32(v539); - float32x2_t v614 = vadd_f32(v613, v611); - float32x2_t v620 = vadd_f32(v619, v612); - float32x2_t v635 = vmul_f32(v616, v634); - float32x2_t v639 = vmul_f32(v617, v638); - float32x2_t v643 = vmul_f32(v618, v642); - float32x2_t v656 = vrev64_f32(v621); - float32x2_t v663 = vrev64_f32(v622); - float32x2_t v670 = vrev64_f32(v623); - float32x2_t v531 = vadd_f32(v530, v449); - float32x2_t v547 = vmul_f32(v530, v630); - float32x2_t v565 = vrev64_f32(v536); 
- float32x2_t v573 = vmul_f32(v572, v655); - float32x2_t v580 = vmul_f32(v579, v662); - float32x2_t v587 = vmul_f32(v586, v669); - float32x2_t v615 = vadd_f32(v614, v450); - float32x2_t v631 = vmul_f32(v614, v630); - float32x2_t v649 = vrev64_f32(v620); - float32x2_t v657 = vmul_f32(v656, v655); - float32x2_t v664 = vmul_f32(v663, v662); - float32x2_t v671 = vmul_f32(v670, v669); - float32x2_t v566 = vmul_f32(v565, v648); - float32x2_t v588 = vadd_f32(v531, v547); - float32x2_t v650 = vmul_f32(v649, v648); - float32x2_t v672 = vadd_f32(v615, v631); - int16x4_t v693 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v531, 15), (int32x2_t){0, 0})); - int16x4_t v699 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v615, 15), (int32x2_t){0, 0})); - float32x2_t v589 = vadd_f32(v588, v551); - float32x2_t v591 = vsub_f32(v588, v551); - float32x2_t v593 = vsub_f32(v588, v555); - float32x2_t v595 = vadd_f32(v566, v573); - float32x2_t v597 = vsub_f32(v566, v573); - float32x2_t v599 = vsub_f32(v566, v580); - float32x2_t v673 = vadd_f32(v672, v635); - float32x2_t v675 = vsub_f32(v672, v635); - float32x2_t v677 = vsub_f32(v672, v639); - float32x2_t v679 = vadd_f32(v650, v657); - float32x2_t v681 = vsub_f32(v650, v657); - float32x2_t v683 = vsub_f32(v650, v664); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v693), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v699), 0); - float32x2_t v590 = vadd_f32(v589, v555); - float32x2_t v592 = vsub_f32(v591, v559); - float32x2_t v594 = vadd_f32(v593, v559); - float32x2_t v596 = vadd_f32(v595, v580); - float32x2_t v598 = vsub_f32(v597, v587); - float32x2_t v600 = vadd_f32(v599, v587); - float32x2_t v674 = vadd_f32(v673, v639); - float32x2_t v676 = vsub_f32(v675, v643); - float32x2_t v678 = vadd_f32(v677, v643); - float32x2_t v680 = vadd_f32(v679, v664); - float32x2_t v682 = vsub_f32(v681, v671); - float32x2_t v684 = vadd_f32(v683, v671); - float32x2_t v601 = vadd_f32(v590, v596); - float32x2_t v602 = vsub_f32(v590, v596); - float32x2_t v603 = 
vadd_f32(v592, v598); - float32x2_t v604 = vsub_f32(v592, v598); - float32x2_t v605 = vadd_f32(v594, v600); - float32x2_t v606 = vsub_f32(v594, v600); - float32x2_t v685 = vadd_f32(v674, v680); - float32x2_t v686 = vsub_f32(v674, v680); - float32x2_t v687 = vadd_f32(v676, v682); - float32x2_t v688 = vsub_f32(v676, v682); - float32x2_t v689 = vadd_f32(v678, v684); - float32x2_t v690 = vsub_f32(v678, v684); - int16x4_t v705 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v602, 15), (int32x2_t){0, 0})); - int16x4_t v711 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v686, 15), (int32x2_t){0, 0})); - int16x4_t v717 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v604, 15), (int32x2_t){0, 0})); - int16x4_t v723 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v688, 15), (int32x2_t){0, 0})); - int16x4_t v729 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v605, 15), (int32x2_t){0, 0})); - int16x4_t v735 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v689, 15), (int32x2_t){0, 0})); - int16x4_t v741 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v606, 15), (int32x2_t){0, 0})); - int16x4_t v747 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v690, 15), (int32x2_t){0, 0})); - int16x4_t v753 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v603, 15), (int32x2_t){0, 0})); - int16x4_t v759 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v687, 15), (int32x2_t){0, 0})); - int16x4_t v765 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v601, 15), (int32x2_t){0, 0})); - int16x4_t v771 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v685, 15), (int32x2_t){0, 0})); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v705), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v711), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v717), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v723), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v729), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v735), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v741), 0); - v6[ostride * 11] = 
vget_lane_s32(vreinterpret_s32_s16(v747), 0); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v753), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v759), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v765), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v771), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v73 = v5[istride]; + float v206 = -1.1666666666666665e+00F; + float v210 = 7.9015646852540022e-01F; + float v214 = 5.5854267289647742e-02F; + float v218 = 7.3430220123575241e-01F; + float v221 = 4.4095855184409838e-01F; + float v222 = -4.4095855184409838e-01F; + float v228 = 3.4087293062393137e-01F; + float v229 = -3.4087293062393137e-01F; + float v235 = -5.3396936033772524e-01F; + float v236 = 5.3396936033772524e-01F; + float v242 = 8.7484229096165667e-01F; + float v243 = -8.7484229096165667e-01F; + float32x2_t v245 = (float32x2_t){v4, v4}; + float32x2_t v20 = v5[0]; + float32x2_t v207 = (float32x2_t){v206, v206}; + float32x2_t v211 = (float32x2_t){v210, v210}; + float32x2_t v215 = (float32x2_t){v214, v214}; + float32x2_t v219 = (float32x2_t){v218, v218}; + float32x2_t v223 = (float32x2_t){v221, v222}; + float32x2_t v230 = (float32x2_t){v228, v229}; + float32x2_t v237 = (float32x2_t){v235, v236}; + float32x2_t v244 = (float32x2_t){v242, v243}; + float32x2_t v25 = v5[istride * 7]; + float32x2_t v32 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 9]; + float32x2_t v44 = v5[istride * 4]; + float32x2_t v49 = v5[istride * 11]; + float32x2_t v56 = v5[istride * 6]; + float32x2_t v61 = v5[istride * 13]; + float32x2_t v68 = v5[istride * 8]; + float32x2_t v80 = v5[istride * 10]; + float32x2_t v85 = v5[istride * 3]; + float32x2_t v92 = v5[istride * 12]; + float32x2_t v97 = v5[istride * 5]; + float32x2_t v225 = vmul_f32(v245, v223); + float32x2_t v232 = vmul_f32(v245, v230); + float32x2_t v239 = vmul_f32(v245, v237); + float32x2_t v246 = vmul_f32(v245, v244); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t 
v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v68, v73); + float32x2_t v86 = vadd_f32(v80, v85); + float32x2_t v87 = vsub_f32(v80, v85); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v100 = vadd_f32(v38, v98); + float32x2_t v101 = vsub_f32(v38, v98); + float32x2_t v102 = vadd_f32(v74, v62); + float32x2_t v103 = vsub_f32(v74, v62); + float32x2_t v104 = vadd_f32(v50, v86); + float32x2_t v105 = vsub_f32(v50, v86); + float32x2_t v184 = vadd_f32(v39, v99); + float32x2_t v185 = vsub_f32(v39, v99); + float32x2_t v186 = vadd_f32(v75, v63); + float32x2_t v187 = vsub_f32(v75, v63); + float32x2_t v188 = vadd_f32(v51, v87); + float32x2_t v189 = vsub_f32(v51, v87); + float32x2_t v106 = vadd_f32(v100, v102); + float32x2_t v109 = vsub_f32(v100, v102); + float32x2_t v110 = vsub_f32(v102, v104); + float32x2_t v111 = vsub_f32(v104, v100); + float32x2_t v112 = vadd_f32(v101, v103); + float32x2_t v114 = vsub_f32(v101, v103); + float32x2_t v115 = vsub_f32(v103, v105); + float32x2_t v116 = vsub_f32(v105, v101); + float32x2_t v190 = vadd_f32(v184, v186); + float32x2_t v193 = vsub_f32(v184, v186); + float32x2_t v194 = vsub_f32(v186, v188); + float32x2_t v195 = vsub_f32(v188, v184); + float32x2_t v196 = vadd_f32(v185, v187); + float32x2_t v198 = vsub_f32(v185, v187); + float32x2_t v199 = vsub_f32(v187, v189); + float32x2_t v200 = vsub_f32(v189, v185); + float32x2_t v107 = vadd_f32(v106, v104); + float32x2_t v113 = vadd_f32(v112, v105); + float32x2_t v128 = vmul_f32(v109, v211); + float32x2_t v132 = vmul_f32(v110, v215); + float32x2_t v136 = vmul_f32(v111, v219); + float32x2_t v149 = vrev64_f32(v114); + float32x2_t v156 = vrev64_f32(v115); + float32x2_t v163 = 
vrev64_f32(v116); + float32x2_t v191 = vadd_f32(v190, v188); + float32x2_t v197 = vadd_f32(v196, v189); + float32x2_t v212 = vmul_f32(v193, v211); + float32x2_t v216 = vmul_f32(v194, v215); + float32x2_t v220 = vmul_f32(v195, v219); + float32x2_t v233 = vrev64_f32(v198); + float32x2_t v240 = vrev64_f32(v199); + float32x2_t v247 = vrev64_f32(v200); + float32x2_t v108 = vadd_f32(v107, v26); + float32x2_t v124 = vmul_f32(v107, v207); + float32x2_t v142 = vrev64_f32(v113); + float32x2_t v150 = vmul_f32(v149, v232); + float32x2_t v157 = vmul_f32(v156, v239); + float32x2_t v164 = vmul_f32(v163, v246); + float32x2_t v192 = vadd_f32(v191, v27); + float32x2_t v208 = vmul_f32(v191, v207); + float32x2_t v226 = vrev64_f32(v197); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v248 = vmul_f32(v247, v246); + float32x2_t v143 = vmul_f32(v142, v225); + float32x2_t v165 = vadd_f32(v108, v124); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v249 = vadd_f32(v192, v208); + int16x4_t v270 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v108, 15), (int32x2_t){0, 0})); + int16x4_t v276 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v192, 15), (int32x2_t){0, 0})); + float32x2_t v166 = vadd_f32(v165, v128); + float32x2_t v168 = vsub_f32(v165, v128); + float32x2_t v170 = vsub_f32(v165, v132); + float32x2_t v172 = vadd_f32(v143, v150); + float32x2_t v174 = vsub_f32(v143, v150); + float32x2_t v176 = vsub_f32(v143, v157); + float32x2_t v250 = vadd_f32(v249, v212); + float32x2_t v252 = vsub_f32(v249, v212); + float32x2_t v254 = vsub_f32(v249, v216); + float32x2_t v256 = vadd_f32(v227, v234); + float32x2_t v258 = vsub_f32(v227, v234); + float32x2_t v260 = vsub_f32(v227, v241); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v270), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v276), 0); + float32x2_t v167 = vadd_f32(v166, v132); + float32x2_t v169 = vsub_f32(v168, v136); + float32x2_t v171 = vadd_f32(v170, v136); + float32x2_t v173 = 
vadd_f32(v172, v157); + float32x2_t v175 = vsub_f32(v174, v164); + float32x2_t v177 = vadd_f32(v176, v164); + float32x2_t v251 = vadd_f32(v250, v216); + float32x2_t v253 = vsub_f32(v252, v220); + float32x2_t v255 = vadd_f32(v254, v220); + float32x2_t v257 = vadd_f32(v256, v241); + float32x2_t v259 = vsub_f32(v258, v248); + float32x2_t v261 = vadd_f32(v260, v248); + float32x2_t v178 = vadd_f32(v167, v173); + float32x2_t v179 = vsub_f32(v167, v173); + float32x2_t v180 = vadd_f32(v169, v175); + float32x2_t v181 = vsub_f32(v169, v175); + float32x2_t v182 = vadd_f32(v171, v177); + float32x2_t v183 = vsub_f32(v171, v177); + float32x2_t v262 = vadd_f32(v251, v257); + float32x2_t v263 = vsub_f32(v251, v257); + float32x2_t v264 = vadd_f32(v253, v259); + float32x2_t v265 = vsub_f32(v253, v259); + float32x2_t v266 = vadd_f32(v255, v261); + float32x2_t v267 = vsub_f32(v255, v261); + int16x4_t v282 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v179, 15), (int32x2_t){0, 0})); + int16x4_t v288 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v263, 15), (int32x2_t){0, 0})); + int16x4_t v294 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v181, 15), (int32x2_t){0, 0})); + int16x4_t v300 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v265, 15), (int32x2_t){0, 0})); + int16x4_t v306 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v182, 15), (int32x2_t){0, 0})); + int16x4_t v312 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v266, 15), (int32x2_t){0, 0})); + int16x4_t v318 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v183, 15), (int32x2_t){0, 0})); + int16x4_t v324 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v267, 15), (int32x2_t){0, 0})); + int16x4_t v330 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v180, 15), (int32x2_t){0, 0})); + int16x4_t v336 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v264, 15), (int32x2_t){0, 0})); + int16x4_t v342 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v178, 15), (int32x2_t){0, 0})); + int16x4_t v348 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v262, 15), (int32x2_t){0, 0})); + v6[ostride 
* 8] = vget_lane_s32(vreinterpret_s32_s16(v282), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v288), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v294), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v300), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v306), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v312), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v318), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v324), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v330), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v336), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v342), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v348), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -5790,140 +3725,78 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v535)[0])); svfloat32_t v719 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v544)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v693), "w"(v695)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v693), "w"(v695)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v697), "w"(v699)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v697), "w"(v699)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v701), "w"(v703)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v701), "w"(v703)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v705), "w"(v707)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v705), "w"(v707)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v709), "w"(v711)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v709), "w"(v711)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) 
: "w"(v713), "w"(v715)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v713), "w"(v715)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v717), "w"(v719)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v717), "w"(v719)); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v48), "w"(v128)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v48), "w"(v128)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v96), "w"(v80)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v96), "w"(v80)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v64), "w"(v112)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v64), "w"(v112)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v49), "w"(v129)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v49), "w"(v129)); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v97), "w"(v81)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v97), "w"(v81)); - svfloat32_t v223; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v65), "w"(v113)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v65), "w"(v113)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v130), "w"(v132)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v130), "w"(v132)); - svfloat32_t v140; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v132), "w"(v134)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v134), "w"(v130)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v131), "w"(v133)); - svfloat32_t v144; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v131), "w"(v133)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v133), "w"(v135)); - svfloat32_t v146; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v146) : "w"(v135), "w"(v131)); - svfloat32_t v225; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v219), "w"(v221)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v219), "w"(v221)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v221), "w"(v223)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v223), "w"(v219)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v220), "w"(v222)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v220), "w"(v222)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v222), "w"(v224)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v224), "w"(v220)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v136), "w"(v134)); - svfloat32_t v143; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v142), "w"(v135)); - svfloat32_t zero185; - asm volatile("mov %0.s, #0" : "=w"(zero185)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v693, v695); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v693, v695); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v697, v699); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v701, v703); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v701, v703); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v705, v707); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v705, v707); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v709, v711); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v709, v711); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v713, v715); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v713, v715); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v717, v719); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v717, v719); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v48, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v48, v128); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), 
v96, v80); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v96, v80); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v49, v129); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v49, v129); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v97, v81); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v97, v81); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v130, v132); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v130, v132); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v134, v130); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v131, v133); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v131, v133); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v133, v135); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v135, v131); + svfloat32_t v225 = svadd_f32_x(svptrue_b32(), v219, v221); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v219, v221); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v223, v219); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v220, v222); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v220, v222); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v222, v224); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v224, v220); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v136, v134); + svfloat32_t v143 = svadd_f32_x(svptrue_b32(), v142, v135); + svfloat32_t zero185 = svdup_n_f32(0); svfloat32_t v185 = svcmla_f32_x(pred_full, zero185, v562, v144, 90); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, zero192, v563, v145, 90); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : 
"=w"(zero199)); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v564, v146, 90); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v225), "w"(v223)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v231), "w"(v224)); - svfloat32_t zero274; - asm volatile("mov %0.s, #0" : "=w"(zero274)); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v225, v223); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v231, v224); + svfloat32_t zero274 = svdup_n_f32(0); svfloat32_t v274 = svcmla_f32_x(pred_full, zero274, v562, v233, 90); - svfloat32_t zero281; - asm volatile("mov %0.s, #0" : "=w"(zero281)); + svfloat32_t zero281 = svdup_n_f32(0); svfloat32_t v281 = svcmla_f32_x(pred_full, zero281, v563, v234, 90); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); + svfloat32_t zero288 = svdup_n_f32(0); svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v564, v235, 90); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v137), "w"(v32)); - svfloat32_t zero178; - asm volatile("mov %0.s, #0" : "=w"(zero178)); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v137, v32); + svfloat32_t zero178 = svdup_n_f32(0); svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v561, v143, 90); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v226), "w"(v33)); - svfloat32_t zero267; - asm volatile("mov %0.s, #0" : "=w"(zero267)); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v226, v33); + svfloat32_t zero267 = svdup_n_f32(0); svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v561, v232, 90); svfloat32_t v200 = svmla_f32_x(pred_full, v138, v137, v557); - svfloat32_t v207; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v178), "w"(v185)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v178), "w"(v185)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v178), "w"(v192)); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v178, 
v185); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v178, v192); svfloat32_t v289 = svmla_f32_x(pred_full, v227, v226, v557); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v267), "w"(v274)); - svfloat32_t v298; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v267), "w"(v274)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v267), "w"(v281)); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v267, v274); + svfloat32_t v298 = svsub_f32_x(svptrue_b32(), v267, v274); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v267, v281); svint16_t v310 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v138, (float)(1ULL << 31ULL)))), @@ -5937,21 +3810,15 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, svfloat32_t v201 = svmla_f32_x(pred_full, v200, v139, v558); svfloat32_t v203 = svmls_f32_x(pred_full, v200, v139, v558); svfloat32_t v205 = svmls_f32_x(pred_full, v200, v140, v559); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v207), "w"(v192)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v209), "w"(v199)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v211), "w"(v199)); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v207, v192); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v199); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v211, v199); svfloat32_t v290 = svmla_f32_x(pred_full, v289, v228, v558); svfloat32_t v292 = svmls_f32_x(pred_full, v289, v228, v558); svfloat32_t v294 = svmls_f32_x(pred_full, v289, v229, v559); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v296), "w"(v281)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v298), "w"(v288)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v300), "w"(v288)); + svfloat32_t v297 = 
svadd_f32_x(svptrue_b32(), v296, v281); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v298, v288); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v300, v288); svst1w_u64(pred_full, (unsigned *)(v572), svreinterpret_u64_s16(v310)); svst1w_u64(pred_full, (unsigned *)(v581), svreinterpret_u64_s16(v318)); svfloat32_t v202 = svmla_f32_x(pred_full, v201, v140, v559); @@ -5960,30 +3827,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu14(const armral_cmplx_f32_t *restrict x, svfloat32_t v291 = svmla_f32_x(pred_full, v290, v229, v559); svfloat32_t v293 = svmls_f32_x(pred_full, v292, v230, v560); svfloat32_t v295 = svmla_f32_x(pred_full, v294, v230, v560); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v202), "w"(v208)); - svfloat32_t v214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v202), "w"(v208)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v204), "w"(v210)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v204), "w"(v210)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v206), "w"(v212)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v206), "w"(v212)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v291), "w"(v297)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v291), "w"(v297)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v293), "w"(v299)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v293), "w"(v299)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v295), "w"(v301)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v295), "w"(v301)); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v202, v208); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v204, v210); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v204, v210); + svfloat32_t v217 = 
svadd_f32_x(svptrue_b32(), v206, v212); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v206, v212); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v291, v297); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v293, v299); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v293, v299); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v295, v301); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v295, v301); svint16_t v326 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v214, (float)(1ULL << 31ULL)))), @@ -6070,485 +3925,227 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v444 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v155 = -1.2500000000000000e+00F; - float v160 = 5.5901699437494745e-01F; - float v164 = 1.5388417685876268e+00F; - float v165 = -1.5388417685876268e+00F; - float v172 = 5.8778525229247325e-01F; - float v173 = -5.8778525229247325e-01F; - float v180 = 3.6327126400268028e-01F; - float v181 = -3.6327126400268028e-01F; - float v206 = -1.4999999999999998e+00F; - float v211 = 1.8749999999999998e+00F; - float v216 = -8.3852549156242107e-01F; - float v220 = -2.3082626528814396e+00F; - float v221 = 2.3082626528814396e+00F; - float v228 = -8.8167787843870971e-01F; - float v229 = 8.8167787843870971e-01F; - float v236 = -5.4490689600402031e-01F; - float v237 = 5.4490689600402031e-01F; - float v261 = 8.6602540378443871e-01F; - float v262 = -8.6602540378443871e-01F; - float v269 = -1.0825317547305484e+00F; - float v270 = 1.0825317547305484e+00F; - float v277 = 4.8412291827592718e-01F; - float v278 = -4.8412291827592718e-01F; - float32x2_t v280 = (float32x2_t){v4, v4}; - float v286 = -1.3326760640014592e+00F; - float v291 = -5.0903696045512736e-01F; - float v296 = -3.1460214309120460e-01F; - 
const float32x2_t *v871 = &v5[istride]; - int32_t *v980 = &v6[ostride]; - float32x2_t v156 = (float32x2_t){v155, v155}; - float32x2_t v161 = (float32x2_t){v160, v160}; - float32x2_t v166 = (float32x2_t){v164, v165}; - float32x2_t v174 = (float32x2_t){v172, v173}; - float32x2_t v182 = (float32x2_t){v180, v181}; - float32x2_t v207 = (float32x2_t){v206, v206}; - float32x2_t v212 = (float32x2_t){v211, v211}; - float32x2_t v217 = (float32x2_t){v216, v216}; - float32x2_t v222 = (float32x2_t){v220, v221}; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v61 = v5[istride]; + float v119 = -1.2500000000000000e+00F; + float v123 = 5.5901699437494745e-01F; + float v126 = 1.5388417685876268e+00F; + float v127 = -1.5388417685876268e+00F; + float v133 = 5.8778525229247325e-01F; + float v134 = -5.8778525229247325e-01F; + float v140 = 3.6327126400268028e-01F; + float v141 = -3.6327126400268028e-01F; + float v165 = -1.4999999999999998e+00F; + float v169 = 1.8749999999999998e+00F; + float v173 = -8.3852549156242107e-01F; + float v176 = -2.3082626528814396e+00F; + float v177 = 2.3082626528814396e+00F; + float v183 = -8.8167787843870971e-01F; + float v184 = 8.8167787843870971e-01F; + float v190 = -5.4490689600402031e-01F; + float v191 = 5.4490689600402031e-01F; + float v214 = 8.6602540378443871e-01F; + float v215 = -8.6602540378443871e-01F; + float v221 = -1.0825317547305484e+00F; + float v222 = 1.0825317547305484e+00F; + float v228 = 4.8412291827592718e-01F; + float v229 = -4.8412291827592718e-01F; + float32x2_t v231 = (float32x2_t){v4, v4}; + float v236 = -1.3326760640014592e+00F; + float v240 = -5.0903696045512736e-01F; + float v244 = -3.1460214309120460e-01F; + float32x2_t v32 = v5[0]; + float32x2_t v120 = (float32x2_t){v119, v119}; + float32x2_t v124 = (float32x2_t){v123, v123}; + float32x2_t v128 = (float32x2_t){v126, v127}; + float32x2_t v135 = (float32x2_t){v133, v134}; + float32x2_t v142 = (float32x2_t){v140, v141}; + float32x2_t v166 = (float32x2_t){v165, v165}; + 
float32x2_t v170 = (float32x2_t){v169, v169}; + float32x2_t v174 = (float32x2_t){v173, v173}; + float32x2_t v178 = (float32x2_t){v176, v177}; + float32x2_t v185 = (float32x2_t){v183, v184}; + float32x2_t v192 = (float32x2_t){v190, v191}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v223 = (float32x2_t){v221, v222}; float32x2_t v230 = (float32x2_t){v228, v229}; - float32x2_t v238 = (float32x2_t){v236, v237}; - float32x2_t v263 = (float32x2_t){v261, v262}; - float32x2_t v271 = (float32x2_t){v269, v270}; - float32x2_t v279 = (float32x2_t){v277, v278}; - float32x2_t v287 = (float32x2_t){v286, v286}; - float32x2_t v292 = (float32x2_t){v291, v291}; - float32x2_t v297 = (float32x2_t){v296, v296}; - const float32x2_t *v826 = &v5[0]; - int32_t *v944 = &v6[0]; - float32x4_t v1088 = vld1q_f32((const float32_t *)v871); - float32x4_t v157 = vcombine_f32(v156, v156); - float32x4_t v162 = vcombine_f32(v161, v161); - float32x2_t v168 = vmul_f32(v280, v166); - float32x2_t v176 = vmul_f32(v280, v174); - float32x2_t v184 = vmul_f32(v280, v182); - float32x4_t v208 = vcombine_f32(v207, v207); - float32x4_t v213 = vcombine_f32(v212, v212); - float32x4_t v218 = vcombine_f32(v217, v217); - float32x2_t v224 = vmul_f32(v280, v222); - float32x2_t v232 = vmul_f32(v280, v230); - float32x2_t v240 = vmul_f32(v280, v238); - float32x2_t v265 = vmul_f32(v280, v263); - float32x2_t v273 = vmul_f32(v280, v271); - float32x2_t v281 = vmul_f32(v280, v279); - float32x4_t v288 = vcombine_f32(v287, v287); - float32x4_t v293 = vcombine_f32(v292, v292); - float32x4_t v298 = vcombine_f32(v297, v297); - const float32x2_t *v807 = &v5[istride * 5]; - const float32x2_t *v816 = &v5[istride * 10]; - const float32x2_t *v835 = &v5[istride * 8]; - const float32x2_t *v844 = &v5[istride * 13]; - const float32x2_t *v853 = &v5[istride * 3]; - const float32x2_t *v862 = &v5[istride * 11]; - const float32x2_t *v880 = &v5[istride * 6]; - const float32x2_t *v889 = &v5[istride * 14]; - const float32x2_t *v898 = 
&v5[istride * 4]; - const float32x2_t *v907 = &v5[istride * 9]; - const float32x2_t *v916 = &v5[istride * 2]; - const float32x2_t *v925 = &v5[istride * 7]; - const float32x2_t *v934 = &v5[istride * 12]; - int32_t *v953 = &v6[ostride * 10]; - int32_t *v962 = &v6[ostride * 5]; - int32_t *v971 = &v6[ostride * 6]; - int32_t *v989 = &v6[ostride * 11]; - int32_t *v998 = &v6[ostride * 12]; - int32_t *v1007 = &v6[ostride * 7]; - int32_t *v1016 = &v6[ostride * 2]; - int32_t *v1025 = &v6[ostride * 3]; - int32_t *v1034 = &v6[ostride * 13]; - int32_t *v1043 = &v6[ostride * 8]; - int32_t *v1052 = &v6[ostride * 9]; - int32_t *v1061 = &v6[ostride * 4]; - int32_t *v1070 = &v6[ostride * 14]; - float32x4_t v1078 = vld1q_f32((const float32_t *)v826); - float32x4_t v170 = vcombine_f32(v168, v168); - float32x4_t v178 = vcombine_f32(v176, v176); - float32x4_t v186 = vcombine_f32(v184, v184); - float32x4_t v226 = vcombine_f32(v224, v224); - float32x4_t v234 = vcombine_f32(v232, v232); - float32x4_t v242 = vcombine_f32(v240, v240); - float32x4_t v267 = vcombine_f32(v265, v265); - float32x4_t v275 = vcombine_f32(v273, v273); - float32x4_t v283 = vcombine_f32(v281, v281); - float32x4_t v1074 = vld1q_f32((const float32_t *)v807); - float32x4_t v1076 = vld1q_f32((const float32_t *)v816); - float32x4_t v1080 = vld1q_f32((const float32_t *)v835); - float32x4_t v1082 = vld1q_f32((const float32_t *)v844); - float32x4_t v1084 = vld1q_f32((const float32_t *)v853); - float32x4_t v1086 = vld1q_f32((const float32_t *)v862); - float32x4_t v1090 = vld1q_f32((const float32_t *)v880); - float32x4_t v1092 = vld1q_f32((const float32_t *)v889); - float32x4_t v1094 = vld1q_f32((const float32_t *)v898); - float32x4_t v1096 = vld1q_f32((const float32_t *)v907); - float32x4_t v1098 = vld1q_f32((const float32_t *)v916); - float32x4_t v1100 = vld1q_f32((const float32_t *)v925); - float32x4_t v1102 = vld1q_f32((const float32_t *)v934); - float32x4_t v35 = vaddq_f32(v1074, v1076); - float32x4_t v36 = 
vsubq_f32(v1074, v1076); - float32x4_t v59 = vaddq_f32(v1080, v1082); - float32x4_t v60 = vsubq_f32(v1080, v1082); - float32x4_t v83 = vaddq_f32(v1086, v1088); - float32x4_t v84 = vsubq_f32(v1086, v1088); - float32x4_t v107 = vaddq_f32(v1092, v1094); - float32x4_t v108 = vsubq_f32(v1092, v1094); - float32x4_t v131 = vaddq_f32(v1098, v1100); - float32x4_t v132 = vsubq_f32(v1098, v1100); - float32x4_t v44 = vaddq_f32(v35, v1078); - float32x4_t v68 = vaddq_f32(v59, v1084); - float32x4_t v92 = vaddq_f32(v83, v1090); - float32x4_t v116 = vaddq_f32(v107, v1096); - float32x4_t v140 = vaddq_f32(v131, v1102); - float32x4_t v197 = vaddq_f32(v59, v131); - float32x4_t v198 = vsubq_f32(v59, v131); - float32x4_t v199 = vaddq_f32(v107, v83); - float32x4_t v200 = vsubq_f32(v107, v83); - float32x4_t v253 = vaddq_f32(v60, v132); - float32x4_t v254 = vsubq_f32(v60, v132); - float32x4_t v255 = vaddq_f32(v108, v84); - float32x4_t v256 = vsubq_f32(v108, v84); - float32x4_t v141 = vaddq_f32(v68, v140); - float32x4_t v142 = vsubq_f32(v68, v140); - float32x4_t v143 = vaddq_f32(v116, v92); - float32x4_t v144 = vsubq_f32(v116, v92); - float32x4_t v201 = vaddq_f32(v197, v199); - float32x4_t v202 = vsubq_f32(v197, v199); - float32x4_t v203 = vaddq_f32(v198, v200); - float32x4_t v225 = vrev64q_f32(v198); - float32x4_t v241 = vrev64q_f32(v200); - float32x4_t v257 = vaddq_f32(v253, v255); - float32x4_t v258 = vsubq_f32(v253, v255); - float32x4_t v259 = vaddq_f32(v254, v256); - float32x4_t v289 = vmulq_f32(v254, v288); - float32x4_t v299 = vmulq_f32(v256, v298); - float32x4_t v145 = vaddq_f32(v141, v143); - float32x4_t v146 = vsubq_f32(v141, v143); - float32x4_t v147 = vaddq_f32(v142, v144); - float32x4_t v169 = vrev64q_f32(v142); - float32x4_t v185 = vrev64q_f32(v144); - float32x4_t v204 = vaddq_f32(v201, v35); - float32x4_t v214 = vmulq_f32(v201, v213); - float32x4_t v219 = vmulq_f32(v202, v218); - float32x4_t v227 = vmulq_f32(v225, v226); - float32x4_t v233 = vrev64q_f32(v203); - float32x4_t 
v243 = vmulq_f32(v241, v242); - float32x4_t v260 = vaddq_f32(v257, v36); - float32x4_t v274 = vrev64q_f32(v257); - float32x4_t v282 = vrev64q_f32(v258); - float32x4_t v294 = vmulq_f32(v259, v293); - float32x4_t v148 = vaddq_f32(v145, v44); - float32x4_t v158 = vmulq_f32(v145, v157); - float32x4_t v163 = vmulq_f32(v146, v162); - float32x4_t v171 = vmulq_f32(v169, v170); - float32x4_t v177 = vrev64q_f32(v147); - float32x4_t v187 = vmulq_f32(v185, v186); - float32x4_t v209 = vmulq_f32(v204, v208); - float32x4_t v235 = vmulq_f32(v233, v234); - float32x4_t v266 = vrev64q_f32(v260); - float32x4_t v276 = vmulq_f32(v274, v275); - float32x4_t v284 = vmulq_f32(v282, v283); - float32x4_t v303 = vsubq_f32(v289, v294); - float32x4_t v304 = vaddq_f32(v294, v299); - float32x4_t v179 = vmulq_f32(v177, v178); - float32x4_t v188 = vaddq_f32(v148, v158); - float32x4_t v244 = vaddq_f32(v209, v214); - float32x4_t v247 = vsubq_f32(v227, v235); - float32x4_t v248 = vaddq_f32(v235, v243); - float32x4_t v268 = vmulq_f32(v266, v267); - float32x4_t v309 = vaddq_f32(v148, v209); - int16x4_t v314 = vqmovn_s32(vcvtq_n_s32_f32(v148, 15)); - float32x4_t v189 = vaddq_f32(v188, v163); - float32x4_t v190 = vsubq_f32(v188, v163); - float32x4_t v191 = vsubq_f32(v171, v179); - float32x4_t v192 = vaddq_f32(v179, v187); - float32x4_t v245 = vaddq_f32(v244, v219); - float32x4_t v246 = vsubq_f32(v244, v219); - float32x4_t v300 = vaddq_f32(v268, v276); - float32x4_t v310 = vaddq_f32(v309, v268); - float32x4_t v311 = vsubq_f32(v309, v268); - vst1_s16((int16_t *)v944, v314); - float32x4_t v193 = vaddq_f32(v189, v191); - float32x4_t v194 = vsubq_f32(v189, v191); - float32x4_t v195 = vaddq_f32(v190, v192); - float32x4_t v196 = vsubq_f32(v190, v192); - float32x4_t v249 = vaddq_f32(v245, v247); - float32x4_t v250 = vsubq_f32(v245, v247); - float32x4_t v251 = vaddq_f32(v246, v248); - float32x4_t v252 = vsubq_f32(v246, v248); - float32x4_t v301 = vaddq_f32(v300, v284); - float32x4_t v302 = vsubq_f32(v300, v284); - 
int16x4_t v322 = vqmovn_s32(vcvtq_n_s32_f32(v311, 15)); - int16x4_t v330 = vqmovn_s32(vcvtq_n_s32_f32(v310, 15)); - float32x4_t v305 = vaddq_f32(v301, v303); - float32x4_t v306 = vsubq_f32(v301, v303); - float32x4_t v307 = vaddq_f32(v302, v304); - float32x4_t v308 = vsubq_f32(v302, v304); - float32x4_t v336 = vaddq_f32(v194, v250); - int16x4_t v341 = vqmovn_s32(vcvtq_n_s32_f32(v194, 15)); - float32x4_t v363 = vaddq_f32(v196, v252); - int16x4_t v368 = vqmovn_s32(vcvtq_n_s32_f32(v196, 15)); - float32x4_t v390 = vaddq_f32(v195, v251); - int16x4_t v395 = vqmovn_s32(vcvtq_n_s32_f32(v195, 15)); - float32x4_t v417 = vaddq_f32(v193, v249); - int16x4_t v422 = vqmovn_s32(vcvtq_n_s32_f32(v193, 15)); - vst1_s16((int16_t *)v953, v322); - vst1_s16((int16_t *)v962, v330); - float32x4_t v337 = vaddq_f32(v336, v306); - float32x4_t v338 = vsubq_f32(v336, v306); - float32x4_t v364 = vaddq_f32(v363, v308); - float32x4_t v365 = vsubq_f32(v363, v308); - float32x4_t v391 = vaddq_f32(v390, v307); - float32x4_t v392 = vsubq_f32(v390, v307); - float32x4_t v418 = vaddq_f32(v417, v305); - float32x4_t v419 = vsubq_f32(v417, v305); - vst1_s16((int16_t *)v971, v341); - vst1_s16((int16_t *)v998, v368); - vst1_s16((int16_t *)v1025, v395); - vst1_s16((int16_t *)v1052, v422); - int16x4_t v349 = vqmovn_s32(vcvtq_n_s32_f32(v338, 15)); - int16x4_t v357 = vqmovn_s32(vcvtq_n_s32_f32(v337, 15)); - int16x4_t v376 = vqmovn_s32(vcvtq_n_s32_f32(v365, 15)); - int16x4_t v384 = vqmovn_s32(vcvtq_n_s32_f32(v364, 15)); - int16x4_t v403 = vqmovn_s32(vcvtq_n_s32_f32(v392, 15)); - int16x4_t v411 = vqmovn_s32(vcvtq_n_s32_f32(v391, 15)); - int16x4_t v430 = vqmovn_s32(vcvtq_n_s32_f32(v419, 15)); - int16x4_t v438 = vqmovn_s32(vcvtq_n_s32_f32(v418, 15)); - vst1_s16((int16_t *)v980, v349); - vst1_s16((int16_t *)v989, v357); - vst1_s16((int16_t *)v1007, v376); - vst1_s16((int16_t *)v1016, v384); - vst1_s16((int16_t *)v1034, v403); - vst1_s16((int16_t *)v1043, v411); - vst1_s16((int16_t *)v1061, v430); - vst1_s16((int16_t 
*)v1070, v438); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v444 * 2; j < howmany; j += 1) { - float32x2_t v497 = v5[istride]; - float v555 = -1.2500000000000000e+00F; - float v559 = 5.5901699437494745e-01F; - float v562 = 1.5388417685876268e+00F; - float v563 = -1.5388417685876268e+00F; - float v569 = 5.8778525229247325e-01F; - float v570 = -5.8778525229247325e-01F; - float v576 = 3.6327126400268028e-01F; - float v577 = -3.6327126400268028e-01F; - float v601 = -1.4999999999999998e+00F; - float v605 = 1.8749999999999998e+00F; - float v609 = -8.3852549156242107e-01F; - float v612 = -2.3082626528814396e+00F; - float v613 = 2.3082626528814396e+00F; - float v619 = -8.8167787843870971e-01F; - float v620 = 8.8167787843870971e-01F; - float v626 = -5.4490689600402031e-01F; - float v627 = 5.4490689600402031e-01F; - float v650 = 8.6602540378443871e-01F; - float v651 = -8.6602540378443871e-01F; - float v657 = -1.0825317547305484e+00F; - float v658 = 1.0825317547305484e+00F; - float v664 = 4.8412291827592718e-01F; - float v665 = -4.8412291827592718e-01F; - float32x2_t v667 = (float32x2_t){v4, v4}; - float v672 = -1.3326760640014592e+00F; - float v676 = -5.0903696045512736e-01F; - float v680 = -3.1460214309120460e-01F; - float32x2_t v468 = v5[0]; - float32x2_t v556 = (float32x2_t){v555, v555}; - float32x2_t v560 = (float32x2_t){v559, v559}; - float32x2_t v564 = (float32x2_t){v562, v563}; - float32x2_t v571 = (float32x2_t){v569, v570}; - float32x2_t v578 = (float32x2_t){v576, v577}; - float32x2_t v602 = (float32x2_t){v601, v601}; - float32x2_t v606 = (float32x2_t){v605, v605}; - float32x2_t v610 = (float32x2_t){v609, v609}; - float32x2_t v614 = (float32x2_t){v612, v613}; - float32x2_t v621 = (float32x2_t){v619, v620}; - float32x2_t v628 = (float32x2_t){v626, v627}; - float32x2_t v652 = (float32x2_t){v650, v651}; - float32x2_t v659 = (float32x2_t){v657, v658}; - float32x2_t v666 = (float32x2_t){v664, v665}; - float32x2_t v673 = (float32x2_t){v672, v672}; - float32x2_t v677 = 
(float32x2_t){v676, v676}; - float32x2_t v681 = (float32x2_t){v680, v680}; - float32x2_t v456 = v5[istride * 5]; - float32x2_t v461 = v5[istride * 10]; - float32x2_t v474 = v5[istride * 8]; - float32x2_t v479 = v5[istride * 13]; - float32x2_t v486 = v5[istride * 3]; - float32x2_t v492 = v5[istride * 11]; - float32x2_t v504 = v5[istride * 6]; - float32x2_t v510 = v5[istride * 14]; - float32x2_t v515 = v5[istride * 4]; - float32x2_t v522 = v5[istride * 9]; - float32x2_t v528 = v5[istride * 2]; - float32x2_t v533 = v5[istride * 7]; - float32x2_t v540 = v5[istride * 12]; - float32x2_t v566 = vmul_f32(v667, v564); - float32x2_t v573 = vmul_f32(v667, v571); - float32x2_t v580 = vmul_f32(v667, v578); - float32x2_t v616 = vmul_f32(v667, v614); - float32x2_t v623 = vmul_f32(v667, v621); - float32x2_t v630 = vmul_f32(v667, v628); - float32x2_t v654 = vmul_f32(v667, v652); - float32x2_t v661 = vmul_f32(v667, v659); - float32x2_t v668 = vmul_f32(v667, v666); - float32x2_t v462 = vadd_f32(v456, v461); - float32x2_t v463 = vsub_f32(v456, v461); - float32x2_t v480 = vadd_f32(v474, v479); - float32x2_t v481 = vsub_f32(v474, v479); - float32x2_t v498 = vadd_f32(v492, v497); - float32x2_t v499 = vsub_f32(v492, v497); - float32x2_t v516 = vadd_f32(v510, v515); - float32x2_t v517 = vsub_f32(v510, v515); - float32x2_t v534 = vadd_f32(v528, v533); - float32x2_t v535 = vsub_f32(v528, v533); - float32x2_t v469 = vadd_f32(v462, v468); - float32x2_t v487 = vadd_f32(v480, v486); - float32x2_t v505 = vadd_f32(v498, v504); - float32x2_t v523 = vadd_f32(v516, v522); - float32x2_t v541 = vadd_f32(v534, v540); - float32x2_t v592 = vadd_f32(v480, v534); - float32x2_t v593 = vsub_f32(v480, v534); - float32x2_t v594 = vadd_f32(v516, v498); - float32x2_t v595 = vsub_f32(v516, v498); - float32x2_t v642 = vadd_f32(v481, v535); - float32x2_t v643 = vsub_f32(v481, v535); - float32x2_t v644 = vadd_f32(v517, v499); - float32x2_t v645 = vsub_f32(v517, v499); - float32x2_t v542 = vadd_f32(v487, v541); - 
float32x2_t v543 = vsub_f32(v487, v541); - float32x2_t v544 = vadd_f32(v523, v505); - float32x2_t v545 = vsub_f32(v523, v505); - float32x2_t v596 = vadd_f32(v592, v594); - float32x2_t v597 = vsub_f32(v592, v594); - float32x2_t v598 = vadd_f32(v593, v595); - float32x2_t v617 = vrev64_f32(v593); - float32x2_t v631 = vrev64_f32(v595); - float32x2_t v646 = vadd_f32(v642, v644); - float32x2_t v647 = vsub_f32(v642, v644); - float32x2_t v648 = vadd_f32(v643, v645); - float32x2_t v674 = vmul_f32(v643, v673); - float32x2_t v682 = vmul_f32(v645, v681); - float32x2_t v546 = vadd_f32(v542, v544); - float32x2_t v547 = vsub_f32(v542, v544); - float32x2_t v548 = vadd_f32(v543, v545); - float32x2_t v567 = vrev64_f32(v543); - float32x2_t v581 = vrev64_f32(v545); - float32x2_t v599 = vadd_f32(v596, v462); - float32x2_t v607 = vmul_f32(v596, v606); - float32x2_t v611 = vmul_f32(v597, v610); - float32x2_t v618 = vmul_f32(v617, v616); - float32x2_t v624 = vrev64_f32(v598); - float32x2_t v632 = vmul_f32(v631, v630); - float32x2_t v649 = vadd_f32(v646, v463); - float32x2_t v662 = vrev64_f32(v646); - float32x2_t v669 = vrev64_f32(v647); - float32x2_t v678 = vmul_f32(v648, v677); - float32x2_t v549 = vadd_f32(v546, v469); - float32x2_t v557 = vmul_f32(v546, v556); - float32x2_t v561 = vmul_f32(v547, v560); - float32x2_t v568 = vmul_f32(v567, v566); - float32x2_t v574 = vrev64_f32(v548); - float32x2_t v582 = vmul_f32(v581, v580); - float32x2_t v603 = vmul_f32(v599, v602); - float32x2_t v625 = vmul_f32(v624, v623); - float32x2_t v655 = vrev64_f32(v649); - float32x2_t v663 = vmul_f32(v662, v661); - float32x2_t v670 = vmul_f32(v669, v668); - float32x2_t v686 = vsub_f32(v674, v678); - float32x2_t v687 = vadd_f32(v678, v682); - float32x2_t v575 = vmul_f32(v574, v573); - float32x2_t v583 = vadd_f32(v549, v557); - float32x2_t v633 = vadd_f32(v603, v607); - float32x2_t v636 = vsub_f32(v618, v625); - float32x2_t v637 = vadd_f32(v625, v632); - float32x2_t v656 = vmul_f32(v655, v654); - float32x2_t 
v692 = vadd_f32(v549, v603); - int16x4_t v697 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v549, 15), (int32x2_t){0, 0})); - float32x2_t v584 = vadd_f32(v583, v561); - float32x2_t v585 = vsub_f32(v583, v561); - float32x2_t v586 = vsub_f32(v568, v575); - float32x2_t v587 = vadd_f32(v575, v582); - float32x2_t v634 = vadd_f32(v633, v611); - float32x2_t v635 = vsub_f32(v633, v611); - float32x2_t v683 = vadd_f32(v656, v663); - float32x2_t v693 = vadd_f32(v692, v656); - float32x2_t v694 = vsub_f32(v692, v656); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v697), 0); - float32x2_t v588 = vadd_f32(v584, v586); - float32x2_t v589 = vsub_f32(v584, v586); - float32x2_t v590 = vadd_f32(v585, v587); - float32x2_t v591 = vsub_f32(v585, v587); - float32x2_t v638 = vadd_f32(v634, v636); - float32x2_t v639 = vsub_f32(v634, v636); - float32x2_t v640 = vadd_f32(v635, v637); - float32x2_t v641 = vsub_f32(v635, v637); - float32x2_t v684 = vadd_f32(v683, v670); - float32x2_t v685 = vsub_f32(v683, v670); - int16x4_t v703 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v694, 15), (int32x2_t){0, 0})); - int16x4_t v709 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v693, 15), (int32x2_t){0, 0})); - float32x2_t v688 = vadd_f32(v684, v686); - float32x2_t v689 = vsub_f32(v684, v686); - float32x2_t v690 = vadd_f32(v685, v687); - float32x2_t v691 = vsub_f32(v685, v687); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v703), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v709), 0); - float32x2_t v713 = vadd_f32(v589, v639); - int16x4_t v718 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v589, 15), (int32x2_t){0, 0})); - float32x2_t v734 = vadd_f32(v591, v641); - int16x4_t v739 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v591, 15), (int32x2_t){0, 0})); - float32x2_t v755 = vadd_f32(v590, v640); - int16x4_t v760 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v590, 15), (int32x2_t){0, 0})); - float32x2_t v776 = vadd_f32(v588, v638); - int16x4_t v781 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v588, 
15), (int32x2_t){0, 0})); - float32x2_t v714 = vadd_f32(v713, v689); - float32x2_t v715 = vsub_f32(v713, v689); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v718), 0); - float32x2_t v735 = vadd_f32(v734, v691); - float32x2_t v736 = vsub_f32(v734, v691); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v739), 0); - float32x2_t v756 = vadd_f32(v755, v690); - float32x2_t v757 = vsub_f32(v755, v690); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v760), 0); - float32x2_t v777 = vadd_f32(v776, v688); - float32x2_t v778 = vsub_f32(v776, v688); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v781), 0); - int16x4_t v724 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v715, 15), (int32x2_t){0, 0})); - int16x4_t v730 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v714, 15), (int32x2_t){0, 0})); - int16x4_t v745 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v736, 15), (int32x2_t){0, 0})); - int16x4_t v751 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v735, 15), (int32x2_t){0, 0})); - int16x4_t v766 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v757, 15), (int32x2_t){0, 0})); - int16x4_t v772 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v756, 15), (int32x2_t){0, 0})); - int16x4_t v787 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v778, 15), (int32x2_t){0, 0})); - int16x4_t v793 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v777, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v724), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v730), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v745), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v751), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v766), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v772), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v787), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v793), 0); + float32x2_t v237 = (float32x2_t){v236, v236}; + float32x2_t v241 = (float32x2_t){v240, v240}; + 
float32x2_t v245 = (float32x2_t){v244, v244}; + float32x2_t v20 = v5[istride * 5]; + float32x2_t v25 = v5[istride * 10]; + float32x2_t v38 = v5[istride * 8]; + float32x2_t v43 = v5[istride * 13]; + float32x2_t v50 = v5[istride * 3]; + float32x2_t v56 = v5[istride * 11]; + float32x2_t v68 = v5[istride * 6]; + float32x2_t v74 = v5[istride * 14]; + float32x2_t v79 = v5[istride * 4]; + float32x2_t v86 = v5[istride * 9]; + float32x2_t v92 = v5[istride * 2]; + float32x2_t v97 = v5[istride * 7]; + float32x2_t v104 = v5[istride * 12]; + float32x2_t v130 = vmul_f32(v231, v128); + float32x2_t v137 = vmul_f32(v231, v135); + float32x2_t v144 = vmul_f32(v231, v142); + float32x2_t v180 = vmul_f32(v231, v178); + float32x2_t v187 = vmul_f32(v231, v185); + float32x2_t v194 = vmul_f32(v231, v192); + float32x2_t v218 = vmul_f32(v231, v216); + float32x2_t v225 = vmul_f32(v231, v223); + float32x2_t v232 = vmul_f32(v231, v230); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v44 = vadd_f32(v38, v43); + float32x2_t v45 = vsub_f32(v38, v43); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v80 = vadd_f32(v74, v79); + float32x2_t v81 = vsub_f32(v74, v79); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v33 = vadd_f32(v26, v32); + float32x2_t v51 = vadd_f32(v44, v50); + float32x2_t v69 = vadd_f32(v62, v68); + float32x2_t v87 = vadd_f32(v80, v86); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v156 = vadd_f32(v44, v98); + float32x2_t v157 = vsub_f32(v44, v98); + float32x2_t v158 = vadd_f32(v80, v62); + float32x2_t v159 = vsub_f32(v80, v62); + float32x2_t v206 = vadd_f32(v45, v99); + float32x2_t v207 = vsub_f32(v45, v99); + float32x2_t v208 = vadd_f32(v81, v63); + float32x2_t v209 = vsub_f32(v81, v63); + float32x2_t v106 = vadd_f32(v51, v105); + float32x2_t v107 = vsub_f32(v51, v105); + float32x2_t v108 = vadd_f32(v87, v69); + float32x2_t v109 = 
vsub_f32(v87, v69); + float32x2_t v160 = vadd_f32(v156, v158); + float32x2_t v161 = vsub_f32(v156, v158); + float32x2_t v162 = vadd_f32(v157, v159); + float32x2_t v181 = vrev64_f32(v157); + float32x2_t v195 = vrev64_f32(v159); + float32x2_t v210 = vadd_f32(v206, v208); + float32x2_t v211 = vsub_f32(v206, v208); + float32x2_t v212 = vadd_f32(v207, v209); + float32x2_t v238 = vmul_f32(v207, v237); + float32x2_t v246 = vmul_f32(v209, v245); + float32x2_t v110 = vadd_f32(v106, v108); + float32x2_t v111 = vsub_f32(v106, v108); + float32x2_t v112 = vadd_f32(v107, v109); + float32x2_t v131 = vrev64_f32(v107); + float32x2_t v145 = vrev64_f32(v109); + float32x2_t v163 = vadd_f32(v160, v26); + float32x2_t v171 = vmul_f32(v160, v170); + float32x2_t v175 = vmul_f32(v161, v174); + float32x2_t v182 = vmul_f32(v181, v180); + float32x2_t v188 = vrev64_f32(v162); + float32x2_t v196 = vmul_f32(v195, v194); + float32x2_t v213 = vadd_f32(v210, v27); + float32x2_t v226 = vrev64_f32(v210); + float32x2_t v233 = vrev64_f32(v211); + float32x2_t v242 = vmul_f32(v212, v241); + float32x2_t v113 = vadd_f32(v110, v33); + float32x2_t v121 = vmul_f32(v110, v120); + float32x2_t v125 = vmul_f32(v111, v124); + float32x2_t v132 = vmul_f32(v131, v130); + float32x2_t v138 = vrev64_f32(v112); + float32x2_t v146 = vmul_f32(v145, v144); + float32x2_t v167 = vmul_f32(v163, v166); + float32x2_t v189 = vmul_f32(v188, v187); + float32x2_t v219 = vrev64_f32(v213); + float32x2_t v227 = vmul_f32(v226, v225); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v250 = vsub_f32(v238, v242); + float32x2_t v251 = vadd_f32(v242, v246); + float32x2_t v139 = vmul_f32(v138, v137); + float32x2_t v147 = vadd_f32(v113, v121); + float32x2_t v197 = vadd_f32(v167, v171); + float32x2_t v200 = vsub_f32(v182, v189); + float32x2_t v201 = vadd_f32(v189, v196); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v256 = vadd_f32(v113, v167); + int16x4_t v261 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v113, 15), 
(int32x2_t){0, 0})); + float32x2_t v148 = vadd_f32(v147, v125); + float32x2_t v149 = vsub_f32(v147, v125); + float32x2_t v150 = vsub_f32(v132, v139); + float32x2_t v151 = vadd_f32(v139, v146); + float32x2_t v198 = vadd_f32(v197, v175); + float32x2_t v199 = vsub_f32(v197, v175); + float32x2_t v247 = vadd_f32(v220, v227); + float32x2_t v257 = vadd_f32(v256, v220); + float32x2_t v258 = vsub_f32(v256, v220); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v261), 0); + float32x2_t v152 = vadd_f32(v148, v150); + float32x2_t v153 = vsub_f32(v148, v150); + float32x2_t v154 = vadd_f32(v149, v151); + float32x2_t v155 = vsub_f32(v149, v151); + float32x2_t v202 = vadd_f32(v198, v200); + float32x2_t v203 = vsub_f32(v198, v200); + float32x2_t v204 = vadd_f32(v199, v201); + float32x2_t v205 = vsub_f32(v199, v201); + float32x2_t v248 = vadd_f32(v247, v234); + float32x2_t v249 = vsub_f32(v247, v234); + int16x4_t v267 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v258, 15), (int32x2_t){0, 0})); + int16x4_t v273 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v257, 15), (int32x2_t){0, 0})); + float32x2_t v252 = vadd_f32(v248, v250); + float32x2_t v253 = vsub_f32(v248, v250); + float32x2_t v254 = vadd_f32(v249, v251); + float32x2_t v255 = vsub_f32(v249, v251); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v267), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v273), 0); + float32x2_t v277 = vadd_f32(v153, v203); + int16x4_t v282 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v153, 15), (int32x2_t){0, 0})); + float32x2_t v298 = vadd_f32(v155, v205); + int16x4_t v303 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v155, 15), (int32x2_t){0, 0})); + float32x2_t v319 = vadd_f32(v154, v204); + int16x4_t v324 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v154, 15), (int32x2_t){0, 0})); + float32x2_t v340 = vadd_f32(v152, v202); + int16x4_t v345 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v152, 15), (int32x2_t){0, 0})); + float32x2_t v278 = vadd_f32(v277, v253); + float32x2_t v279 = 
vsub_f32(v277, v253); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v282), 0); + float32x2_t v299 = vadd_f32(v298, v255); + float32x2_t v300 = vsub_f32(v298, v255); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v303), 0); + float32x2_t v320 = vadd_f32(v319, v254); + float32x2_t v321 = vsub_f32(v319, v254); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v324), 0); + float32x2_t v341 = vadd_f32(v340, v252); + float32x2_t v342 = vsub_f32(v340, v252); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v345), 0); + int16x4_t v288 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v279, 15), (int32x2_t){0, 0})); + int16x4_t v294 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v278, 15), (int32x2_t){0, 0})); + int16x4_t v309 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v300, 15), (int32x2_t){0, 0})); + int16x4_t v315 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v299, 15), (int32x2_t){0, 0})); + int16x4_t v330 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v321, 15), (int32x2_t){0, 0})); + int16x4_t v336 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v320, 15), (int32x2_t){0, 0})); + int16x4_t v351 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v342, 15), (int32x2_t){0, 0})); + int16x4_t v357 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v341, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v288), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v294), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v309), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v315), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v330), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v336), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v351), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v357), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -6700,116 +4297,66 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double 
*)v556)[0])); svfloat32_t v751 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v565)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v723), "w"(v725)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v723), "w"(v725)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v729), "w"(v731)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v729), "w"(v731)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v735), "w"(v737)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v735), "w"(v737)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v741), "w"(v743)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v741), "w"(v743)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v747), "w"(v749)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v747), "w"(v749)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v727)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v733)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v739)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v745)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v751)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v56), "w"(v128)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v56), "w"(v128)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v104), "w"(v80)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v104), "w"(v80)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v57), "w"(v129)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v57), "w"(v129)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : 
"w"(v105), "w"(v81)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v105), "w"(v81)); - svfloat32_t v138; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v65), "w"(v137)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v65), "w"(v137)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v113), "w"(v89)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v113), "w"(v89)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v191), "w"(v193)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v191), "w"(v193)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v192), "w"(v194)); - svfloat32_t zero220; - asm volatile("mov %0.s, #0" : "=w"(zero220)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v723, v725); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v723, v725); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v729, v731); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v729, v731); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v735, v737); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v735, v737); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v741, v743); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v741, v743); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v747, v749); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v747, v749); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v727); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v733); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v739); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v745); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v751); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v56, v128); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v56, v128); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v104, v80); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v104, v80); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v57, v129); + 
svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v57, v129); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v105, v81); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v105, v81); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v65, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v65, v137); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v113, v89); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v113, v89); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v191, v193); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v192, v194); + svfloat32_t zero220 = svdup_n_f32(0); svfloat32_t v220 = svcmla_f32_x(pred_full, zero220, v577, v192, 90); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v244), "w"(v246)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v244), "w"(v246)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v245), "w"(v247)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v247), "w"(v585)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v138), "w"(v140)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v138), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v139), "w"(v141)); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v244, v246); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v245, v247); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v247, v585); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v138, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v139, v141); + svfloat32_t zero167 = svdup_n_f32(0); svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v571, v139, 90); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : 
"w"(v195), "w"(v32)); - svfloat32_t v208; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v195), "w"(v575)); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v195, v32); + svfloat32_t v208 = svmul_f32_x(svptrue_b32(), v195, v575); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v578, v197, 90); - svfloat32_t v251; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v248), "w"(v33)); - svfloat32_t zero272; - asm volatile("mov %0.s, #0" : "=w"(zero272)); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v248, v33); + svfloat32_t zero272 = svdup_n_f32(0); svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v582, v249, 90); - svfloat32_t v282; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v250), "w"(v584)); - svfloat32_t v145; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v142), "w"(v41)); - svfloat32_t zero174; - asm volatile("mov %0.s, #0" : "=w"(zero174)); + svfloat32_t v282 = svmul_f32_x(svptrue_b32(), v250, v584); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v142, v41); + svfloat32_t zero174 = svdup_n_f32(0); svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v572, v144, 90); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v220), "w"(v227)); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v220, v227); svfloat32_t v239 = svcmla_f32_x(pred_full, v227, v579, v194, 90); - svfloat32_t zero258; - asm volatile("mov %0.s, #0" : "=w"(zero258)); + svfloat32_t zero258 = svdup_n_f32(0); svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v580, v251, 90); svfloat32_t v291 = svnmls_f32_x(pred_full, v282, v245, v583); svfloat32_t v292 = svmla_f32_x(pred_full, v287, v250, v584); svfloat32_t v182 = svmla_f32_x(pred_full, v145, v142, v569); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v167), "w"(v174)); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v167, v174); svfloat32_t v186 = svcmla_f32_x(pred_full, v174, v573, 
v141, 90); svfloat32_t v235 = svmla_f32_x(pred_full, v208, v198, v574); svfloat32_t v288 = svcmla_f32_x(pred_full, v258, v581, v248, 90); @@ -6823,39 +4370,23 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, svfloat32_t v184 = svmls_f32_x(pred_full, v182, v143, v570); svfloat32_t v236 = svmla_f32_x(pred_full, v235, v196, v576); svfloat32_t v237 = svmls_f32_x(pred_full, v235, v196, v576); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), "w"(v272)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v288), "w"(v272)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v297), "w"(v258)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v297), "w"(v258)); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v288, v272); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v288, v272); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v258); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v297, v258); svst1w_u64(pred_full, (unsigned *)(v593), svreinterpret_u64_s16(v302)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v183), "w"(v185)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v183), "w"(v185)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v184), "w"(v186)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v184), "w"(v186)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v236), "w"(v238)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v236), "w"(v238)); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v237), "w"(v239)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v237), "w"(v239)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v289), "w"(v291)); - svfloat32_t v294; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v289), "w"(v291)); - svfloat32_t v295; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v290), "w"(v292)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v290), "w"(v292)); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v184, v186); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v237, v239); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v289, v291); + svfloat32_t v294 = svsub_f32_x(svptrue_b32(), v289, v291); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v290, v292); svint16_t v310 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v299, (float)(1ULL << 31ULL)))), @@ -6866,29 +4397,25 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v298, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v188), "w"(v241)); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v188, v241); svint16_t v329 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v188, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v351; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v190), "w"(v243)); + svfloat32_t v351 = svadd_f32_x(svptrue_b32(), v190, v243); svint16_t v356 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v190, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v189), "w"(v242)); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v189, v242); svint16_t v383 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v189, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v187), "w"(v240)); + svfloat32_t v405 = svadd_f32_x(svptrue_b32(), v187, v240); svint16_t v410 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v187, (float)(1ULL << 31ULL)))), @@ -6896,22 +4423,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu15(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v602), svreinterpret_u64_s16(v310)); svst1w_u64(pred_full, (unsigned *)(v611), svreinterpret_u64_s16(v318)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v324), "w"(v294)); - svfloat32_t v326; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v324), "w"(v294)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v351), "w"(v296)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v351), "w"(v296)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v378), "w"(v295)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v378), "w"(v295)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v405), "w"(v293)); - svfloat32_t v407; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v405), "w"(v293)); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v324, v294); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v324, v294); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v351, v296); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v351, v296); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v378, v295); + 
svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v378, v295); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v405, v293); + svfloat32_t v407 = svsub_f32_x(svptrue_b32(), v405, v293); svst1w_u64(pred_full, (unsigned *)(v620), svreinterpret_u64_s16(v329)); svst1w_u64(pred_full, (unsigned *)(v647), svreinterpret_u64_s16(v356)); svst1w_u64(pred_full, (unsigned *)(v674), svreinterpret_u64_s16(v383)); @@ -6978,424 +4497,200 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v459 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v227 = 1.0000000000000000e+00F; - float v228 = -1.0000000000000000e+00F; - float v236 = -7.0710678118654746e-01F; - float v244 = 7.0710678118654757e-01F; - float v248 = 9.2387953251128674e-01F; - float v249 = -9.2387953251128674e-01F; - float v257 = 5.4119610014619690e-01F; - float v265 = -1.3065629648763766e+00F; - float32x2_t v267 = (float32x2_t){v4, v4}; - float v273 = 3.8268343236508984e-01F; - float v278 = 1.3065629648763766e+00F; - float v283 = -5.4119610014619690e-01F; - const float32x2_t *v906 = &v5[istride]; - int32_t *v988 = &v6[ostride]; - float32x2_t v229 = (float32x2_t){v227, v228}; - float32x2_t v237 = (float32x2_t){v244, v236}; - float32x2_t v245 = (float32x2_t){v244, v244}; - float32x2_t v250 = (float32x2_t){v248, v249}; - float32x2_t v258 = (float32x2_t){v283, v257}; - float32x2_t v266 = (float32x2_t){v278, v265}; - float32x2_t v274 = (float32x2_t){v273, v273}; - float32x2_t v279 = (float32x2_t){v278, v278}; - float32x2_t v284 = (float32x2_t){v283, v283}; - const float32x2_t *v834 = &v5[0]; - int32_t *v979 = &v6[0]; - float32x4_t v1134 = vld1q_f32((const float32_t *)v906); - float32x2_t v231 = vmul_f32(v267, v229); - float32x2_t v239 = vmul_f32(v267, v237); - float32x4_t v246 = vcombine_f32(v245, v245); - float32x2_t v252 = vmul_f32(v267, v250); - float32x2_t v260 
= vmul_f32(v267, v258); - float32x2_t v268 = vmul_f32(v267, v266); - float32x4_t v275 = vcombine_f32(v274, v274); - float32x4_t v280 = vcombine_f32(v279, v279); - float32x4_t v285 = vcombine_f32(v284, v284); - const float32x2_t *v843 = &v5[istride * 8]; - const float32x2_t *v852 = &v5[istride * 4]; - const float32x2_t *v861 = &v5[istride * 12]; - const float32x2_t *v870 = &v5[istride * 2]; - const float32x2_t *v879 = &v5[istride * 10]; - const float32x2_t *v888 = &v5[istride * 6]; - const float32x2_t *v897 = &v5[istride * 14]; - const float32x2_t *v915 = &v5[istride * 9]; - const float32x2_t *v924 = &v5[istride * 5]; - const float32x2_t *v933 = &v5[istride * 13]; - const float32x2_t *v942 = &v5[istride * 3]; - const float32x2_t *v951 = &v5[istride * 11]; - const float32x2_t *v960 = &v5[istride * 7]; - const float32x2_t *v969 = &v5[istride * 15]; - int32_t *v997 = &v6[ostride * 2]; - int32_t *v1006 = &v6[ostride * 3]; - int32_t *v1015 = &v6[ostride * 4]; - int32_t *v1024 = &v6[ostride * 5]; - int32_t *v1033 = &v6[ostride * 6]; - int32_t *v1042 = &v6[ostride * 7]; - int32_t *v1051 = &v6[ostride * 8]; - int32_t *v1060 = &v6[ostride * 9]; - int32_t *v1069 = &v6[ostride * 10]; - int32_t *v1078 = &v6[ostride * 11]; - int32_t *v1087 = &v6[ostride * 12]; - int32_t *v1096 = &v6[ostride * 13]; - int32_t *v1105 = &v6[ostride * 14]; - int32_t *v1114 = &v6[ostride * 15]; - float32x4_t v1118 = vld1q_f32((const float32_t *)v834); - float32x4_t v233 = vcombine_f32(v231, v231); - float32x4_t v241 = vcombine_f32(v239, v239); - float32x4_t v254 = vcombine_f32(v252, v252); - float32x4_t v262 = vcombine_f32(v260, v260); - float32x4_t v270 = vcombine_f32(v268, v268); - float32x4_t v1120 = vld1q_f32((const float32_t *)v843); - float32x4_t v1122 = vld1q_f32((const float32_t *)v852); - float32x4_t v1124 = vld1q_f32((const float32_t *)v861); - float32x4_t v1126 = vld1q_f32((const float32_t *)v870); - float32x4_t v1128 = vld1q_f32((const float32_t *)v879); - float32x4_t v1130 = 
vld1q_f32((const float32_t *)v888); - float32x4_t v1132 = vld1q_f32((const float32_t *)v897); - float32x4_t v1136 = vld1q_f32((const float32_t *)v915); - float32x4_t v1138 = vld1q_f32((const float32_t *)v924); - float32x4_t v1140 = vld1q_f32((const float32_t *)v933); - float32x4_t v1142 = vld1q_f32((const float32_t *)v942); - float32x4_t v1144 = vld1q_f32((const float32_t *)v951); - float32x4_t v1146 = vld1q_f32((const float32_t *)v960); - float32x4_t v1148 = vld1q_f32((const float32_t *)v969); - float32x4_t v35 = vaddq_f32(v1118, v1120); - float32x4_t v36 = vsubq_f32(v1118, v1120); - float32x4_t v51 = vaddq_f32(v1122, v1124); - float32x4_t v52 = vsubq_f32(v1122, v1124); - float32x4_t v67 = vaddq_f32(v1126, v1128); - float32x4_t v68 = vsubq_f32(v1126, v1128); - float32x4_t v83 = vaddq_f32(v1130, v1132); - float32x4_t v84 = vsubq_f32(v1130, v1132); - float32x4_t v99 = vaddq_f32(v1134, v1136); - float32x4_t v100 = vsubq_f32(v1134, v1136); - float32x4_t v115 = vaddq_f32(v1138, v1140); - float32x4_t v116 = vsubq_f32(v1138, v1140); - float32x4_t v131 = vaddq_f32(v1142, v1144); - float32x4_t v132 = vsubq_f32(v1142, v1144); - float32x4_t v147 = vaddq_f32(v1146, v1148); - float32x4_t v148 = vsubq_f32(v1146, v1148); - float32x4_t v149 = vaddq_f32(v35, v51); - float32x4_t v150 = vsubq_f32(v35, v51); - float32x4_t v151 = vaddq_f32(v67, v83); - float32x4_t v152 = vsubq_f32(v67, v83); - float32x4_t v153 = vaddq_f32(v99, v115); - float32x4_t v154 = vsubq_f32(v99, v115); - float32x4_t v155 = vaddq_f32(v131, v147); - float32x4_t v156 = vsubq_f32(v131, v147); - float32x4_t v165 = vaddq_f32(v68, v84); - float32x4_t v166 = vsubq_f32(v68, v84); - float32x4_t v167 = vaddq_f32(v100, v148); - float32x4_t v168 = vsubq_f32(v100, v148); - float32x4_t v169 = vaddq_f32(v116, v132); - float32x4_t v170 = vsubq_f32(v116, v132); - float32x4_t v232 = vrev64q_f32(v52); - float32x4_t v157 = vaddq_f32(v149, v151); - float32x4_t v158 = vsubq_f32(v149, v151); - float32x4_t v159 = vaddq_f32(v153, v155); 
- float32x4_t v160 = vsubq_f32(v153, v155); - float32x4_t v163 = vaddq_f32(v154, v156); - float32x4_t v164 = vsubq_f32(v154, v156); - float32x4_t v171 = vaddq_f32(v167, v169); - float32x4_t v172 = vaddq_f32(v168, v170); - float32x4_t v206 = vrev64q_f32(v152); - float32x4_t v234 = vmulq_f32(v232, v233); - float32x4_t v240 = vrev64q_f32(v165); - float32x4_t v247 = vmulq_f32(v166, v246); - float32x4_t v261 = vrev64q_f32(v167); - float32x4_t v269 = vrev64q_f32(v169); - float32x4_t v281 = vmulq_f32(v168, v280); - float32x4_t v286 = vmulq_f32(v170, v285); - float32x4_t v161 = vaddq_f32(v157, v159); - float32x4_t v162 = vsubq_f32(v157, v159); - float32x4_t v193 = vrev64q_f32(v160); - float32x4_t v208 = vmulq_f32(v206, v233); - float32x4_t v214 = vrev64q_f32(v163); - float32x4_t v221 = vmulq_f32(v164, v246); - float32x4_t v242 = vmulq_f32(v240, v241); - float32x4_t v253 = vrev64q_f32(v171); - float32x4_t v263 = vmulq_f32(v261, v262); - float32x4_t v271 = vmulq_f32(v269, v270); - float32x4_t v276 = vmulq_f32(v172, v275); - float32x4_t v297 = vaddq_f32(v36, v247); - float32x4_t v298 = vsubq_f32(v36, v247); - float32x4_t v195 = vmulq_f32(v193, v233); - float32x4_t v216 = vmulq_f32(v214, v241); - float32x4_t v255 = vmulq_f32(v253, v254); - float32x4_t v289 = vaddq_f32(v150, v221); - float32x4_t v291 = vsubq_f32(v150, v221); - float32x4_t v299 = vaddq_f32(v234, v242); - float32x4_t v300 = vsubq_f32(v234, v242); - float32x4_t v303 = vsubq_f32(v281, v276); - float32x4_t v304 = vsubq_f32(v286, v276); - float32x4_t v305 = vsubq_f32(v276, v281); - float32x4_t v306 = vsubq_f32(v276, v286); - int16x4_t v333 = vqmovn_s32(vcvtq_n_s32_f32(v161, 15)); - int16x4_t v397 = vqmovn_s32(vcvtq_n_s32_f32(v162, 15)); - float32x4_t v287 = vaddq_f32(v158, v195); - float32x4_t v288 = vsubq_f32(v158, v195); - float32x4_t v290 = vaddq_f32(v208, v216); - float32x4_t v292 = vsubq_f32(v216, v208); - float32x4_t v301 = vaddq_f32(v255, v263); - float32x4_t v302 = vsubq_f32(v255, v271); - float32x4_t v307 = 
vaddq_f32(v297, v303); - float32x4_t v308 = vsubq_f32(v297, v303); - float32x4_t v309 = vaddq_f32(v297, v305); - float32x4_t v310 = vsubq_f32(v297, v305); - float32x4_t v311 = vaddq_f32(v298, v300); - float32x4_t v312 = vsubq_f32(v298, v300); - float32x4_t v313 = vaddq_f32(v298, v306); - float32x4_t v314 = vsubq_f32(v298, v306); - vst1_s16((int16_t *)v979, v333); - vst1_s16((int16_t *)v1051, v397); - float32x4_t v293 = vaddq_f32(v289, v290); - float32x4_t v294 = vaddq_f32(v291, v292); - float32x4_t v295 = vsubq_f32(v291, v292); - float32x4_t v296 = vsubq_f32(v289, v290); - float32x4_t v317 = vaddq_f32(v301, v299); - float32x4_t v318 = vsubq_f32(v301, v299); - float32x4_t v319 = vaddq_f32(v302, v304); - float32x4_t v320 = vsubq_f32(v302, v304); - float32x4_t v321 = vaddq_f32(v302, v300); - float32x4_t v322 = vsubq_f32(v302, v300); - int16x4_t v365 = vqmovn_s32(vcvtq_n_s32_f32(v288, 15)); - int16x4_t v429 = vqmovn_s32(vcvtq_n_s32_f32(v287, 15)); - float32x4_t v323 = vaddq_f32(v307, v317); - float32x4_t v324 = vaddq_f32(v308, v318); - float32x4_t v325 = vsubq_f32(v309, v318); - float32x4_t v326 = vsubq_f32(v310, v317); - float32x4_t v327 = vaddq_f32(v311, v319); - float32x4_t v328 = vaddq_f32(v312, v320); - float32x4_t v329 = vsubq_f32(v313, v322); - float32x4_t v330 = vsubq_f32(v314, v321); - int16x4_t v349 = vqmovn_s32(vcvtq_n_s32_f32(v296, 15)); - int16x4_t v381 = vqmovn_s32(vcvtq_n_s32_f32(v295, 15)); - int16x4_t v413 = vqmovn_s32(vcvtq_n_s32_f32(v294, 15)); - int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v293, 15)); - vst1_s16((int16_t *)v1015, v365); - vst1_s16((int16_t *)v1087, v429); - int16x4_t v341 = vqmovn_s32(vcvtq_n_s32_f32(v326, 15)); - int16x4_t v357 = vqmovn_s32(vcvtq_n_s32_f32(v329, 15)); - int16x4_t v373 = vqmovn_s32(vcvtq_n_s32_f32(v330, 15)); - int16x4_t v389 = vqmovn_s32(vcvtq_n_s32_f32(v325, 15)); - int16x4_t v405 = vqmovn_s32(vcvtq_n_s32_f32(v324, 15)); - int16x4_t v421 = vqmovn_s32(vcvtq_n_s32_f32(v327, 15)); - int16x4_t v437 = 
vqmovn_s32(vcvtq_n_s32_f32(v328, 15)); - int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v323, 15)); - vst1_s16((int16_t *)v997, v349); - vst1_s16((int16_t *)v1033, v381); - vst1_s16((int16_t *)v1069, v413); - vst1_s16((int16_t *)v1105, v445); - vst1_s16((int16_t *)v988, v341); - vst1_s16((int16_t *)v1006, v357); - vst1_s16((int16_t *)v1024, v373); - vst1_s16((int16_t *)v1042, v389); - vst1_s16((int16_t *)v1060, v405); - vst1_s16((int16_t *)v1078, v421); - vst1_s16((int16_t *)v1096, v437); - vst1_s16((int16_t *)v1114, v453); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v459 * 2; j < howmany; j += 1) { - float32x2_t v519 = v5[istride]; - float v632 = 1.0000000000000000e+00F; - float v633 = -1.0000000000000000e+00F; - float v640 = -7.0710678118654746e-01F; - float v647 = 7.0710678118654757e-01F; - float v650 = 9.2387953251128674e-01F; - float v651 = -9.2387953251128674e-01F; - float v658 = 5.4119610014619690e-01F; - float v665 = -1.3065629648763766e+00F; - float32x2_t v667 = (float32x2_t){v4, v4}; - float v672 = 3.8268343236508984e-01F; - float v676 = 1.3065629648763766e+00F; - float v680 = -5.4119610014619690e-01F; - float32x2_t v471 = v5[0]; - float32x2_t v634 = (float32x2_t){v632, v633}; - float32x2_t v641 = (float32x2_t){v647, v640}; - float32x2_t v648 = (float32x2_t){v647, v647}; - float32x2_t v652 = (float32x2_t){v650, v651}; - float32x2_t v659 = (float32x2_t){v680, v658}; - float32x2_t v666 = (float32x2_t){v676, v665}; - float32x2_t v673 = (float32x2_t){v672, v672}; - float32x2_t v677 = (float32x2_t){v676, v676}; - float32x2_t v681 = (float32x2_t){v680, v680}; - float32x2_t v476 = v5[istride * 8]; - float32x2_t v483 = v5[istride * 4]; - float32x2_t v488 = v5[istride * 12]; - float32x2_t v495 = v5[istride * 2]; - float32x2_t v500 = v5[istride * 10]; - float32x2_t v507 = v5[istride * 6]; - float32x2_t v512 = v5[istride * 14]; - float32x2_t v524 = v5[istride * 9]; - float32x2_t v531 = v5[istride * 5]; - float32x2_t v536 = v5[istride * 13]; - float32x2_t v543 = 
v5[istride * 3]; - float32x2_t v548 = v5[istride * 11]; - float32x2_t v555 = v5[istride * 7]; - float32x2_t v560 = v5[istride * 15]; - float32x2_t v636 = vmul_f32(v667, v634); - float32x2_t v643 = vmul_f32(v667, v641); - float32x2_t v654 = vmul_f32(v667, v652); - float32x2_t v661 = vmul_f32(v667, v659); - float32x2_t v668 = vmul_f32(v667, v666); - float32x2_t v477 = vadd_f32(v471, v476); - float32x2_t v478 = vsub_f32(v471, v476); - float32x2_t v489 = vadd_f32(v483, v488); - float32x2_t v490 = vsub_f32(v483, v488); - float32x2_t v501 = vadd_f32(v495, v500); - float32x2_t v502 = vsub_f32(v495, v500); - float32x2_t v513 = vadd_f32(v507, v512); - float32x2_t v514 = vsub_f32(v507, v512); - float32x2_t v525 = vadd_f32(v519, v524); - float32x2_t v526 = vsub_f32(v519, v524); - float32x2_t v537 = vadd_f32(v531, v536); - float32x2_t v538 = vsub_f32(v531, v536); - float32x2_t v549 = vadd_f32(v543, v548); - float32x2_t v550 = vsub_f32(v543, v548); - float32x2_t v561 = vadd_f32(v555, v560); - float32x2_t v562 = vsub_f32(v555, v560); - float32x2_t v563 = vadd_f32(v477, v489); - float32x2_t v564 = vsub_f32(v477, v489); - float32x2_t v565 = vadd_f32(v501, v513); - float32x2_t v566 = vsub_f32(v501, v513); - float32x2_t v567 = vadd_f32(v525, v537); - float32x2_t v568 = vsub_f32(v525, v537); - float32x2_t v569 = vadd_f32(v549, v561); - float32x2_t v570 = vsub_f32(v549, v561); - float32x2_t v579 = vadd_f32(v502, v514); - float32x2_t v580 = vsub_f32(v502, v514); - float32x2_t v581 = vadd_f32(v526, v562); - float32x2_t v582 = vsub_f32(v526, v562); - float32x2_t v583 = vadd_f32(v538, v550); - float32x2_t v584 = vsub_f32(v538, v550); - float32x2_t v637 = vrev64_f32(v490); - float32x2_t v571 = vadd_f32(v563, v565); - float32x2_t v572 = vsub_f32(v563, v565); - float32x2_t v573 = vadd_f32(v567, v569); - float32x2_t v574 = vsub_f32(v567, v569); - float32x2_t v577 = vadd_f32(v568, v570); - float32x2_t v578 = vsub_f32(v568, v570); - float32x2_t v585 = vadd_f32(v581, v583); - float32x2_t v586 = 
vadd_f32(v582, v584); - float32x2_t v615 = vrev64_f32(v566); - float32x2_t v638 = vmul_f32(v637, v636); - float32x2_t v644 = vrev64_f32(v579); - float32x2_t v649 = vmul_f32(v580, v648); - float32x2_t v662 = vrev64_f32(v581); - float32x2_t v669 = vrev64_f32(v583); - float32x2_t v678 = vmul_f32(v582, v677); - float32x2_t v682 = vmul_f32(v584, v681); - float32x2_t v575 = vadd_f32(v571, v573); - float32x2_t v576 = vsub_f32(v571, v573); - float32x2_t v604 = vrev64_f32(v574); - float32x2_t v616 = vmul_f32(v615, v636); - float32x2_t v622 = vrev64_f32(v577); - float32x2_t v627 = vmul_f32(v578, v648); - float32x2_t v645 = vmul_f32(v644, v643); - float32x2_t v655 = vrev64_f32(v585); - float32x2_t v663 = vmul_f32(v662, v661); - float32x2_t v670 = vmul_f32(v669, v668); - float32x2_t v674 = vmul_f32(v586, v673); - float32x2_t v693 = vadd_f32(v478, v649); - float32x2_t v694 = vsub_f32(v478, v649); - float32x2_t v605 = vmul_f32(v604, v636); - float32x2_t v623 = vmul_f32(v622, v643); - float32x2_t v656 = vmul_f32(v655, v654); - float32x2_t v685 = vadd_f32(v564, v627); - float32x2_t v687 = vsub_f32(v564, v627); - float32x2_t v695 = vadd_f32(v638, v645); - float32x2_t v696 = vsub_f32(v638, v645); - float32x2_t v699 = vsub_f32(v678, v674); - float32x2_t v700 = vsub_f32(v682, v674); - float32x2_t v701 = vsub_f32(v674, v678); - float32x2_t v702 = vsub_f32(v674, v682); - int16x4_t v729 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v575, 15), (int32x2_t){0, 0})); - int16x4_t v777 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v576, 15), (int32x2_t){0, 0})); - float32x2_t v683 = vadd_f32(v572, v605); - float32x2_t v684 = vsub_f32(v572, v605); - float32x2_t v686 = vadd_f32(v616, v623); - float32x2_t v688 = vsub_f32(v623, v616); - float32x2_t v697 = vadd_f32(v656, v663); - float32x2_t v698 = vsub_f32(v656, v670); - float32x2_t v703 = vadd_f32(v693, v699); - float32x2_t v704 = vsub_f32(v693, v699); - float32x2_t v705 = vadd_f32(v693, v701); - float32x2_t v706 = vsub_f32(v693, v701); - float32x2_t 
v707 = vadd_f32(v694, v696); - float32x2_t v708 = vsub_f32(v694, v696); - float32x2_t v709 = vadd_f32(v694, v702); - float32x2_t v710 = vsub_f32(v694, v702); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v729), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v777), 0); - float32x2_t v689 = vadd_f32(v685, v686); - float32x2_t v690 = vadd_f32(v687, v688); - float32x2_t v691 = vsub_f32(v687, v688); - float32x2_t v692 = vsub_f32(v685, v686); - float32x2_t v713 = vadd_f32(v697, v695); - float32x2_t v714 = vsub_f32(v697, v695); - float32x2_t v715 = vadd_f32(v698, v700); - float32x2_t v716 = vsub_f32(v698, v700); - float32x2_t v717 = vadd_f32(v698, v696); - float32x2_t v718 = vsub_f32(v698, v696); - int16x4_t v753 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v684, 15), (int32x2_t){0, 0})); - int16x4_t v801 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v683, 15), (int32x2_t){0, 0})); - float32x2_t v719 = vadd_f32(v703, v713); - float32x2_t v720 = vadd_f32(v704, v714); - float32x2_t v721 = vsub_f32(v705, v714); - float32x2_t v722 = vsub_f32(v706, v713); - float32x2_t v723 = vadd_f32(v707, v715); - float32x2_t v724 = vadd_f32(v708, v716); - float32x2_t v725 = vsub_f32(v709, v718); - float32x2_t v726 = vsub_f32(v710, v717); - int16x4_t v741 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v692, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v753), 0); - int16x4_t v765 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v691, 15), (int32x2_t){0, 0})); - int16x4_t v789 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v690, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v801), 0); - int16x4_t v813 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v689, 15), (int32x2_t){0, 0})); - int16x4_t v735 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v722, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v741), 0); - int16x4_t v747 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v725, 15), (int32x2_t){0, 0})); - 
int16x4_t v759 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v726, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v765), 0); - int16x4_t v771 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v721, 15), (int32x2_t){0, 0})); - int16x4_t v783 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v720, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v789), 0); - int16x4_t v795 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v723, 15), (int32x2_t){0, 0})); - int16x4_t v807 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v724, 15), (int32x2_t){0, 0})); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v813), 0); - int16x4_t v819 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v719, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v735), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v747), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v759), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v771), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v783), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v795), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v807), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v819), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v68 = v5[istride]; + float v181 = 1.0000000000000000e+00F; + float v182 = -1.0000000000000000e+00F; + float v189 = -7.0710678118654746e-01F; + float v196 = 7.0710678118654757e-01F; + float v199 = 9.2387953251128674e-01F; + float v200 = -9.2387953251128674e-01F; + float v207 = 5.4119610014619690e-01F; + float v214 = -1.3065629648763766e+00F; + float32x2_t v216 = (float32x2_t){v4, v4}; + float v221 = 3.8268343236508984e-01F; + float v225 = 1.3065629648763766e+00F; + float v229 = -5.4119610014619690e-01F; + float32x2_t v20 = v5[0]; + float32x2_t v183 = (float32x2_t){v181, v182}; + float32x2_t v190 = (float32x2_t){v196, v189}; + float32x2_t v197 = 
(float32x2_t){v196, v196}; + float32x2_t v201 = (float32x2_t){v199, v200}; + float32x2_t v208 = (float32x2_t){v229, v207}; + float32x2_t v215 = (float32x2_t){v225, v214}; + float32x2_t v222 = (float32x2_t){v221, v221}; + float32x2_t v226 = (float32x2_t){v225, v225}; + float32x2_t v230 = (float32x2_t){v229, v229}; + float32x2_t v25 = v5[istride * 8]; + float32x2_t v32 = v5[istride * 4]; + float32x2_t v37 = v5[istride * 12]; + float32x2_t v44 = v5[istride * 2]; + float32x2_t v49 = v5[istride * 10]; + float32x2_t v56 = v5[istride * 6]; + float32x2_t v61 = v5[istride * 14]; + float32x2_t v73 = v5[istride * 9]; + float32x2_t v80 = v5[istride * 5]; + float32x2_t v85 = v5[istride * 13]; + float32x2_t v92 = v5[istride * 3]; + float32x2_t v97 = v5[istride * 11]; + float32x2_t v104 = v5[istride * 7]; + float32x2_t v109 = v5[istride * 15]; + float32x2_t v185 = vmul_f32(v216, v183); + float32x2_t v192 = vmul_f32(v216, v190); + float32x2_t v203 = vmul_f32(v216, v201); + float32x2_t v210 = vmul_f32(v216, v208); + float32x2_t v217 = vmul_f32(v216, v215); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v68, v73); + float32x2_t v86 = vadd_f32(v80, v85); + float32x2_t v87 = vsub_f32(v80, v85); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v110 = vadd_f32(v104, v109); + float32x2_t v111 = vsub_f32(v104, v109); + float32x2_t v112 = vadd_f32(v26, v38); + float32x2_t v113 = vsub_f32(v26, v38); + float32x2_t v114 = vadd_f32(v50, v62); + float32x2_t v115 = vsub_f32(v50, v62); + float32x2_t v116 = vadd_f32(v74, v86); + float32x2_t v117 = vsub_f32(v74, v86); + float32x2_t v118 = vadd_f32(v98, v110); + 
float32x2_t v119 = vsub_f32(v98, v110); + float32x2_t v128 = vadd_f32(v51, v63); + float32x2_t v129 = vsub_f32(v51, v63); + float32x2_t v130 = vadd_f32(v75, v111); + float32x2_t v131 = vsub_f32(v75, v111); + float32x2_t v132 = vadd_f32(v87, v99); + float32x2_t v133 = vsub_f32(v87, v99); + float32x2_t v186 = vrev64_f32(v39); + float32x2_t v120 = vadd_f32(v112, v114); + float32x2_t v121 = vsub_f32(v112, v114); + float32x2_t v122 = vadd_f32(v116, v118); + float32x2_t v123 = vsub_f32(v116, v118); + float32x2_t v126 = vadd_f32(v117, v119); + float32x2_t v127 = vsub_f32(v117, v119); + float32x2_t v134 = vadd_f32(v130, v132); + float32x2_t v135 = vadd_f32(v131, v133); + float32x2_t v164 = vrev64_f32(v115); + float32x2_t v187 = vmul_f32(v186, v185); + float32x2_t v193 = vrev64_f32(v128); + float32x2_t v198 = vmul_f32(v129, v197); + float32x2_t v211 = vrev64_f32(v130); + float32x2_t v218 = vrev64_f32(v132); + float32x2_t v227 = vmul_f32(v131, v226); + float32x2_t v231 = vmul_f32(v133, v230); + float32x2_t v124 = vadd_f32(v120, v122); + float32x2_t v125 = vsub_f32(v120, v122); + float32x2_t v153 = vrev64_f32(v123); + float32x2_t v165 = vmul_f32(v164, v185); + float32x2_t v171 = vrev64_f32(v126); + float32x2_t v176 = vmul_f32(v127, v197); + float32x2_t v194 = vmul_f32(v193, v192); + float32x2_t v204 = vrev64_f32(v134); + float32x2_t v212 = vmul_f32(v211, v210); + float32x2_t v219 = vmul_f32(v218, v217); + float32x2_t v223 = vmul_f32(v135, v222); + float32x2_t v242 = vadd_f32(v27, v198); + float32x2_t v243 = vsub_f32(v27, v198); + float32x2_t v154 = vmul_f32(v153, v185); + float32x2_t v172 = vmul_f32(v171, v192); + float32x2_t v205 = vmul_f32(v204, v203); + float32x2_t v234 = vadd_f32(v113, v176); + float32x2_t v236 = vsub_f32(v113, v176); + float32x2_t v244 = vadd_f32(v187, v194); + float32x2_t v245 = vsub_f32(v187, v194); + float32x2_t v248 = vsub_f32(v227, v223); + float32x2_t v249 = vsub_f32(v231, v223); + float32x2_t v250 = vsub_f32(v223, v227); + float32x2_t v251 = 
vsub_f32(v223, v231); + int16x4_t v278 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v124, 15), (int32x2_t){0, 0})); + int16x4_t v326 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v125, 15), (int32x2_t){0, 0})); + float32x2_t v232 = vadd_f32(v121, v154); + float32x2_t v233 = vsub_f32(v121, v154); + float32x2_t v235 = vadd_f32(v165, v172); + float32x2_t v237 = vsub_f32(v172, v165); + float32x2_t v246 = vadd_f32(v205, v212); + float32x2_t v247 = vsub_f32(v205, v219); + float32x2_t v252 = vadd_f32(v242, v248); + float32x2_t v253 = vsub_f32(v242, v248); + float32x2_t v254 = vadd_f32(v242, v250); + float32x2_t v255 = vsub_f32(v242, v250); + float32x2_t v256 = vadd_f32(v243, v245); + float32x2_t v257 = vsub_f32(v243, v245); + float32x2_t v258 = vadd_f32(v243, v251); + float32x2_t v259 = vsub_f32(v243, v251); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v278), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v326), 0); + float32x2_t v238 = vadd_f32(v234, v235); + float32x2_t v239 = vadd_f32(v236, v237); + float32x2_t v240 = vsub_f32(v236, v237); + float32x2_t v241 = vsub_f32(v234, v235); + float32x2_t v262 = vadd_f32(v246, v244); + float32x2_t v263 = vsub_f32(v246, v244); + float32x2_t v264 = vadd_f32(v247, v249); + float32x2_t v265 = vsub_f32(v247, v249); + float32x2_t v266 = vadd_f32(v247, v245); + float32x2_t v267 = vsub_f32(v247, v245); + int16x4_t v302 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v233, 15), (int32x2_t){0, 0})); + int16x4_t v350 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v232, 15), (int32x2_t){0, 0})); + float32x2_t v268 = vadd_f32(v252, v262); + float32x2_t v269 = vadd_f32(v253, v263); + float32x2_t v270 = vsub_f32(v254, v263); + float32x2_t v271 = vsub_f32(v255, v262); + float32x2_t v272 = vadd_f32(v256, v264); + float32x2_t v273 = vadd_f32(v257, v265); + float32x2_t v274 = vsub_f32(v258, v267); + float32x2_t v275 = vsub_f32(v259, v266); + int16x4_t v290 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = 
vget_lane_s32(vreinterpret_s32_s16(v302), 0); + int16x4_t v314 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v240, 15), (int32x2_t){0, 0})); + int16x4_t v338 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v239, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v350), 0); + int16x4_t v362 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v238, 15), (int32x2_t){0, 0})); + int16x4_t v284 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v271, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v290), 0); + int16x4_t v296 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v274, 15), (int32x2_t){0, 0})); + int16x4_t v308 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v275, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v314), 0); + int16x4_t v320 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v270, 15), (int32x2_t){0, 0})); + int16x4_t v332 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v269, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v338), 0); + int16x4_t v344 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v272, 15), (int32x2_t){0, 0})); + int16x4_t v356 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v273, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v362), 0); + int16x4_t v368 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v268, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v284), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v296), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v308), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v320), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v332), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v344), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v356), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v368), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -7533,140 +4828,81 @@ void 
armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v581)[0])); svfloat32_t v787 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v590)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v757), "w"(v759)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v757), "w"(v759)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v761), "w"(v763)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v761), "w"(v763)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v765), "w"(v767)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v765), "w"(v767)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v769), "w"(v771)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v769), "w"(v771)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v773), "w"(v775)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v773), "w"(v775)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v777), "w"(v779)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v777), "w"(v779)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v781), "w"(v783)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v781), "w"(v783)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v785), "w"(v787)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v785), "w"(v787)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v32), "w"(v48)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v32), "w"(v48)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v64), "w"(v80)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v64), "w"(v80)); - svfloat32_t v150; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v96), "w"(v112)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v96), "w"(v112)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v128), "w"(v144)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v128), "w"(v144)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v65), "w"(v81)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v65), "w"(v81)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v97), "w"(v145)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v97), "w"(v145)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v113), "w"(v129)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v113), "w"(v129)); - svfloat32_t zero227; - asm volatile("mov %0.s, #0" : "=w"(zero227)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v757, v759); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v757, v759); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v761, v763); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v761, v763); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v765, v767); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v765, v767); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v769, v771); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v769, v771); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v773, v775); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v773, v775); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v777, v779); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v777, v779); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v781, v783); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v781, v783); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v785, v787); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v785, v787); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v32, 
v48); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v64, v80); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v128, v144); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v128, v144); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v65, v81); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v113, v129); + svfloat32_t zero227 = svdup_n_f32(0); svfloat32_t v227 = svcmla_f32_x(pred_full, zero227, v602, v49, 90); - svfloat32_t v154; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v146), "w"(v148)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v146), "w"(v148)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v150), "w"(v152)); - svfloat32_t v157; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v150), "w"(v152)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v153)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v153)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v164), "w"(v166)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v165), "w"(v167)); - svfloat32_t zero203; - asm volatile("mov %0.s, #0" : "=w"(zero203)); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v153); + svfloat32_t v168 = 
svadd_f32_x(svptrue_b32(), v164, v166); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t zero203 = svdup_n_f32(0); svfloat32_t v203 = svcmla_f32_x(pred_full, zero203, v602, v149, 90); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, zero234, v603, v162, 90); - svfloat32_t zero260; - asm volatile("mov %0.s, #0" : "=w"(zero260)); + svfloat32_t zero260 = svdup_n_f32(0); svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v607, v166, 90); - svfloat32_t v270; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v165), "w"(v609)); - svfloat32_t v275; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v167), "w"(v610)); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v154), "w"(v156)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t zero191; - asm volatile("mov %0.s, #0" : "=w"(zero191)); + svfloat32_t v270 = svmul_f32_x(svptrue_b32(), v165, v609); + svfloat32_t v275 = svmul_f32_x(svptrue_b32(), v167, v610); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v154, v156); + svfloat32_t zero191 = svdup_n_f32(0); svfloat32_t v191 = svcmla_f32_x(pred_full, zero191, v602, v157, 90); - svfloat32_t zero210; - asm volatile("mov %0.s, #0" : "=w"(zero210)); + svfloat32_t zero210 = svdup_n_f32(0); svfloat32_t v210 = svcmla_f32_x(pred_full, zero210, v603, v160, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); + svfloat32_t zero246 = svdup_n_f32(0); svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v605, v168, 90); - svfloat32_t v265; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v169), "w"(v608)); + svfloat32_t v265 = svmul_f32_x(svptrue_b32(), v169, v608); svfloat32_t v286 = svmla_f32_x(pred_full, v33, v163, v604); svfloat32_t v287 = svmls_f32_x(pred_full, v33, v163, v604); - svfloat32_t v288; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v227), "w"(v234)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v227), "w"(v234)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v155), "w"(v191)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v155), "w"(v191)); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v227, v234); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v227, v234); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v155, v191); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v155, v191); svfloat32_t v278 = svmla_f32_x(pred_full, v147, v161, v604); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v203), "w"(v210)); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v203, v210); svfloat32_t v280 = svmls_f32_x(pred_full, v147, v161, v604); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v210), "w"(v203)); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v210, v203); svfloat32_t v290 = svcmla_f32_x(pred_full, v246, v606, v164, 90); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v246), "w"(v260)); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v246, v260); svfloat32_t v292 = svnmls_f32_x(pred_full, v265, v165, v609); svfloat32_t v293 = svnmls_f32_x(pred_full, v265, v167, v610); svfloat32_t v294 = svnmls_f32_x(pred_full, v270, v169, v608); svfloat32_t v295 = svnmls_f32_x(pred_full, v275, v169, v608); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v287), "w"(v289)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v287), "w"(v289)); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v287, v289); svint16_t v322 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), @@ -7677,38 +4913,22 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, 
pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v278), "w"(v279)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v280), "w"(v281)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v280), "w"(v281)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v278), "w"(v279)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v286), "w"(v292)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v286), "w"(v292)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v286), "w"(v294)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v286), "w"(v294)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v287), "w"(v295)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v287), "w"(v295)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v290), "w"(v288)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v290), "w"(v288)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v291), "w"(v293)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v291), "w"(v293)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v291), "w"(v289)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v291), "w"(v289)); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v278, v279); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v286, v292); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v286, v294); + 
svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v286, v294); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v287, v295); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v287, v295); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v290, v288); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v290, v288); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v291, v289); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v291, v289); svint16_t v354 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v277, (float)(1ULL << 31ULL)))), @@ -7721,22 +4941,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu16(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v618), svreinterpret_u64_s16(v322)); svst1w_u64(pred_full, (unsigned *)(v690), svreinterpret_u64_s16(v386)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v296), "w"(v306)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v297), "w"(v307)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v298), "w"(v307)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v299), "w"(v306)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v300), "w"(v308)); - svfloat32_t v317; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v301), "w"(v309)); - svfloat32_t v318; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v302), "w"(v311)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v310)); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v296, v306); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v297, v307); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v298, v307); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v299, v306); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v300, v308); 
+ svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v301, v309); + svfloat32_t v318 = svsub_f32_x(svptrue_b32(), v302, v311); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v310); svint16_t v338 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v285, (float)(1ULL << 31ULL)))), @@ -7825,853 +5037,401 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v676 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v213 = -4.2602849117736000e-02F; - float v218 = 2.0497965023262180e-01F; - float v223 = 1.0451835201736759e+00F; - float v228 = 1.7645848660222969e+00F; - float v233 = -7.2340797728605655e-01F; - float v238 = -8.9055591620606403e-02F; - float v243 = -1.0625000000000000e+00F; - float v248 = 2.5769410160110379e-01F; - float v253 = 7.7980260789483757e-01F; - float v258 = 5.4389318464570580e-01F; - float v263 = 4.2010193497052700e-01F; - float v268 = 1.2810929434228073e+00F; - float v273 = 4.4088907348175338e-01F; - float v278 = 3.1717619283272508e-01F; - float v282 = -9.0138318648016680e-01F; - float v283 = 9.0138318648016680e-01F; - float v290 = -4.3248756360072310e-01F; - float v291 = 4.3248756360072310e-01F; - float v298 = 6.6693537504044498e-01F; - float v299 = -6.6693537504044498e-01F; - float v306 = -6.0389004312516970e-01F; - float v307 = 6.0389004312516970e-01F; - float v314 = -3.6924873198582547e-01F; - float v315 = 3.6924873198582547e-01F; - float v322 = 4.8656938755549761e-01F; - float v323 = -4.8656938755549761e-01F; - float v330 = 2.3813712136760609e-01F; - float v331 = -2.3813712136760609e-01F; - float v338 = -1.5573820617422458e+00F; - float v339 = 1.5573820617422458e+00F; - float v346 = 6.5962247018731990e-01F; - float v347 = -6.5962247018731990e-01F; - float v354 = -1.4316961569866241e-01F; - float v355 = 1.4316961569866241e-01F; - float v362 
= 2.3903469959860771e-01F; - float v363 = -2.3903469959860771e-01F; - float v370 = -4.7932541949972603e-02F; - float v371 = 4.7932541949972603e-02F; - float v378 = -2.3188014856550065e+00F; - float v379 = 2.3188014856550065e+00F; - float v386 = 7.8914568419206255e-01F; - float v387 = -7.8914568419206255e-01F; - float v394 = 3.8484572871179505e+00F; - float v395 = -3.8484572871179505e+00F; - float v402 = -1.3003804568801376e+00F; - float v403 = 1.3003804568801376e+00F; - float v410 = 4.0814769046889037e+00F; - float v411 = -4.0814769046889037e+00F; - float v418 = -1.4807159909286283e+00F; - float v419 = 1.4807159909286283e+00F; - float v426 = -1.3332470363551400e-02F; - float v427 = 1.3332470363551400e-02F; - float v434 = -3.7139778690557629e-01F; - float v435 = 3.7139778690557629e-01F; - float v442 = 1.9236512863456379e-01F; - float v443 = -1.9236512863456379e-01F; - float32x2_t v445 = (float32x2_t){v4, v4}; - const float32x2_t *v1245 = &v5[istride]; - int32_t *v1409 = &v6[ostride]; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v173 = -4.2602849117736000e-02F; + float v177 = 2.0497965023262180e-01F; + float v181 = 1.0451835201736759e+00F; + float v185 = 1.7645848660222969e+00F; + float v189 = -7.2340797728605655e-01F; + float v193 = -8.9055591620606403e-02F; + float v197 = -1.0625000000000000e+00F; + float v201 = 2.5769410160110379e-01F; + float v205 = 7.7980260789483757e-01F; + float v209 = 5.4389318464570580e-01F; + float v213 = 4.2010193497052700e-01F; + float v217 = 1.2810929434228073e+00F; + float v221 = 4.4088907348175338e-01F; + float v225 = 3.1717619283272508e-01F; + float v228 = -9.0138318648016680e-01F; + float v229 = 9.0138318648016680e-01F; + float v235 = -4.3248756360072310e-01F; + float v236 = 4.3248756360072310e-01F; + float v242 = 6.6693537504044498e-01F; + float v243 = -6.6693537504044498e-01F; + float v249 = -6.0389004312516970e-01F; + float v250 = 6.0389004312516970e-01F; + float v256 = 
-3.6924873198582547e-01F; + float v257 = 3.6924873198582547e-01F; + float v263 = 4.8656938755549761e-01F; + float v264 = -4.8656938755549761e-01F; + float v270 = 2.3813712136760609e-01F; + float v271 = -2.3813712136760609e-01F; + float v277 = -1.5573820617422458e+00F; + float v278 = 1.5573820617422458e+00F; + float v284 = 6.5962247018731990e-01F; + float v285 = -6.5962247018731990e-01F; + float v291 = -1.4316961569866241e-01F; + float v292 = 1.4316961569866241e-01F; + float v298 = 2.3903469959860771e-01F; + float v299 = -2.3903469959860771e-01F; + float v305 = -4.7932541949972603e-02F; + float v306 = 4.7932541949972603e-02F; + float v312 = -2.3188014856550065e+00F; + float v313 = 2.3188014856550065e+00F; + float v319 = 7.8914568419206255e-01F; + float v320 = -7.8914568419206255e-01F; + float v326 = 3.8484572871179505e+00F; + float v327 = -3.8484572871179505e+00F; + float v333 = -1.3003804568801376e+00F; + float v334 = 1.3003804568801376e+00F; + float v340 = 4.0814769046889037e+00F; + float v341 = -4.0814769046889037e+00F; + float v347 = -1.4807159909286283e+00F; + float v348 = 1.4807159909286283e+00F; + float v354 = -1.3332470363551400e-02F; + float v355 = 1.3332470363551400e-02F; + float v361 = -3.7139778690557629e-01F; + float v362 = 3.7139778690557629e-01F; + float v368 = 1.9236512863456379e-01F; + float v369 = -1.9236512863456379e-01F; + float32x2_t v371 = (float32x2_t){v4, v4}; + float32x2_t v166 = v5[0]; + float32x2_t v174 = (float32x2_t){v173, v173}; + float32x2_t v178 = (float32x2_t){v177, v177}; + float32x2_t v182 = (float32x2_t){v181, v181}; + float32x2_t v186 = (float32x2_t){v185, v185}; + float32x2_t v190 = (float32x2_t){v189, v189}; + float32x2_t v194 = (float32x2_t){v193, v193}; + float32x2_t v198 = (float32x2_t){v197, v197}; + float32x2_t v202 = (float32x2_t){v201, v201}; + float32x2_t v206 = (float32x2_t){v205, v205}; + float32x2_t v210 = (float32x2_t){v209, v209}; float32x2_t v214 = (float32x2_t){v213, v213}; - float32x2_t v219 = 
(float32x2_t){v218, v218}; - float32x2_t v224 = (float32x2_t){v223, v223}; - float32x2_t v229 = (float32x2_t){v228, v228}; - float32x2_t v234 = (float32x2_t){v233, v233}; - float32x2_t v239 = (float32x2_t){v238, v238}; - float32x2_t v244 = (float32x2_t){v243, v243}; - float32x2_t v249 = (float32x2_t){v248, v248}; - float32x2_t v254 = (float32x2_t){v253, v253}; - float32x2_t v259 = (float32x2_t){v258, v258}; - float32x2_t v264 = (float32x2_t){v263, v263}; - float32x2_t v269 = (float32x2_t){v268, v268}; - float32x2_t v274 = (float32x2_t){v273, v273}; - float32x2_t v279 = (float32x2_t){v278, v278}; - float32x2_t v284 = (float32x2_t){v282, v283}; - float32x2_t v292 = (float32x2_t){v290, v291}; + float32x2_t v218 = (float32x2_t){v217, v217}; + float32x2_t v222 = (float32x2_t){v221, v221}; + float32x2_t v226 = (float32x2_t){v225, v225}; + float32x2_t v230 = (float32x2_t){v228, v229}; + float32x2_t v237 = (float32x2_t){v235, v236}; + float32x2_t v244 = (float32x2_t){v242, v243}; + float32x2_t v251 = (float32x2_t){v249, v250}; + float32x2_t v258 = (float32x2_t){v256, v257}; + float32x2_t v265 = (float32x2_t){v263, v264}; + float32x2_t v272 = (float32x2_t){v270, v271}; + float32x2_t v279 = (float32x2_t){v277, v278}; + float32x2_t v286 = (float32x2_t){v284, v285}; + float32x2_t v293 = (float32x2_t){v291, v292}; float32x2_t v300 = (float32x2_t){v298, v299}; - float32x2_t v308 = (float32x2_t){v306, v307}; - float32x2_t v316 = (float32x2_t){v314, v315}; - float32x2_t v324 = (float32x2_t){v322, v323}; - float32x2_t v332 = (float32x2_t){v330, v331}; - float32x2_t v340 = (float32x2_t){v338, v339}; - float32x2_t v348 = (float32x2_t){v346, v347}; + float32x2_t v307 = (float32x2_t){v305, v306}; + float32x2_t v314 = (float32x2_t){v312, v313}; + float32x2_t v321 = (float32x2_t){v319, v320}; + float32x2_t v328 = (float32x2_t){v326, v327}; + float32x2_t v335 = (float32x2_t){v333, v334}; + float32x2_t v342 = (float32x2_t){v340, v341}; + float32x2_t v349 = (float32x2_t){v347, v348}; 
float32x2_t v356 = (float32x2_t){v354, v355}; - float32x2_t v364 = (float32x2_t){v362, v363}; - float32x2_t v372 = (float32x2_t){v370, v371}; - float32x2_t v380 = (float32x2_t){v378, v379}; - float32x2_t v388 = (float32x2_t){v386, v387}; - float32x2_t v396 = (float32x2_t){v394, v395}; - float32x2_t v404 = (float32x2_t){v402, v403}; - float32x2_t v412 = (float32x2_t){v410, v411}; - float32x2_t v420 = (float32x2_t){v418, v419}; - float32x2_t v428 = (float32x2_t){v426, v427}; - float32x2_t v436 = (float32x2_t){v434, v435}; - float32x2_t v444 = (float32x2_t){v442, v443}; - const float32x2_t *v1390 = &v5[0]; - int32_t *v1400 = &v6[0]; - float32x4_t v1548 = vld1q_f32((const float32_t *)v1245); - float32x4_t v215 = vcombine_f32(v214, v214); - float32x4_t v220 = vcombine_f32(v219, v219); - float32x4_t v225 = vcombine_f32(v224, v224); - float32x4_t v230 = vcombine_f32(v229, v229); - float32x4_t v235 = vcombine_f32(v234, v234); - float32x4_t v240 = vcombine_f32(v239, v239); - float32x4_t v245 = vcombine_f32(v244, v244); - float32x4_t v250 = vcombine_f32(v249, v249); - float32x4_t v255 = vcombine_f32(v254, v254); - float32x4_t v260 = vcombine_f32(v259, v259); - float32x4_t v265 = vcombine_f32(v264, v264); - float32x4_t v270 = vcombine_f32(v269, v269); - float32x4_t v275 = vcombine_f32(v274, v274); - float32x4_t v280 = vcombine_f32(v279, v279); - float32x2_t v286 = vmul_f32(v445, v284); - float32x2_t v294 = vmul_f32(v445, v292); - float32x2_t v302 = vmul_f32(v445, v300); - float32x2_t v310 = vmul_f32(v445, v308); - float32x2_t v318 = vmul_f32(v445, v316); - float32x2_t v326 = vmul_f32(v445, v324); - float32x2_t v334 = vmul_f32(v445, v332); - float32x2_t v342 = vmul_f32(v445, v340); - float32x2_t v350 = vmul_f32(v445, v348); - float32x2_t v358 = vmul_f32(v445, v356); - float32x2_t v366 = vmul_f32(v445, v364); - float32x2_t v374 = vmul_f32(v445, v372); - float32x2_t v382 = vmul_f32(v445, v380); - float32x2_t v390 = vmul_f32(v445, v388); - float32x2_t v398 = vmul_f32(v445, v396); 
- float32x2_t v406 = vmul_f32(v445, v404); - float32x2_t v414 = vmul_f32(v445, v412); - float32x2_t v422 = vmul_f32(v445, v420); - float32x2_t v430 = vmul_f32(v445, v428); - float32x2_t v438 = vmul_f32(v445, v436); - float32x2_t v446 = vmul_f32(v445, v444); - const float32x2_t *v1254 = &v5[istride * 16]; - const float32x2_t *v1263 = &v5[istride * 3]; - const float32x2_t *v1272 = &v5[istride * 14]; - const float32x2_t *v1281 = &v5[istride * 9]; - const float32x2_t *v1290 = &v5[istride * 8]; - const float32x2_t *v1299 = &v5[istride * 10]; - const float32x2_t *v1308 = &v5[istride * 7]; - const float32x2_t *v1317 = &v5[istride * 13]; - const float32x2_t *v1326 = &v5[istride * 4]; - const float32x2_t *v1335 = &v5[istride * 5]; - const float32x2_t *v1344 = &v5[istride * 12]; - const float32x2_t *v1353 = &v5[istride * 15]; - const float32x2_t *v1362 = &v5[istride * 2]; - const float32x2_t *v1371 = &v5[istride * 11]; - const float32x2_t *v1380 = &v5[istride * 6]; - int32_t *v1418 = &v6[ostride * 16]; - int32_t *v1427 = &v6[ostride * 2]; - int32_t *v1436 = &v6[ostride * 15]; - int32_t *v1445 = &v6[ostride * 3]; - int32_t *v1454 = &v6[ostride * 14]; - int32_t *v1463 = &v6[ostride * 4]; - int32_t *v1472 = &v6[ostride * 13]; - int32_t *v1481 = &v6[ostride * 5]; - int32_t *v1490 = &v6[ostride * 12]; - int32_t *v1499 = &v6[ostride * 6]; - int32_t *v1508 = &v6[ostride * 11]; - int32_t *v1517 = &v6[ostride * 7]; - int32_t *v1526 = &v6[ostride * 10]; - int32_t *v1535 = &v6[ostride * 8]; - int32_t *v1544 = &v6[ostride * 9]; - float32x4_t v1580 = vld1q_f32((const float32_t *)v1390); - float32x4_t v288 = vcombine_f32(v286, v286); - float32x4_t v296 = vcombine_f32(v294, v294); - float32x4_t v304 = vcombine_f32(v302, v302); - float32x4_t v312 = vcombine_f32(v310, v310); - float32x4_t v320 = vcombine_f32(v318, v318); - float32x4_t v328 = vcombine_f32(v326, v326); - float32x4_t v336 = vcombine_f32(v334, v334); - float32x4_t v344 = vcombine_f32(v342, v342); - float32x4_t v352 = 
vcombine_f32(v350, v350); - float32x4_t v360 = vcombine_f32(v358, v358); - float32x4_t v368 = vcombine_f32(v366, v366); - float32x4_t v376 = vcombine_f32(v374, v374); - float32x4_t v384 = vcombine_f32(v382, v382); - float32x4_t v392 = vcombine_f32(v390, v390); - float32x4_t v400 = vcombine_f32(v398, v398); - float32x4_t v408 = vcombine_f32(v406, v406); - float32x4_t v416 = vcombine_f32(v414, v414); - float32x4_t v424 = vcombine_f32(v422, v422); - float32x4_t v432 = vcombine_f32(v430, v430); - float32x4_t v440 = vcombine_f32(v438, v438); - float32x4_t v448 = vcombine_f32(v446, v446); - float32x4_t v1550 = vld1q_f32((const float32_t *)v1254); - float32x4_t v1552 = vld1q_f32((const float32_t *)v1263); - float32x4_t v1554 = vld1q_f32((const float32_t *)v1272); - float32x4_t v1556 = vld1q_f32((const float32_t *)v1281); - float32x4_t v1558 = vld1q_f32((const float32_t *)v1290); - float32x4_t v1560 = vld1q_f32((const float32_t *)v1299); - float32x4_t v1562 = vld1q_f32((const float32_t *)v1308); - float32x4_t v1564 = vld1q_f32((const float32_t *)v1317); - float32x4_t v1566 = vld1q_f32((const float32_t *)v1326); - float32x4_t v1568 = vld1q_f32((const float32_t *)v1335); - float32x4_t v1570 = vld1q_f32((const float32_t *)v1344); - float32x4_t v1572 = vld1q_f32((const float32_t *)v1353); - float32x4_t v1574 = vld1q_f32((const float32_t *)v1362); - float32x4_t v1576 = vld1q_f32((const float32_t *)v1371); - float32x4_t v1578 = vld1q_f32((const float32_t *)v1380); - float32x4_t v35 = vaddq_f32(v1548, v1550); - float32x4_t v36 = vsubq_f32(v1548, v1550); - float32x4_t v51 = vaddq_f32(v1552, v1554); - float32x4_t v52 = vsubq_f32(v1552, v1554); - float32x4_t v67 = vaddq_f32(v1556, v1558); - float32x4_t v68 = vsubq_f32(v1556, v1558); - float32x4_t v83 = vaddq_f32(v1560, v1562); - float32x4_t v84 = vsubq_f32(v1560, v1562); - float32x4_t v99 = vaddq_f32(v1564, v1566); - float32x4_t v100 = vsubq_f32(v1564, v1566); - float32x4_t v115 = vaddq_f32(v1568, v1570); - float32x4_t v116 = 
vsubq_f32(v1568, v1570); - float32x4_t v131 = vaddq_f32(v1572, v1574); - float32x4_t v132 = vsubq_f32(v1572, v1574); - float32x4_t v147 = vaddq_f32(v1576, v1578); - float32x4_t v148 = vsubq_f32(v1576, v1578); - float32x4_t v149 = vaddq_f32(v35, v99); - float32x4_t v150 = vaddq_f32(v51, v115); - float32x4_t v151 = vaddq_f32(v67, v131); - float32x4_t v152 = vaddq_f32(v83, v147); - float32x4_t v155 = vsubq_f32(v35, v99); - float32x4_t v156 = vsubq_f32(v51, v115); - float32x4_t v157 = vsubq_f32(v67, v131); - float32x4_t v158 = vsubq_f32(v83, v147); - float32x4_t v169 = vaddq_f32(v36, v68); - float32x4_t v170 = vaddq_f32(v52, v84); - float32x4_t v171 = vsubq_f32(v36, v68); - float32x4_t v172 = vsubq_f32(v148, v116); - float32x4_t v173 = vaddq_f32(v100, v132); - float32x4_t v174 = vaddq_f32(v116, v148); - float32x4_t v175 = vsubq_f32(v100, v132); - float32x4_t v176 = vsubq_f32(v52, v84); - float32x4_t v189 = vaddq_f32(v36, v100); - float32x4_t v190 = vaddq_f32(v84, v148); - float32x4_t v391 = vrev64q_f32(v36); - float32x4_t v399 = vrev64q_f32(v100); - float32x4_t v415 = vrev64q_f32(v84); - float32x4_t v423 = vrev64q_f32(v148); - float32x4_t v153 = vaddq_f32(v149, v151); - float32x4_t v154 = vaddq_f32(v150, v152); - float32x4_t v159 = vsubq_f32(v149, v151); - float32x4_t v160 = vsubq_f32(v150, v152); - float32x4_t v163 = vaddq_f32(v156, v158); - float32x4_t v164 = vaddq_f32(v155, v157); - float32x4_t v166 = vsubq_f32(v157, v158); - float32x4_t v167 = vsubq_f32(v155, v156); - float32x4_t v177 = vaddq_f32(v169, v170); - float32x4_t v178 = vaddq_f32(v173, v174); - float32x4_t v180 = vsubq_f32(v169, v170); - float32x4_t v181 = vsubq_f32(v173, v174); - float32x4_t v183 = vaddq_f32(v171, v172); - float32x4_t v184 = vaddq_f32(v175, v176); - float32x4_t v186 = vsubq_f32(v171, v172); - float32x4_t v187 = vsubq_f32(v175, v176); - float32x4_t v216 = vmulq_f32(v155, v215); - float32x4_t v221 = vmulq_f32(v156, v220); - float32x4_t v226 = vmulq_f32(v157, v225); - float32x4_t v231 = 
vmulq_f32(v158, v230); - float32x4_t v383 = vrev64q_f32(v189); - float32x4_t v393 = vmulq_f32(v391, v392); - float32x4_t v401 = vmulq_f32(v399, v400); - float32x4_t v407 = vrev64q_f32(v190); - float32x4_t v417 = vmulq_f32(v415, v416); - float32x4_t v425 = vmulq_f32(v423, v424); - float32x4_t v161 = vaddq_f32(v153, v154); - float32x4_t v162 = vsubq_f32(v153, v154); - float32x4_t v165 = vsubq_f32(v164, v163); - float32x4_t v168 = vaddq_f32(v159, v160); - float32x4_t v179 = vaddq_f32(v177, v178); - float32x4_t v182 = vaddq_f32(v180, v181); - float32x4_t v185 = vaddq_f32(v183, v184); - float32x4_t v188 = vaddq_f32(v186, v187); - float32x4_t v191 = vsubq_f32(v184, v178); - float32x4_t v194 = vsubq_f32(v177, v183); - float32x4_t v236 = vmulq_f32(v159, v235); - float32x4_t v241 = vmulq_f32(v160, v240); - float32x4_t v256 = vmulq_f32(v163, v255); - float32x4_t v261 = vmulq_f32(v164, v260); - float32x4_t v271 = vmulq_f32(v166, v270); - float32x4_t v276 = vmulq_f32(v167, v275); - float32x4_t v287 = vrev64q_f32(v177); - float32x4_t v295 = vrev64q_f32(v178); - float32x4_t v311 = vrev64q_f32(v180); - float32x4_t v319 = vrev64q_f32(v181); - float32x4_t v335 = vrev64q_f32(v183); - float32x4_t v343 = vrev64q_f32(v184); - float32x4_t v359 = vrev64q_f32(v186); - float32x4_t v367 = vrev64q_f32(v187); - float32x4_t v385 = vmulq_f32(v383, v384); - float32x4_t v409 = vmulq_f32(v407, v408); - float32x4_t v192 = vaddq_f32(v191, v36); - float32x4_t v195 = vaddq_f32(v194, v84); - float32x4_t v206 = vaddq_f32(v1580, v161); - float32x4_t v246 = vmulq_f32(v161, v245); - float32x4_t v251 = vmulq_f32(v162, v250); - float32x4_t v266 = vmulq_f32(v165, v265); - float32x4_t v281 = vmulq_f32(v168, v280); - float32x4_t v289 = vmulq_f32(v287, v288); - float32x4_t v297 = vmulq_f32(v295, v296); - float32x4_t v303 = vrev64q_f32(v179); - float32x4_t v313 = vmulq_f32(v311, v312); - float32x4_t v321 = vmulq_f32(v319, v320); - float32x4_t v327 = vrev64q_f32(v182); - float32x4_t v337 = vmulq_f32(v335, v336); - 
float32x4_t v345 = vmulq_f32(v343, v344); - float32x4_t v351 = vrev64q_f32(v185); - float32x4_t v361 = vmulq_f32(v359, v360); - float32x4_t v369 = vmulq_f32(v367, v368); - float32x4_t v375 = vrev64q_f32(v188); - float32x4_t v452 = vaddq_f32(v231, v271); - float32x4_t v453 = vsubq_f32(v271, v226); - float32x4_t v454 = vaddq_f32(v221, v276); - float32x4_t v455 = vsubq_f32(v216, v276); - float32x4_t v193 = vsubq_f32(v192, v190); - float32x4_t v196 = vaddq_f32(v195, v100); - float32x4_t v305 = vmulq_f32(v303, v304); - float32x4_t v329 = vmulq_f32(v327, v328); - float32x4_t v353 = vmulq_f32(v351, v352); - float32x4_t v377 = vmulq_f32(v375, v376); - float32x4_t v450 = vaddq_f32(v256, v266); - float32x4_t v451 = vsubq_f32(v261, v266); - float32x4_t v456 = vsubq_f32(v281, v241); - float32x4_t v457 = vaddq_f32(v281, v236); - float32x4_t v458 = vaddq_f32(v246, v206); - int16x4_t v526 = vqmovn_s32(vcvtq_n_s32_f32(v206, 15)); - float32x4_t v197 = vsubq_f32(v196, v148); - float32x4_t v431 = vrev64q_f32(v193); - float32x4_t v459 = vaddq_f32(v251, v458); - float32x4_t v460 = vsubq_f32(v458, v251); - float32x4_t v461 = vsubq_f32(v450, v452); - float32x4_t v463 = vaddq_f32(v451, v453); - float32x4_t v465 = vaddq_f32(v450, v454); - float32x4_t v467 = vaddq_f32(v451, v455); - float32x4_t v477 = vaddq_f32(v289, v305); - float32x4_t v478 = vaddq_f32(v297, v305); - float32x4_t v479 = vaddq_f32(v313, v329); - float32x4_t v480 = vaddq_f32(v321, v329); - float32x4_t v481 = vaddq_f32(v337, v353); - float32x4_t v482 = vaddq_f32(v345, v353); - float32x4_t v483 = vaddq_f32(v361, v377); - float32x4_t v484 = vaddq_f32(v369, v377); - vst1_s16((int16_t *)v1400, v526); - float32x4_t v198 = vaddq_f32(v193, v197); - float32x4_t v433 = vmulq_f32(v431, v432); - float32x4_t v439 = vrev64q_f32(v197); - float32x4_t v462 = vaddq_f32(v456, v459); - float32x4_t v464 = vaddq_f32(v457, v460); - float32x4_t v466 = vsubq_f32(v459, v456); - float32x4_t v468 = vsubq_f32(v460, v457); - float32x4_t v488 = 
vaddq_f32(v477, v479); - float32x4_t v489 = vsubq_f32(v477, v479); - float32x4_t v490 = vaddq_f32(v478, v480); - float32x4_t v491 = vsubq_f32(v478, v480); - float32x4_t v492 = vaddq_f32(v481, v483); - float32x4_t v493 = vsubq_f32(v483, v481); - float32x4_t v494 = vaddq_f32(v482, v484); - float32x4_t v495 = vsubq_f32(v484, v482); - float32x4_t v441 = vmulq_f32(v439, v440); - float32x4_t v447 = vrev64q_f32(v198); - float32x4_t v469 = vaddq_f32(v461, v462); - float32x4_t v470 = vaddq_f32(v463, v464); - float32x4_t v471 = vaddq_f32(v465, v466); - float32x4_t v472 = vaddq_f32(v467, v468); - float32x4_t v473 = vsubq_f32(v462, v461); - float32x4_t v474 = vsubq_f32(v464, v463); - float32x4_t v475 = vsubq_f32(v466, v465); - float32x4_t v476 = vsubq_f32(v468, v467); - float32x4_t v505 = vaddq_f32(v490, v494); - float32x4_t v507 = vaddq_f32(v489, v495); - float32x4_t v509 = vsubq_f32(v488, v492); - float32x4_t v511 = vsubq_f32(v495, v489); - float32x4_t v513 = vaddq_f32(v488, v492); - float32x4_t v516 = vsubq_f32(v493, v491); - float32x4_t v519 = vsubq_f32(v494, v490); - float32x4_t v522 = vaddq_f32(v491, v493); - float32x4_t v449 = vmulq_f32(v447, v448); - float32x4_t v496 = vsubq_f32(v433, v441); - float32x4_t v485 = vaddq_f32(v449, v441); - float32x4_t v498 = vaddq_f32(v496, v496); - float32x4_t v523 = vsubq_f32(v522, v496); - float32x4_t v486 = vaddq_f32(v385, v485); - float32x4_t v499 = vsubq_f32(v409, v498); - float32x4_t v502 = vaddq_f32(v485, v485); - float32x4_t v520 = vaddq_f32(v519, v498); - float32x4_t v568 = vaddq_f32(v476, v523); - float32x4_t v577 = vsubq_f32(v476, v523); - float32x4_t v487 = vaddq_f32(v486, v393); - float32x4_t v497 = vaddq_f32(v486, v401); - float32x4_t v500 = vaddq_f32(v499, v417); - float32x4_t v501 = vaddq_f32(v499, v425); - float32x4_t v503 = vaddq_f32(v502, v502); - float32x4_t v504 = vaddq_f32(v496, v502); - float32x4_t v510 = vaddq_f32(v509, v502); - float32x4_t v521 = vaddq_f32(v520, v502); - int16x4_t v571 = 
vqmovn_s32(vcvtq_n_s32_f32(v568, 15)); - int16x4_t v580 = vqmovn_s32(vcvtq_n_s32_f32(v577, 15)); - float32x4_t v506 = vaddq_f32(v505, v497); - float32x4_t v508 = vaddq_f32(v507, v500); - float32x4_t v512 = vsubq_f32(v511, v504); - float32x4_t v514 = vaddq_f32(v513, v487); - float32x4_t v517 = vsubq_f32(v516, v501); - float32x4_t v550 = vaddq_f32(v471, v510); - float32x4_t v559 = vsubq_f32(v471, v510); - float32x4_t v658 = vaddq_f32(v475, v521); - float32x4_t v667 = vsubq_f32(v475, v521); - vst1_s16((int16_t *)v1445, v571); - vst1_s16((int16_t *)v1454, v580); - float32x4_t v515 = vaddq_f32(v514, v496); - float32x4_t v518 = vaddq_f32(v517, v503); - float32x4_t v532 = vaddq_f32(v469, v506); - float32x4_t v541 = vsubq_f32(v469, v506); - int16x4_t v553 = vqmovn_s32(vcvtq_n_s32_f32(v550, 15)); - int16x4_t v562 = vqmovn_s32(vcvtq_n_s32_f32(v559, 15)); - float32x4_t v604 = vaddq_f32(v472, v512); - float32x4_t v613 = vsubq_f32(v472, v512); - float32x4_t v622 = vaddq_f32(v470, v508); - float32x4_t v631 = vsubq_f32(v470, v508); - int16x4_t v661 = vqmovn_s32(vcvtq_n_s32_f32(v658, 15)); - int16x4_t v670 = vqmovn_s32(vcvtq_n_s32_f32(v667, 15)); - int16x4_t v535 = vqmovn_s32(vcvtq_n_s32_f32(v532, 15)); - int16x4_t v544 = vqmovn_s32(vcvtq_n_s32_f32(v541, 15)); - float32x4_t v586 = vaddq_f32(v473, v515); - float32x4_t v595 = vsubq_f32(v473, v515); - int16x4_t v607 = vqmovn_s32(vcvtq_n_s32_f32(v604, 15)); - int16x4_t v616 = vqmovn_s32(vcvtq_n_s32_f32(v613, 15)); - int16x4_t v625 = vqmovn_s32(vcvtq_n_s32_f32(v622, 15)); - int16x4_t v634 = vqmovn_s32(vcvtq_n_s32_f32(v631, 15)); - float32x4_t v640 = vaddq_f32(v474, v518); - float32x4_t v649 = vsubq_f32(v474, v518); - vst1_s16((int16_t *)v1427, v553); - vst1_s16((int16_t *)v1436, v562); - vst1_s16((int16_t *)v1535, v661); - vst1_s16((int16_t *)v1544, v670); - int16x4_t v589 = vqmovn_s32(vcvtq_n_s32_f32(v586, 15)); - int16x4_t v598 = vqmovn_s32(vcvtq_n_s32_f32(v595, 15)); - int16x4_t v643 = vqmovn_s32(vcvtq_n_s32_f32(v640, 15)); - 
int16x4_t v652 = vqmovn_s32(vcvtq_n_s32_f32(v649, 15)); - vst1_s16((int16_t *)v1409, v535); - vst1_s16((int16_t *)v1418, v544); - vst1_s16((int16_t *)v1481, v607); - vst1_s16((int16_t *)v1490, v616); - vst1_s16((int16_t *)v1499, v625); - vst1_s16((int16_t *)v1508, v634); - vst1_s16((int16_t *)v1463, v589); - vst1_s16((int16_t *)v1472, v598); - vst1_s16((int16_t *)v1517, v643); - vst1_s16((int16_t *)v1526, v652); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v676 * 2; j < howmany; j += 1) { - float32x2_t v688 = v5[istride]; - float v841 = -4.2602849117736000e-02F; - float v845 = 2.0497965023262180e-01F; - float v849 = 1.0451835201736759e+00F; - float v853 = 1.7645848660222969e+00F; - float v857 = -7.2340797728605655e-01F; - float v861 = -8.9055591620606403e-02F; - float v865 = -1.0625000000000000e+00F; - float v869 = 2.5769410160110379e-01F; - float v873 = 7.7980260789483757e-01F; - float v877 = 5.4389318464570580e-01F; - float v881 = 4.2010193497052700e-01F; - float v885 = 1.2810929434228073e+00F; - float v889 = 4.4088907348175338e-01F; - float v893 = 3.1717619283272508e-01F; - float v896 = -9.0138318648016680e-01F; - float v897 = 9.0138318648016680e-01F; - float v903 = -4.3248756360072310e-01F; - float v904 = 4.3248756360072310e-01F; - float v910 = 6.6693537504044498e-01F; - float v911 = -6.6693537504044498e-01F; - float v917 = -6.0389004312516970e-01F; - float v918 = 6.0389004312516970e-01F; - float v924 = -3.6924873198582547e-01F; - float v925 = 3.6924873198582547e-01F; - float v931 = 4.8656938755549761e-01F; - float v932 = -4.8656938755549761e-01F; - float v938 = 2.3813712136760609e-01F; - float v939 = -2.3813712136760609e-01F; - float v945 = -1.5573820617422458e+00F; - float v946 = 1.5573820617422458e+00F; - float v952 = 6.5962247018731990e-01F; - float v953 = -6.5962247018731990e-01F; - float v959 = -1.4316961569866241e-01F; - float v960 = 1.4316961569866241e-01F; - float v966 = 2.3903469959860771e-01F; - float v967 = -2.3903469959860771e-01F; - float v973 
= -4.7932541949972603e-02F; - float v974 = 4.7932541949972603e-02F; - float v980 = -2.3188014856550065e+00F; - float v981 = 2.3188014856550065e+00F; - float v987 = 7.8914568419206255e-01F; - float v988 = -7.8914568419206255e-01F; - float v994 = 3.8484572871179505e+00F; - float v995 = -3.8484572871179505e+00F; - float v1001 = -1.3003804568801376e+00F; - float v1002 = 1.3003804568801376e+00F; - float v1008 = 4.0814769046889037e+00F; - float v1009 = -4.0814769046889037e+00F; - float v1015 = -1.4807159909286283e+00F; - float v1016 = 1.4807159909286283e+00F; - float v1022 = -1.3332470363551400e-02F; - float v1023 = 1.3332470363551400e-02F; - float v1029 = -3.7139778690557629e-01F; - float v1030 = 3.7139778690557629e-01F; - float v1036 = 1.9236512863456379e-01F; - float v1037 = -1.9236512863456379e-01F; - float32x2_t v1039 = (float32x2_t){v4, v4}; - float32x2_t v834 = v5[0]; - float32x2_t v842 = (float32x2_t){v841, v841}; - float32x2_t v846 = (float32x2_t){v845, v845}; - float32x2_t v850 = (float32x2_t){v849, v849}; - float32x2_t v854 = (float32x2_t){v853, v853}; - float32x2_t v858 = (float32x2_t){v857, v857}; - float32x2_t v862 = (float32x2_t){v861, v861}; - float32x2_t v866 = (float32x2_t){v865, v865}; - float32x2_t v870 = (float32x2_t){v869, v869}; - float32x2_t v874 = (float32x2_t){v873, v873}; - float32x2_t v878 = (float32x2_t){v877, v877}; - float32x2_t v882 = (float32x2_t){v881, v881}; - float32x2_t v886 = (float32x2_t){v885, v885}; - float32x2_t v890 = (float32x2_t){v889, v889}; - float32x2_t v894 = (float32x2_t){v893, v893}; - float32x2_t v898 = (float32x2_t){v896, v897}; - float32x2_t v905 = (float32x2_t){v903, v904}; - float32x2_t v912 = (float32x2_t){v910, v911}; - float32x2_t v919 = (float32x2_t){v917, v918}; - float32x2_t v926 = (float32x2_t){v924, v925}; - float32x2_t v933 = (float32x2_t){v931, v932}; - float32x2_t v940 = (float32x2_t){v938, v939}; - float32x2_t v947 = (float32x2_t){v945, v946}; - float32x2_t v954 = (float32x2_t){v952, v953}; - float32x2_t 
v961 = (float32x2_t){v959, v960}; - float32x2_t v968 = (float32x2_t){v966, v967}; - float32x2_t v975 = (float32x2_t){v973, v974}; - float32x2_t v982 = (float32x2_t){v980, v981}; - float32x2_t v989 = (float32x2_t){v987, v988}; - float32x2_t v996 = (float32x2_t){v994, v995}; - float32x2_t v1003 = (float32x2_t){v1001, v1002}; - float32x2_t v1010 = (float32x2_t){v1008, v1009}; - float32x2_t v1017 = (float32x2_t){v1015, v1016}; - float32x2_t v1024 = (float32x2_t){v1022, v1023}; - float32x2_t v1031 = (float32x2_t){v1029, v1030}; - float32x2_t v1038 = (float32x2_t){v1036, v1037}; - float32x2_t v693 = v5[istride * 16]; - float32x2_t v700 = v5[istride * 3]; - float32x2_t v705 = v5[istride * 14]; - float32x2_t v712 = v5[istride * 9]; - float32x2_t v717 = v5[istride * 8]; - float32x2_t v724 = v5[istride * 10]; - float32x2_t v729 = v5[istride * 7]; - float32x2_t v736 = v5[istride * 13]; - float32x2_t v741 = v5[istride * 4]; - float32x2_t v748 = v5[istride * 5]; - float32x2_t v753 = v5[istride * 12]; - float32x2_t v760 = v5[istride * 15]; - float32x2_t v765 = v5[istride * 2]; - float32x2_t v772 = v5[istride * 11]; - float32x2_t v777 = v5[istride * 6]; - float32x2_t v900 = vmul_f32(v1039, v898); - float32x2_t v907 = vmul_f32(v1039, v905); - float32x2_t v914 = vmul_f32(v1039, v912); - float32x2_t v921 = vmul_f32(v1039, v919); - float32x2_t v928 = vmul_f32(v1039, v926); - float32x2_t v935 = vmul_f32(v1039, v933); - float32x2_t v942 = vmul_f32(v1039, v940); - float32x2_t v949 = vmul_f32(v1039, v947); - float32x2_t v956 = vmul_f32(v1039, v954); - float32x2_t v963 = vmul_f32(v1039, v961); - float32x2_t v970 = vmul_f32(v1039, v968); - float32x2_t v977 = vmul_f32(v1039, v975); - float32x2_t v984 = vmul_f32(v1039, v982); - float32x2_t v991 = vmul_f32(v1039, v989); - float32x2_t v998 = vmul_f32(v1039, v996); - float32x2_t v1005 = vmul_f32(v1039, v1003); - float32x2_t v1012 = vmul_f32(v1039, v1010); - float32x2_t v1019 = vmul_f32(v1039, v1017); - float32x2_t v1026 = vmul_f32(v1039, 
v1024); - float32x2_t v1033 = vmul_f32(v1039, v1031); - float32x2_t v1040 = vmul_f32(v1039, v1038); - float32x2_t v694 = vadd_f32(v688, v693); - float32x2_t v695 = vsub_f32(v688, v693); - float32x2_t v706 = vadd_f32(v700, v705); - float32x2_t v707 = vsub_f32(v700, v705); - float32x2_t v718 = vadd_f32(v712, v717); - float32x2_t v719 = vsub_f32(v712, v717); - float32x2_t v730 = vadd_f32(v724, v729); - float32x2_t v731 = vsub_f32(v724, v729); - float32x2_t v742 = vadd_f32(v736, v741); - float32x2_t v743 = vsub_f32(v736, v741); - float32x2_t v754 = vadd_f32(v748, v753); - float32x2_t v755 = vsub_f32(v748, v753); - float32x2_t v766 = vadd_f32(v760, v765); - float32x2_t v767 = vsub_f32(v760, v765); - float32x2_t v778 = vadd_f32(v772, v777); - float32x2_t v779 = vsub_f32(v772, v777); - float32x2_t v780 = vadd_f32(v694, v742); - float32x2_t v781 = vadd_f32(v706, v754); - float32x2_t v782 = vadd_f32(v718, v766); - float32x2_t v783 = vadd_f32(v730, v778); - float32x2_t v786 = vsub_f32(v694, v742); - float32x2_t v787 = vsub_f32(v706, v754); - float32x2_t v788 = vsub_f32(v718, v766); - float32x2_t v789 = vsub_f32(v730, v778); - float32x2_t v800 = vadd_f32(v695, v719); - float32x2_t v801 = vadd_f32(v707, v731); - float32x2_t v802 = vsub_f32(v695, v719); - float32x2_t v803 = vsub_f32(v779, v755); - float32x2_t v804 = vadd_f32(v743, v767); - float32x2_t v805 = vadd_f32(v755, v779); - float32x2_t v806 = vsub_f32(v743, v767); - float32x2_t v807 = vsub_f32(v707, v731); - float32x2_t v820 = vadd_f32(v695, v743); - float32x2_t v821 = vadd_f32(v731, v779); - float32x2_t v992 = vrev64_f32(v695); - float32x2_t v999 = vrev64_f32(v743); - float32x2_t v1013 = vrev64_f32(v731); - float32x2_t v1020 = vrev64_f32(v779); - float32x2_t v784 = vadd_f32(v780, v782); - float32x2_t v785 = vadd_f32(v781, v783); - float32x2_t v790 = vsub_f32(v780, v782); - float32x2_t v791 = vsub_f32(v781, v783); - float32x2_t v794 = vadd_f32(v787, v789); - float32x2_t v795 = vadd_f32(v786, v788); - float32x2_t v797 = 
vsub_f32(v788, v789); - float32x2_t v798 = vsub_f32(v786, v787); - float32x2_t v808 = vadd_f32(v800, v801); - float32x2_t v809 = vadd_f32(v804, v805); - float32x2_t v811 = vsub_f32(v800, v801); - float32x2_t v812 = vsub_f32(v804, v805); - float32x2_t v814 = vadd_f32(v802, v803); - float32x2_t v815 = vadd_f32(v806, v807); - float32x2_t v817 = vsub_f32(v802, v803); - float32x2_t v818 = vsub_f32(v806, v807); - float32x2_t v843 = vmul_f32(v786, v842); - float32x2_t v847 = vmul_f32(v787, v846); - float32x2_t v851 = vmul_f32(v788, v850); - float32x2_t v855 = vmul_f32(v789, v854); - float32x2_t v985 = vrev64_f32(v820); - float32x2_t v993 = vmul_f32(v992, v991); - float32x2_t v1000 = vmul_f32(v999, v998); - float32x2_t v1006 = vrev64_f32(v821); - float32x2_t v1014 = vmul_f32(v1013, v1012); - float32x2_t v1021 = vmul_f32(v1020, v1019); - float32x2_t v792 = vadd_f32(v784, v785); - float32x2_t v793 = vsub_f32(v784, v785); - float32x2_t v796 = vsub_f32(v795, v794); - float32x2_t v799 = vadd_f32(v790, v791); - float32x2_t v810 = vadd_f32(v808, v809); - float32x2_t v813 = vadd_f32(v811, v812); - float32x2_t v816 = vadd_f32(v814, v815); - float32x2_t v819 = vadd_f32(v817, v818); - float32x2_t v822 = vsub_f32(v815, v809); - float32x2_t v825 = vsub_f32(v808, v814); - float32x2_t v859 = vmul_f32(v790, v858); - float32x2_t v863 = vmul_f32(v791, v862); - float32x2_t v875 = vmul_f32(v794, v874); - float32x2_t v879 = vmul_f32(v795, v878); - float32x2_t v887 = vmul_f32(v797, v886); - float32x2_t v891 = vmul_f32(v798, v890); - float32x2_t v901 = vrev64_f32(v808); - float32x2_t v908 = vrev64_f32(v809); - float32x2_t v922 = vrev64_f32(v811); - float32x2_t v929 = vrev64_f32(v812); - float32x2_t v943 = vrev64_f32(v814); - float32x2_t v950 = vrev64_f32(v815); - float32x2_t v964 = vrev64_f32(v817); - float32x2_t v971 = vrev64_f32(v818); - float32x2_t v986 = vmul_f32(v985, v984); - float32x2_t v1007 = vmul_f32(v1006, v1005); - float32x2_t v823 = vadd_f32(v822, v695); - float32x2_t v826 = 
vadd_f32(v825, v731); - float32x2_t v835 = vadd_f32(v834, v792); - float32x2_t v867 = vmul_f32(v792, v866); - float32x2_t v871 = vmul_f32(v793, v870); - float32x2_t v883 = vmul_f32(v796, v882); - float32x2_t v895 = vmul_f32(v799, v894); - float32x2_t v902 = vmul_f32(v901, v900); - float32x2_t v909 = vmul_f32(v908, v907); - float32x2_t v915 = vrev64_f32(v810); - float32x2_t v923 = vmul_f32(v922, v921); - float32x2_t v930 = vmul_f32(v929, v928); - float32x2_t v936 = vrev64_f32(v813); - float32x2_t v944 = vmul_f32(v943, v942); - float32x2_t v951 = vmul_f32(v950, v949); - float32x2_t v957 = vrev64_f32(v816); - float32x2_t v965 = vmul_f32(v964, v963); - float32x2_t v972 = vmul_f32(v971, v970); - float32x2_t v978 = vrev64_f32(v819); - float32x2_t v1045 = vadd_f32(v855, v887); - float32x2_t v1046 = vsub_f32(v887, v851); - float32x2_t v1047 = vadd_f32(v847, v891); - float32x2_t v1048 = vsub_f32(v843, v891); - float32x2_t v824 = vsub_f32(v823, v821); - float32x2_t v827 = vadd_f32(v826, v743); - float32x2_t v916 = vmul_f32(v915, v914); - float32x2_t v937 = vmul_f32(v936, v935); - float32x2_t v958 = vmul_f32(v957, v956); - float32x2_t v979 = vmul_f32(v978, v977); - float32x2_t v1043 = vadd_f32(v875, v883); - float32x2_t v1044 = vsub_f32(v879, v883); - float32x2_t v1049 = vsub_f32(v895, v863); - float32x2_t v1050 = vadd_f32(v895, v859); - float32x2_t v1051 = vadd_f32(v867, v835); - int16x4_t v1119 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v835, 15), (int32x2_t){0, 0})); - float32x2_t v828 = vsub_f32(v827, v779); - float32x2_t v1027 = vrev64_f32(v824); - float32x2_t v1052 = vadd_f32(v871, v1051); - float32x2_t v1053 = vsub_f32(v1051, v871); - float32x2_t v1054 = vsub_f32(v1043, v1045); - float32x2_t v1056 = vadd_f32(v1044, v1046); - float32x2_t v1058 = vadd_f32(v1043, v1047); - float32x2_t v1060 = vadd_f32(v1044, v1048); - float32x2_t v1070 = vadd_f32(v902, v916); - float32x2_t v1071 = vadd_f32(v909, v916); - float32x2_t v1072 = vadd_f32(v923, v937); - float32x2_t v1073 = 
vadd_f32(v930, v937); - float32x2_t v1074 = vadd_f32(v944, v958); - float32x2_t v1075 = vadd_f32(v951, v958); - float32x2_t v1076 = vadd_f32(v965, v979); - float32x2_t v1077 = vadd_f32(v972, v979); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1119), 0); - float32x2_t v829 = vadd_f32(v824, v828); - float32x2_t v1028 = vmul_f32(v1027, v1026); - float32x2_t v1034 = vrev64_f32(v828); - float32x2_t v1055 = vadd_f32(v1049, v1052); - float32x2_t v1057 = vadd_f32(v1050, v1053); - float32x2_t v1059 = vsub_f32(v1052, v1049); - float32x2_t v1061 = vsub_f32(v1053, v1050); - float32x2_t v1081 = vadd_f32(v1070, v1072); - float32x2_t v1082 = vsub_f32(v1070, v1072); - float32x2_t v1083 = vadd_f32(v1071, v1073); - float32x2_t v1084 = vsub_f32(v1071, v1073); - float32x2_t v1085 = vadd_f32(v1074, v1076); - float32x2_t v1086 = vsub_f32(v1076, v1074); - float32x2_t v1087 = vadd_f32(v1075, v1077); - float32x2_t v1088 = vsub_f32(v1077, v1075); - float32x2_t v1035 = vmul_f32(v1034, v1033); - float32x2_t v1041 = vrev64_f32(v829); - float32x2_t v1062 = vadd_f32(v1054, v1055); - float32x2_t v1063 = vadd_f32(v1056, v1057); - float32x2_t v1064 = vadd_f32(v1058, v1059); - float32x2_t v1065 = vadd_f32(v1060, v1061); - float32x2_t v1066 = vsub_f32(v1055, v1054); - float32x2_t v1067 = vsub_f32(v1057, v1056); - float32x2_t v1068 = vsub_f32(v1059, v1058); - float32x2_t v1069 = vsub_f32(v1061, v1060); - float32x2_t v1098 = vadd_f32(v1083, v1087); - float32x2_t v1100 = vadd_f32(v1082, v1088); - float32x2_t v1102 = vsub_f32(v1081, v1085); - float32x2_t v1104 = vsub_f32(v1088, v1082); - float32x2_t v1106 = vadd_f32(v1081, v1085); - float32x2_t v1109 = vsub_f32(v1086, v1084); - float32x2_t v1112 = vsub_f32(v1087, v1083); - float32x2_t v1115 = vadd_f32(v1084, v1086); - float32x2_t v1042 = vmul_f32(v1041, v1040); - float32x2_t v1089 = vsub_f32(v1028, v1035); - float32x2_t v1078 = vadd_f32(v1042, v1035); - float32x2_t v1091 = vadd_f32(v1089, v1089); - float32x2_t v1116 = vsub_f32(v1115, v1089); - 
float32x2_t v1079 = vadd_f32(v986, v1078); - float32x2_t v1092 = vsub_f32(v1007, v1091); - float32x2_t v1095 = vadd_f32(v1078, v1078); - float32x2_t v1113 = vadd_f32(v1112, v1091); - float32x2_t v1151 = vadd_f32(v1069, v1116); - float32x2_t v1158 = vsub_f32(v1069, v1116); - float32x2_t v1080 = vadd_f32(v1079, v993); - float32x2_t v1090 = vadd_f32(v1079, v1000); - float32x2_t v1093 = vadd_f32(v1092, v1014); - float32x2_t v1094 = vadd_f32(v1092, v1021); - float32x2_t v1096 = vadd_f32(v1095, v1095); - float32x2_t v1097 = vadd_f32(v1089, v1095); - float32x2_t v1103 = vadd_f32(v1102, v1095); - float32x2_t v1114 = vadd_f32(v1113, v1095); - int16x4_t v1154 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1151, 15), (int32x2_t){0, 0})); - int16x4_t v1161 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1158, 15), (int32x2_t){0, 0})); - float32x2_t v1099 = vadd_f32(v1098, v1090); - float32x2_t v1101 = vadd_f32(v1100, v1093); - float32x2_t v1105 = vsub_f32(v1104, v1097); - float32x2_t v1107 = vadd_f32(v1106, v1080); - float32x2_t v1110 = vsub_f32(v1109, v1094); - float32x2_t v1137 = vadd_f32(v1064, v1103); - float32x2_t v1144 = vsub_f32(v1064, v1103); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1154), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1161), 0); - float32x2_t v1221 = vadd_f32(v1068, v1114); - float32x2_t v1228 = vsub_f32(v1068, v1114); - float32x2_t v1108 = vadd_f32(v1107, v1089); - float32x2_t v1111 = vadd_f32(v1110, v1096); - float32x2_t v1123 = vadd_f32(v1062, v1099); - float32x2_t v1130 = vsub_f32(v1062, v1099); - int16x4_t v1140 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1137, 15), (int32x2_t){0, 0})); - int16x4_t v1147 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1144, 15), (int32x2_t){0, 0})); - float32x2_t v1179 = vadd_f32(v1065, v1105); - float32x2_t v1186 = vsub_f32(v1065, v1105); - float32x2_t v1193 = vadd_f32(v1063, v1101); - float32x2_t v1200 = vsub_f32(v1063, v1101); - int16x4_t v1224 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1221, 15), (int32x2_t){0, 0})); - int16x4_t v1231 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1228, 15), (int32x2_t){0, 0})); - int16x4_t v1126 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1123, 15), (int32x2_t){0, 0})); - int16x4_t v1133 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1130, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1140), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1147), 0); - float32x2_t v1165 = vadd_f32(v1066, v1108); - float32x2_t v1172 = vsub_f32(v1066, v1108); - int16x4_t v1182 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1179, 15), (int32x2_t){0, 0})); - int16x4_t v1189 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1186, 15), (int32x2_t){0, 0})); - int16x4_t v1196 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1193, 15), (int32x2_t){0, 0})); - int16x4_t v1203 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1200, 15), (int32x2_t){0, 0})); - float32x2_t v1207 = vadd_f32(v1067, v1111); - float32x2_t v1214 = vsub_f32(v1067, v1111); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1224), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1231), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1126), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1133), 0); - int16x4_t v1168 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1165, 15), (int32x2_t){0, 0})); - int16x4_t v1175 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1172, 15), (int32x2_t){0, 0})); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1182), 0); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1189), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1196), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1203), 0); - int16x4_t v1210 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1207, 15), (int32x2_t){0, 0})); - int16x4_t v1217 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1214, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = 
vget_lane_s32(vreinterpret_s32_s16(v1168), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1175), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1210), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1217), 0); + float32x2_t v363 = (float32x2_t){v361, v362}; + float32x2_t v370 = (float32x2_t){v368, v369}; + float32x2_t v25 = v5[istride * 16]; + float32x2_t v32 = v5[istride * 3]; + float32x2_t v37 = v5[istride * 14]; + float32x2_t v44 = v5[istride * 9]; + float32x2_t v49 = v5[istride * 8]; + float32x2_t v56 = v5[istride * 10]; + float32x2_t v61 = v5[istride * 7]; + float32x2_t v68 = v5[istride * 13]; + float32x2_t v73 = v5[istride * 4]; + float32x2_t v80 = v5[istride * 5]; + float32x2_t v85 = v5[istride * 12]; + float32x2_t v92 = v5[istride * 15]; + float32x2_t v97 = v5[istride * 2]; + float32x2_t v104 = v5[istride * 11]; + float32x2_t v109 = v5[istride * 6]; + float32x2_t v232 = vmul_f32(v371, v230); + float32x2_t v239 = vmul_f32(v371, v237); + float32x2_t v246 = vmul_f32(v371, v244); + float32x2_t v253 = vmul_f32(v371, v251); + float32x2_t v260 = vmul_f32(v371, v258); + float32x2_t v267 = vmul_f32(v371, v265); + float32x2_t v274 = vmul_f32(v371, v272); + float32x2_t v281 = vmul_f32(v371, v279); + float32x2_t v288 = vmul_f32(v371, v286); + float32x2_t v295 = vmul_f32(v371, v293); + float32x2_t v302 = vmul_f32(v371, v300); + float32x2_t v309 = vmul_f32(v371, v307); + float32x2_t v316 = vmul_f32(v371, v314); + float32x2_t v323 = vmul_f32(v371, v321); + float32x2_t v330 = vmul_f32(v371, v328); + float32x2_t v337 = vmul_f32(v371, v335); + float32x2_t v344 = vmul_f32(v371, v342); + float32x2_t v351 = vmul_f32(v371, v349); + float32x2_t v358 = vmul_f32(v371, v356); + float32x2_t v365 = vmul_f32(v371, v363); + float32x2_t v372 = vmul_f32(v371, v370); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = 
vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v68, v73); + float32x2_t v86 = vadd_f32(v80, v85); + float32x2_t v87 = vsub_f32(v80, v85); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v110 = vadd_f32(v104, v109); + float32x2_t v111 = vsub_f32(v104, v109); + float32x2_t v112 = vadd_f32(v26, v74); + float32x2_t v113 = vadd_f32(v38, v86); + float32x2_t v114 = vadd_f32(v50, v98); + float32x2_t v115 = vadd_f32(v62, v110); + float32x2_t v118 = vsub_f32(v26, v74); + float32x2_t v119 = vsub_f32(v38, v86); + float32x2_t v120 = vsub_f32(v50, v98); + float32x2_t v121 = vsub_f32(v62, v110); + float32x2_t v132 = vadd_f32(v27, v51); + float32x2_t v133 = vadd_f32(v39, v63); + float32x2_t v134 = vsub_f32(v27, v51); + float32x2_t v135 = vsub_f32(v111, v87); + float32x2_t v136 = vadd_f32(v75, v99); + float32x2_t v137 = vadd_f32(v87, v111); + float32x2_t v138 = vsub_f32(v75, v99); + float32x2_t v139 = vsub_f32(v39, v63); + float32x2_t v152 = vadd_f32(v27, v75); + float32x2_t v153 = vadd_f32(v63, v111); + float32x2_t v324 = vrev64_f32(v27); + float32x2_t v331 = vrev64_f32(v75); + float32x2_t v345 = vrev64_f32(v63); + float32x2_t v352 = vrev64_f32(v111); + float32x2_t v116 = vadd_f32(v112, v114); + float32x2_t v117 = vadd_f32(v113, v115); + float32x2_t v122 = vsub_f32(v112, v114); + float32x2_t v123 = vsub_f32(v113, v115); + float32x2_t v126 = vadd_f32(v119, v121); + float32x2_t v127 = vadd_f32(v118, v120); + float32x2_t v129 = vsub_f32(v120, v121); + float32x2_t v130 = vsub_f32(v118, v119); + float32x2_t v140 = vadd_f32(v132, v133); + float32x2_t v141 = vadd_f32(v136, v137); + float32x2_t v143 = vsub_f32(v132, v133); + float32x2_t v144 = vsub_f32(v136, v137); + float32x2_t v146 = vadd_f32(v134, v135); + float32x2_t v147 = vadd_f32(v138, v139); + float32x2_t v149 = vsub_f32(v134, 
v135); + float32x2_t v150 = vsub_f32(v138, v139); + float32x2_t v175 = vmul_f32(v118, v174); + float32x2_t v179 = vmul_f32(v119, v178); + float32x2_t v183 = vmul_f32(v120, v182); + float32x2_t v187 = vmul_f32(v121, v186); + float32x2_t v317 = vrev64_f32(v152); + float32x2_t v325 = vmul_f32(v324, v323); + float32x2_t v332 = vmul_f32(v331, v330); + float32x2_t v338 = vrev64_f32(v153); + float32x2_t v346 = vmul_f32(v345, v344); + float32x2_t v353 = vmul_f32(v352, v351); + float32x2_t v124 = vadd_f32(v116, v117); + float32x2_t v125 = vsub_f32(v116, v117); + float32x2_t v128 = vsub_f32(v127, v126); + float32x2_t v131 = vadd_f32(v122, v123); + float32x2_t v142 = vadd_f32(v140, v141); + float32x2_t v145 = vadd_f32(v143, v144); + float32x2_t v148 = vadd_f32(v146, v147); + float32x2_t v151 = vadd_f32(v149, v150); + float32x2_t v154 = vsub_f32(v147, v141); + float32x2_t v157 = vsub_f32(v140, v146); + float32x2_t v191 = vmul_f32(v122, v190); + float32x2_t v195 = vmul_f32(v123, v194); + float32x2_t v207 = vmul_f32(v126, v206); + float32x2_t v211 = vmul_f32(v127, v210); + float32x2_t v219 = vmul_f32(v129, v218); + float32x2_t v223 = vmul_f32(v130, v222); + float32x2_t v233 = vrev64_f32(v140); + float32x2_t v240 = vrev64_f32(v141); + float32x2_t v254 = vrev64_f32(v143); + float32x2_t v261 = vrev64_f32(v144); + float32x2_t v275 = vrev64_f32(v146); + float32x2_t v282 = vrev64_f32(v147); + float32x2_t v296 = vrev64_f32(v149); + float32x2_t v303 = vrev64_f32(v150); + float32x2_t v318 = vmul_f32(v317, v316); + float32x2_t v339 = vmul_f32(v338, v337); + float32x2_t v155 = vadd_f32(v154, v27); + float32x2_t v158 = vadd_f32(v157, v63); + float32x2_t v167 = vadd_f32(v166, v124); + float32x2_t v199 = vmul_f32(v124, v198); + float32x2_t v203 = vmul_f32(v125, v202); + float32x2_t v215 = vmul_f32(v128, v214); + float32x2_t v227 = vmul_f32(v131, v226); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v247 = vrev64_f32(v142); + float32x2_t 
v255 = vmul_f32(v254, v253); + float32x2_t v262 = vmul_f32(v261, v260); + float32x2_t v268 = vrev64_f32(v145); + float32x2_t v276 = vmul_f32(v275, v274); + float32x2_t v283 = vmul_f32(v282, v281); + float32x2_t v289 = vrev64_f32(v148); + float32x2_t v297 = vmul_f32(v296, v295); + float32x2_t v304 = vmul_f32(v303, v302); + float32x2_t v310 = vrev64_f32(v151); + float32x2_t v377 = vadd_f32(v187, v219); + float32x2_t v378 = vsub_f32(v219, v183); + float32x2_t v379 = vadd_f32(v179, v223); + float32x2_t v380 = vsub_f32(v175, v223); + float32x2_t v156 = vsub_f32(v155, v153); + float32x2_t v159 = vadd_f32(v158, v75); + float32x2_t v248 = vmul_f32(v247, v246); + float32x2_t v269 = vmul_f32(v268, v267); + float32x2_t v290 = vmul_f32(v289, v288); + float32x2_t v311 = vmul_f32(v310, v309); + float32x2_t v375 = vadd_f32(v207, v215); + float32x2_t v376 = vsub_f32(v211, v215); + float32x2_t v381 = vsub_f32(v227, v195); + float32x2_t v382 = vadd_f32(v227, v191); + float32x2_t v383 = vadd_f32(v199, v167); + int16x4_t v451 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v167, 15), (int32x2_t){0, 0})); + float32x2_t v160 = vsub_f32(v159, v111); + float32x2_t v359 = vrev64_f32(v156); + float32x2_t v384 = vadd_f32(v203, v383); + float32x2_t v385 = vsub_f32(v383, v203); + float32x2_t v386 = vsub_f32(v375, v377); + float32x2_t v388 = vadd_f32(v376, v378); + float32x2_t v390 = vadd_f32(v375, v379); + float32x2_t v392 = vadd_f32(v376, v380); + float32x2_t v402 = vadd_f32(v234, v248); + float32x2_t v403 = vadd_f32(v241, v248); + float32x2_t v404 = vadd_f32(v255, v269); + float32x2_t v405 = vadd_f32(v262, v269); + float32x2_t v406 = vadd_f32(v276, v290); + float32x2_t v407 = vadd_f32(v283, v290); + float32x2_t v408 = vadd_f32(v297, v311); + float32x2_t v409 = vadd_f32(v304, v311); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v451), 0); + float32x2_t v161 = vadd_f32(v156, v160); + float32x2_t v360 = vmul_f32(v359, v358); + float32x2_t v366 = vrev64_f32(v160); + float32x2_t v387 = vadd_f32(v381, 
v384); + float32x2_t v389 = vadd_f32(v382, v385); + float32x2_t v391 = vsub_f32(v384, v381); + float32x2_t v393 = vsub_f32(v385, v382); + float32x2_t v413 = vadd_f32(v402, v404); + float32x2_t v414 = vsub_f32(v402, v404); + float32x2_t v415 = vadd_f32(v403, v405); + float32x2_t v416 = vsub_f32(v403, v405); + float32x2_t v417 = vadd_f32(v406, v408); + float32x2_t v418 = vsub_f32(v408, v406); + float32x2_t v419 = vadd_f32(v407, v409); + float32x2_t v420 = vsub_f32(v409, v407); + float32x2_t v367 = vmul_f32(v366, v365); + float32x2_t v373 = vrev64_f32(v161); + float32x2_t v394 = vadd_f32(v386, v387); + float32x2_t v395 = vadd_f32(v388, v389); + float32x2_t v396 = vadd_f32(v390, v391); + float32x2_t v397 = vadd_f32(v392, v393); + float32x2_t v398 = vsub_f32(v387, v386); + float32x2_t v399 = vsub_f32(v389, v388); + float32x2_t v400 = vsub_f32(v391, v390); + float32x2_t v401 = vsub_f32(v393, v392); + float32x2_t v430 = vadd_f32(v415, v419); + float32x2_t v432 = vadd_f32(v414, v420); + float32x2_t v434 = vsub_f32(v413, v417); + float32x2_t v436 = vsub_f32(v420, v414); + float32x2_t v438 = vadd_f32(v413, v417); + float32x2_t v441 = vsub_f32(v418, v416); + float32x2_t v444 = vsub_f32(v419, v415); + float32x2_t v447 = vadd_f32(v416, v418); + float32x2_t v374 = vmul_f32(v373, v372); + float32x2_t v421 = vsub_f32(v360, v367); + float32x2_t v410 = vadd_f32(v374, v367); + float32x2_t v423 = vadd_f32(v421, v421); + float32x2_t v448 = vsub_f32(v447, v421); + float32x2_t v411 = vadd_f32(v318, v410); + float32x2_t v424 = vsub_f32(v339, v423); + float32x2_t v427 = vadd_f32(v410, v410); + float32x2_t v445 = vadd_f32(v444, v423); + float32x2_t v483 = vadd_f32(v401, v448); + float32x2_t v490 = vsub_f32(v401, v448); + float32x2_t v412 = vadd_f32(v411, v325); + float32x2_t v422 = vadd_f32(v411, v332); + float32x2_t v425 = vadd_f32(v424, v346); + float32x2_t v426 = vadd_f32(v424, v353); + float32x2_t v428 = vadd_f32(v427, v427); + float32x2_t v429 = vadd_f32(v421, v427); + float32x2_t v435 
= vadd_f32(v434, v427); + float32x2_t v446 = vadd_f32(v445, v427); + int16x4_t v486 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v483, 15), (int32x2_t){0, 0})); + int16x4_t v493 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v490, 15), (int32x2_t){0, 0})); + float32x2_t v431 = vadd_f32(v430, v422); + float32x2_t v433 = vadd_f32(v432, v425); + float32x2_t v437 = vsub_f32(v436, v429); + float32x2_t v439 = vadd_f32(v438, v412); + float32x2_t v442 = vsub_f32(v441, v426); + float32x2_t v469 = vadd_f32(v396, v435); + float32x2_t v476 = vsub_f32(v396, v435); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v486), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v493), 0); + float32x2_t v553 = vadd_f32(v400, v446); + float32x2_t v560 = vsub_f32(v400, v446); + float32x2_t v440 = vadd_f32(v439, v421); + float32x2_t v443 = vadd_f32(v442, v428); + float32x2_t v455 = vadd_f32(v394, v431); + float32x2_t v462 = vsub_f32(v394, v431); + int16x4_t v472 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v469, 15), (int32x2_t){0, 0})); + int16x4_t v479 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v476, 15), (int32x2_t){0, 0})); + float32x2_t v511 = vadd_f32(v397, v437); + float32x2_t v518 = vsub_f32(v397, v437); + float32x2_t v525 = vadd_f32(v395, v433); + float32x2_t v532 = vsub_f32(v395, v433); + int16x4_t v556 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v553, 15), (int32x2_t){0, 0})); + int16x4_t v563 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v560, 15), (int32x2_t){0, 0})); + int16x4_t v458 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v455, 15), (int32x2_t){0, 0})); + int16x4_t v465 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v462, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v472), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v479), 0); + float32x2_t v497 = vadd_f32(v398, v440); + float32x2_t v504 = vsub_f32(v398, v440); + int16x4_t v514 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v511, 15), (int32x2_t){0, 0})); + int16x4_t v521 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v518, 15), (int32x2_t){0, 0})); + int16x4_t v528 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v525, 15), (int32x2_t){0, 0})); + int16x4_t v535 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v532, 15), (int32x2_t){0, 0})); + float32x2_t v539 = vadd_f32(v399, v443); + float32x2_t v546 = vsub_f32(v399, v443); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v556), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v563), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v458), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v465), 0); + int16x4_t v500 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v497, 15), (int32x2_t){0, 0})); + int16x4_t v507 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v504, 15), (int32x2_t){0, 0})); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v514), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v521), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v528), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v535), 0); + int16x4_t v542 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v539, 15), (int32x2_t){0, 0})); + int16x4_t v549 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v546, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v500), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v507), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v542), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v549), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -8883,167 +5643,91 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v784)[0])); svfloat32_t v1027 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v793)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v997), "w"(v999)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v997), "w"(v999)); - svfloat32_t v48; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1001), "w"(v1003)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1001), "w"(v1003)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v1005), "w"(v1007)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v1005), "w"(v1007)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1009), "w"(v1011)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1009), "w"(v1011)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v1013), "w"(v1015)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v1013), "w"(v1015)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v1017), "w"(v1019)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v1017), "w"(v1019)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1021), "w"(v1023)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1021), "w"(v1023)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v1025), "w"(v1027)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v1025), "w"(v1027)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v32), "w"(v96)); - svfloat32_t v147; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v48), "w"(v112)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v64), "w"(v128)); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v80), "w"(v144)); - svfloat32_t v152; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v32), "w"(v96)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v48), "w"(v112)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v64), "w"(v128)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v80), "w"(v144)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v33), 
"w"(v65)); - svfloat32_t v167; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v49), "w"(v81)); - svfloat32_t v168; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v33), "w"(v65)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v145), "w"(v113)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v97), "w"(v129)); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v113), "w"(v145)); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v97), "w"(v129)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v49), "w"(v81)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v33), "w"(v97)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v81), "w"(v145)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v146), "w"(v148)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v147), "w"(v149)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v146), "w"(v148)); - svfloat32_t v157; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v147), "w"(v149)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v153), "w"(v155)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v154)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v154), "w"(v155)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v152), "w"(v153)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v166), "w"(v167)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v170), "w"(v171)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v166), "w"(v167)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v170), "w"(v171)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v168), "w"(v169)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) 
: "w"(v172), "w"(v173)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v168), "w"(v169)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v172), "w"(v173)); - svfloat32_t v223; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v154), "w"(v809)); - svfloat32_t zero390; - asm volatile("mov %0.s, #0" : "=w"(zero390)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v997, v999); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v997, v999); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1001, v1003); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1001, v1003); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v1005, v1007); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v1005, v1007); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1009, v1011); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1009, v1011); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v1013, v1015); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v1013, v1015); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v1017, v1019); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v1017, v1019); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1021, v1023); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1021, v1023); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v1025, v1027); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v1025, v1027); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v32, v96); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v48, v112); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v64, v128); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v80, v144); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v32, v96); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v48, v112); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v64, v128); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v80, v144); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v33, v65); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), 
v33, v65); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v145, v113); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v97, v129); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v113, v145); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v97, v129); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v49, v81); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v33, v97); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v81, v145); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v154); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v154, v155); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v152, v153); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v223 = svmul_f32_x(svptrue_b32(), v154, v809); + svfloat32_t zero390 = svdup_n_f32(0); svfloat32_t v390 = svcmla_f32_x(pred_full, zero390, v836, v187, 90); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v150), "w"(v151)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v150), "w"(v151)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v161), "w"(v160)); - svfloat32_t v165; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v156), "w"(v157)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : 
"w"(v174), "w"(v175)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v177), "w"(v178)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v180), "w"(v181)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v183), "w"(v184)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v181), "w"(v175)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v174), "w"(v180)); - svfloat32_t v233; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v156), "w"(v811)); - svfloat32_t v238; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v157), "w"(v812)); - svfloat32_t v268; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v163), "w"(v818)); - svfloat32_t v273; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v164), "w"(v819)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v33)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v191), "w"(v81)); - svfloat32_t v203; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v1029), "w"(v158)); - svfloat32_t v263; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v162), "w"(v817)); - svfloat32_t zero299; - asm volatile("mov %0.s, #0" : "=w"(zero299)); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v150, v151); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v150, v151); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v161, v160); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v156, v157); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v174, v175); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v177, v178); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v180, v181); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v183, v184); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v181, v175); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v174, v180); + svfloat32_t v233 = svmul_f32_x(svptrue_b32(), v156, v811); + svfloat32_t v238 = svmul_f32_x(svptrue_b32(), v157, v812); + svfloat32_t v268 = 
svmul_f32_x(svptrue_b32(), v163, v818); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v164, v819); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v33); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v191, v81); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v1029, v158); + svfloat32_t v263 = svmul_f32_x(svptrue_b32(), v162, v817); + svfloat32_t zero299 = svdup_n_f32(0); svfloat32_t v299 = svcmla_f32_x(pred_full, zero299, v823, v176, 90); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v826, v179, 90); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v829, v182, 90); - svfloat32_t zero362; - asm volatile("mov %0.s, #0" : "=w"(zero362)); + svfloat32_t zero362 = svdup_n_f32(0); svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v832, v185, 90); svfloat32_t v428 = svmla_f32_x(pred_full, v268, v155, v810); svfloat32_t v429 = svnmls_f32_x(pred_full, v223, v163, v818); svfloat32_t v430 = svmla_f32_x(pred_full, v273, v153, v808); svfloat32_t v431 = svnmls_f32_x(pred_full, v273, v152, v807); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v189), "w"(v187)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v97)); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v189, v187); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v97); svfloat32_t v426 = svmla_f32_x(pred_full, v263, v160, v815); svfloat32_t v427 = svnmls_f32_x(pred_full, v263, v161, v816); svfloat32_t v432 = svnmls_f32_x(pred_full, v238, v165, v820); @@ -9062,113 +5746,65 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v203, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v194; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v193), "w"(v145)); - svfloat32_t zero411; - asm volatile("mov %0.s, #0" : "=w"(zero411)); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v193, v145); + svfloat32_t zero411 = svdup_n_f32(0); svfloat32_t v411 = svcmla_f32_x(pred_full, zero411, v839, v190, 90); svfloat32_t v435 = svmla_f32_x(pred_full, v434, v159, v814); svfloat32_t v436 = svmls_f32_x(pred_full, v434, v159, v814); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v426), "w"(v428)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v427), "w"(v429)); - svfloat32_t v441; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v426), "w"(v430)); - svfloat32_t v443; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v427), "w"(v431)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v453), "w"(v455)); - svfloat32_t v465; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v453), "w"(v455)); - svfloat32_t v466; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v454), "w"(v456)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v454), "w"(v456)); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v457), "w"(v459)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v459), "w"(v457)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v458), "w"(v460)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v460), "w"(v458)); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v426, v430); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v427, v431); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v453, v455); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v453, v455); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v454, v456); + svfloat32_t v468 = 
svadd_f32_x(svptrue_b32(), v457, v459); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v459, v457); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v458, v460); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v460, v458); svst1w_u64(pred_full, (unsigned *)(v849), svreinterpret_u64_s16(v502)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v190), "w"(v194)); - svfloat32_t zero418; - asm volatile("mov %0.s, #0" : "=w"(zero418)); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v190, v194); + svfloat32_t zero418 = svdup_n_f32(0); svfloat32_t v418 = svcmla_f32_x(pred_full, zero418, v840, v194, 90); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v432), "w"(v435)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v433), "w"(v436)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v435), "w"(v432)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v436), "w"(v433)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v466), "w"(v470)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v465), "w"(v471)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v464), "w"(v468)); - svfloat32_t v487; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v471), "w"(v465)); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v464), "w"(v468)); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v469), "w"(v467)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v470), "w"(v466)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v467), "w"(v469)); - svfloat32_t v445; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v437), "w"(v438)); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v439), "w"(v440)); - svfloat32_t v447; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v441), "w"(v442)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v448) : "w"(v443), "w"(v444)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v438), "w"(v437)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v440), "w"(v439)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v442), "w"(v441)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v444), "w"(v443)); - svfloat32_t v472; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v411), "w"(v418)); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v432, v435); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v433, v436); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v435, v432); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v436, v433); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v466, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v465, v471); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v464, v468); + svfloat32_t v487 = svsub_f32_x(svptrue_b32(), v471, v465); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v464, v468); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v469, v467); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v470, v466); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v467, v469); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v437, v438); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v439, v440); + svfloat32_t v447 = svadd_f32_x(svptrue_b32(), v441, v442); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v443, v444); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v438, v437); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v440, v439); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v442, v441); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v444, v443); + svfloat32_t v472 = svsub_f32_x(svptrue_b32(), v411, v418); svfloat32_t v461 = svcmla_f32_x(pred_full, v418, v841, v195, 90); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v472), "w"(v472)); - svfloat32_t v499; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v472)); 
+ svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v472, v472); + svfloat32_t v499 = svsub_f32_x(svptrue_b32(), v498, v472); svfloat32_t v462 = svcmla_f32_x(pred_full, v461, v833, v186, 90); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v390), "w"(v474)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v461), "w"(v461)); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v474)); - svfloat32_t v544; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v544) : "w"(v452), "w"(v499)); - svfloat32_t v553; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v452), "w"(v499)); + svfloat32_t v475 = svsub_f32_x(svptrue_b32(), v390, v474); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v461, v461); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v495, v474); + svfloat32_t v544 = svadd_f32_x(svptrue_b32(), v452, v499); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v452, v499); svfloat32_t v463 = svcmla_f32_x(pred_full, v462, v834, v33, 90); svfloat32_t v473 = svcmla_f32_x(pred_full, v462, v835, v97, 90); svfloat32_t v476 = svcmla_f32_x(pred_full, v475, v837, v81, 90); svfloat32_t v477 = svcmla_f32_x(pred_full, v475, v838, v145, 90); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v478)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v472), "w"(v478)); - svfloat32_t v486; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v485), "w"(v478)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v496), "w"(v478)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v478); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v472, v478); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v485, v478); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v478); svint16_t v547 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v544, (float)(1ULL << 31ULL)))), @@ -9179,34 +5815,21 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const 
armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v553, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v481), "w"(v473)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v483), "w"(v476)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v480)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v463)); - svfloat32_t v493; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v492), "w"(v477)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v447), "w"(v486)); - svfloat32_t v535; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v447), "w"(v486)); - svfloat32_t v634; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v451), "w"(v497)); - svfloat32_t v643; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v643) : "w"(v451), "w"(v497)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v473); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v483, v476); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v487, v480); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v463); + svfloat32_t v493 = svsub_f32_x(svptrue_b32(), v492, v477); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v447, v486); + svfloat32_t v535 = svsub_f32_x(svptrue_b32(), v447, v486); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v451, v497); + svfloat32_t v643 = svsub_f32_x(svptrue_b32(), v451, v497); svst1w_u64(pred_full, (unsigned *)(v894), svreinterpret_u64_s16(v547)); svst1w_u64(pred_full, (unsigned *)(v903), svreinterpret_u64_s16(v556)); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v490), "w"(v472)); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v493), "w"(v479)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v445), "w"(v482)); - svfloat32_t v517; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v517) : 
"w"(v445), "w"(v482)); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v490, v472); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v493, v479); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v445, v482); + svfloat32_t v517 = svsub_f32_x(svptrue_b32(), v445, v482); svint16_t v529 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v526, (float)(1ULL << 31ULL)))), @@ -9217,14 +5840,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v535, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v580; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v448), "w"(v488)); - svfloat32_t v589; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v589) : "w"(v448), "w"(v488)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v446), "w"(v484)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v446), "w"(v484)); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v448, v488); + svfloat32_t v589 = svsub_f32_x(svptrue_b32(), v448, v488); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v446, v484); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v446, v484); svint16_t v637 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v634, (float)(1ULL << 31ULL)))), @@ -9245,10 +5864,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v517, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v449), "w"(v491)); - svfloat32_t v571; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v571) : "w"(v449), "w"(v491)); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v449, v491); + svfloat32_t v571 = svsub_f32_x(svptrue_b32(), v449, v491); svint16_t v583 = svtbl_s16( 
svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v580, (float)(1ULL << 31ULL)))), @@ -9269,10 +5886,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu17(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v607, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v450), "w"(v494)); - svfloat32_t v625; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v450), "w"(v494)); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v450, v494); + svfloat32_t v625 = svsub_f32_x(svptrue_b32(), v450, v494); svst1w_u64(pred_full, (unsigned *)(v876), svreinterpret_u64_s16(v529)); svst1w_u64(pred_full, (unsigned *)(v885), svreinterpret_u64_s16(v538)); svst1w_u64(pred_full, (unsigned *)(v984), svreinterpret_u64_s16(v637)); @@ -9321,514 +5936,244 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v539 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v306 = -5.0000000000000000e-01F; - float v319 = -1.4999999999999998e+00F; - float v323 = 8.6602540378443871e-01F; - float v324 = -8.6602540378443871e-01F; - float v332 = 7.6604444311897801e-01F; - float v337 = 9.3969262078590832e-01F; - float v342 = -1.7364817766693039e-01F; - float v346 = 6.4278760968653925e-01F; - float v347 = -6.4278760968653925e-01F; - float v354 = -3.4202014332566888e-01F; - float v355 = 3.4202014332566888e-01F; - float v362 = 9.8480775301220802e-01F; - float v363 = -9.8480775301220802e-01F; - float32x2_t v365 = (float32x2_t){v4, v4}; - const float32x2_t *v1081 = &v5[istride]; - int32_t *v1172 = &v6[ostride]; - float32x2_t v307 = (float32x2_t){v306, v306}; - float32x2_t v320 = (float32x2_t){v319, v319}; - float32x2_t v325 = (float32x2_t){v323, v324}; - float32x2_t v333 = 
(float32x2_t){v332, v332}; - float32x2_t v338 = (float32x2_t){v337, v337}; - float32x2_t v343 = (float32x2_t){v342, v342}; - float32x2_t v348 = (float32x2_t){v346, v347}; - float32x2_t v356 = (float32x2_t){v354, v355}; - float32x2_t v364 = (float32x2_t){v362, v363}; - const float32x2_t *v982 = &v5[0]; - int32_t *v1145 = &v6[0]; - float32x4_t v1324 = vld1q_f32((const float32_t *)v1081); - float32x4_t v308 = vcombine_f32(v307, v307); - float32x4_t v321 = vcombine_f32(v320, v320); - float32x2_t v327 = vmul_f32(v365, v325); - float32x4_t v334 = vcombine_f32(v333, v333); - float32x4_t v339 = vcombine_f32(v338, v338); - float32x4_t v344 = vcombine_f32(v343, v343); - float32x2_t v350 = vmul_f32(v365, v348); - float32x2_t v358 = vmul_f32(v365, v356); - float32x2_t v366 = vmul_f32(v365, v364); - const float32x2_t *v991 = &v5[istride * 9]; - const float32x2_t *v1000 = &v5[istride * 2]; - const float32x2_t *v1009 = &v5[istride * 11]; - const float32x2_t *v1018 = &v5[istride * 4]; - const float32x2_t *v1027 = &v5[istride * 13]; - const float32x2_t *v1036 = &v5[istride * 6]; - const float32x2_t *v1045 = &v5[istride * 15]; - const float32x2_t *v1054 = &v5[istride * 8]; - const float32x2_t *v1063 = &v5[istride * 17]; - const float32x2_t *v1072 = &v5[istride * 10]; - const float32x2_t *v1090 = &v5[istride * 12]; - const float32x2_t *v1099 = &v5[istride * 3]; - const float32x2_t *v1108 = &v5[istride * 14]; - const float32x2_t *v1117 = &v5[istride * 5]; - const float32x2_t *v1126 = &v5[istride * 16]; - const float32x2_t *v1135 = &v5[istride * 7]; - int32_t *v1154 = &v6[ostride * 9]; - int32_t *v1163 = &v6[ostride * 10]; - int32_t *v1181 = &v6[ostride * 2]; - int32_t *v1190 = &v6[ostride * 11]; - int32_t *v1199 = &v6[ostride * 12]; - int32_t *v1208 = &v6[ostride * 3]; - int32_t *v1217 = &v6[ostride * 4]; - int32_t *v1226 = &v6[ostride * 13]; - int32_t *v1235 = &v6[ostride * 14]; - int32_t *v1244 = &v6[ostride * 5]; - int32_t *v1253 = &v6[ostride * 6]; - int32_t *v1262 = &v6[ostride * 
15]; - int32_t *v1271 = &v6[ostride * 16]; - int32_t *v1280 = &v6[ostride * 7]; - int32_t *v1289 = &v6[ostride * 8]; - int32_t *v1298 = &v6[ostride * 17]; - float32x4_t v1302 = vld1q_f32((const float32_t *)v982); - float32x4_t v329 = vcombine_f32(v327, v327); - float32x4_t v352 = vcombine_f32(v350, v350); - float32x4_t v360 = vcombine_f32(v358, v358); - float32x4_t v368 = vcombine_f32(v366, v366); - float32x4_t v1304 = vld1q_f32((const float32_t *)v991); - float32x4_t v1306 = vld1q_f32((const float32_t *)v1000); - float32x4_t v1308 = vld1q_f32((const float32_t *)v1009); - float32x4_t v1310 = vld1q_f32((const float32_t *)v1018); - float32x4_t v1312 = vld1q_f32((const float32_t *)v1027); - float32x4_t v1314 = vld1q_f32((const float32_t *)v1036); - float32x4_t v1316 = vld1q_f32((const float32_t *)v1045); - float32x4_t v1318 = vld1q_f32((const float32_t *)v1054); - float32x4_t v1320 = vld1q_f32((const float32_t *)v1063); - float32x4_t v1322 = vld1q_f32((const float32_t *)v1072); - float32x4_t v1326 = vld1q_f32((const float32_t *)v1090); - float32x4_t v1328 = vld1q_f32((const float32_t *)v1099); - float32x4_t v1330 = vld1q_f32((const float32_t *)v1108); - float32x4_t v1332 = vld1q_f32((const float32_t *)v1117); - float32x4_t v1334 = vld1q_f32((const float32_t *)v1126); - float32x4_t v1336 = vld1q_f32((const float32_t *)v1135); - float32x4_t v35 = vaddq_f32(v1302, v1304); - float32x4_t v36 = vsubq_f32(v1302, v1304); - float32x4_t v51 = vaddq_f32(v1306, v1308); - float32x4_t v52 = vsubq_f32(v1306, v1308); - float32x4_t v67 = vaddq_f32(v1310, v1312); - float32x4_t v68 = vsubq_f32(v1310, v1312); - float32x4_t v83 = vaddq_f32(v1314, v1316); - float32x4_t v84 = vsubq_f32(v1314, v1316); - float32x4_t v99 = vaddq_f32(v1318, v1320); - float32x4_t v100 = vsubq_f32(v1318, v1320); - float32x4_t v115 = vaddq_f32(v1322, v1324); - float32x4_t v116 = vsubq_f32(v1322, v1324); - float32x4_t v131 = vaddq_f32(v1326, v1328); - float32x4_t v132 = vsubq_f32(v1326, v1328); - float32x4_t v147 = 
vaddq_f32(v1330, v1332); - float32x4_t v148 = vsubq_f32(v1330, v1332); - float32x4_t v163 = vaddq_f32(v1334, v1336); - float32x4_t v164 = vsubq_f32(v1334, v1336); - float32x4_t v165 = vaddq_f32(v51, v163); - float32x4_t v166 = vsubq_f32(v51, v163); - float32x4_t v167 = vaddq_f32(v147, v67); - float32x4_t v168 = vsubq_f32(v147, v67); - float32x4_t v169 = vaddq_f32(v83, v131); - float32x4_t v170 = vsubq_f32(v83, v131); - float32x4_t v171 = vaddq_f32(v99, v115); - float32x4_t v172 = vsubq_f32(v99, v115); - float32x4_t v280 = vaddq_f32(v52, v164); - float32x4_t v281 = vsubq_f32(v52, v164); - float32x4_t v282 = vaddq_f32(v148, v68); - float32x4_t v283 = vsubq_f32(v148, v68); - float32x4_t v284 = vaddq_f32(v84, v132); - float32x4_t v285 = vsubq_f32(v84, v132); - float32x4_t v286 = vaddq_f32(v100, v116); - float32x4_t v287 = vsubq_f32(v100, v116); - float32x4_t v173 = vaddq_f32(v165, v167); - float32x4_t v177 = vaddq_f32(v166, v168); - float32x4_t v179 = vsubq_f32(v165, v167); - float32x4_t v180 = vsubq_f32(v167, v171); - float32x4_t v181 = vsubq_f32(v171, v165); - float32x4_t v182 = vsubq_f32(v166, v168); - float32x4_t v183 = vsubq_f32(v168, v172); - float32x4_t v184 = vsubq_f32(v172, v166); - float32x4_t v207 = vmulq_f32(v169, v321); - float32x4_t v213 = vrev64q_f32(v170); - float32x4_t v288 = vaddq_f32(v280, v282); - float32x4_t v292 = vaddq_f32(v281, v283); - float32x4_t v294 = vsubq_f32(v280, v282); - float32x4_t v295 = vsubq_f32(v282, v286); - float32x4_t v296 = vsubq_f32(v286, v280); - float32x4_t v297 = vsubq_f32(v281, v283); - float32x4_t v298 = vsubq_f32(v283, v287); - float32x4_t v299 = vsubq_f32(v287, v281); - float32x4_t v322 = vmulq_f32(v284, v321); - float32x4_t v328 = vrev64q_f32(v285); - float32x4_t v174 = vaddq_f32(v173, v171); - float32x4_t v178 = vaddq_f32(v177, v172); - float32x4_t v215 = vmulq_f32(v213, v329); - float32x4_t v220 = vmulq_f32(v179, v334); - float32x4_t v225 = vmulq_f32(v180, v339); - float32x4_t v230 = vmulq_f32(v181, v344); - 
float32x4_t v236 = vrev64q_f32(v182); - float32x4_t v244 = vrev64q_f32(v183); - float32x4_t v252 = vrev64q_f32(v184); - float32x4_t v289 = vaddq_f32(v288, v286); - float32x4_t v293 = vaddq_f32(v292, v287); - float32x4_t v330 = vmulq_f32(v328, v329); - float32x4_t v335 = vmulq_f32(v294, v334); - float32x4_t v340 = vmulq_f32(v295, v339); - float32x4_t v345 = vmulq_f32(v296, v344); - float32x4_t v351 = vrev64q_f32(v297); - float32x4_t v359 = vrev64q_f32(v298); - float32x4_t v367 = vrev64q_f32(v299); - float32x4_t v175 = vaddq_f32(v174, v169); - float32x4_t v194 = vmulq_f32(v174, v308); - float32x4_t v200 = vrev64q_f32(v178); - float32x4_t v238 = vmulq_f32(v236, v352); - float32x4_t v246 = vmulq_f32(v244, v360); - float32x4_t v254 = vmulq_f32(v252, v368); - float32x4_t v290 = vaddq_f32(v289, v284); - float32x4_t v309 = vmulq_f32(v289, v308); - float32x4_t v315 = vrev64q_f32(v293); - float32x4_t v353 = vmulq_f32(v351, v352); - float32x4_t v361 = vmulq_f32(v359, v360); - float32x4_t v369 = vmulq_f32(v367, v368); - float32x4_t v176 = vaddq_f32(v175, v35); - float32x4_t v202 = vmulq_f32(v200, v329); - float32x4_t v255 = vaddq_f32(v194, v194); - float32x4_t v268 = vaddq_f32(v215, v238); - float32x4_t v270 = vsubq_f32(v215, v246); - float32x4_t v272 = vsubq_f32(v215, v238); - float32x4_t v291 = vaddq_f32(v290, v36); - float32x4_t v317 = vmulq_f32(v315, v329); - float32x4_t v370 = vaddq_f32(v309, v309); - float32x4_t v383 = vaddq_f32(v330, v353); - float32x4_t v385 = vsubq_f32(v330, v361); - float32x4_t v387 = vsubq_f32(v330, v353); - float32x4_t v256 = vaddq_f32(v255, v194); - float32x4_t v260 = vaddq_f32(v176, v207); - float32x4_t v269 = vaddq_f32(v268, v246); - float32x4_t v271 = vaddq_f32(v270, v254); - float32x4_t v273 = vsubq_f32(v272, v254); - float32x4_t v371 = vaddq_f32(v370, v309); - float32x4_t v375 = vaddq_f32(v291, v322); - float32x4_t v384 = vaddq_f32(v383, v361); - float32x4_t v386 = vaddq_f32(v385, v369); - float32x4_t v388 = vsubq_f32(v387, v369); - int16x4_t 
v397 = vqmovn_s32(vcvtq_n_s32_f32(v176, 15)); - int16x4_t v405 = vqmovn_s32(vcvtq_n_s32_f32(v291, 15)); - float32x4_t v257 = vaddq_f32(v176, v256); - float32x4_t v261 = vaddq_f32(v260, v255); - float32x4_t v372 = vaddq_f32(v291, v371); - float32x4_t v376 = vaddq_f32(v375, v370); - vst1_s16((int16_t *)v1145, v397); - vst1_s16((int16_t *)v1154, v405); - float32x4_t v258 = vaddq_f32(v257, v202); - float32x4_t v259 = vsubq_f32(v257, v202); - float32x4_t v262 = vaddq_f32(v261, v220); - float32x4_t v264 = vsubq_f32(v261, v225); - float32x4_t v266 = vsubq_f32(v261, v220); - float32x4_t v373 = vaddq_f32(v372, v317); - float32x4_t v374 = vsubq_f32(v372, v317); - float32x4_t v377 = vaddq_f32(v376, v335); - float32x4_t v379 = vsubq_f32(v376, v340); - float32x4_t v381 = vsubq_f32(v376, v335); - float32x4_t v263 = vaddq_f32(v262, v225); - float32x4_t v265 = vaddq_f32(v264, v230); - float32x4_t v267 = vsubq_f32(v266, v230); - float32x4_t v378 = vaddq_f32(v377, v340); - float32x4_t v380 = vaddq_f32(v379, v345); - float32x4_t v382 = vsubq_f32(v381, v345); - int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v259, 15)); - int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v374, 15)); - int16x4_t v493 = vqmovn_s32(vcvtq_n_s32_f32(v258, 15)); - int16x4_t v501 = vqmovn_s32(vcvtq_n_s32_f32(v373, 15)); - float32x4_t v274 = vaddq_f32(v263, v269); - float32x4_t v275 = vsubq_f32(v263, v269); - float32x4_t v276 = vaddq_f32(v265, v271); - float32x4_t v277 = vsubq_f32(v265, v271); - float32x4_t v278 = vaddq_f32(v267, v273); - float32x4_t v279 = vsubq_f32(v267, v273); - float32x4_t v389 = vaddq_f32(v378, v384); - float32x4_t v390 = vsubq_f32(v378, v384); - float32x4_t v391 = vaddq_f32(v380, v386); - float32x4_t v392 = vsubq_f32(v380, v386); - float32x4_t v393 = vaddq_f32(v382, v388); - float32x4_t v394 = vsubq_f32(v382, v388); - vst1_s16((int16_t *)v1199, v445); - vst1_s16((int16_t *)v1208, v453); - vst1_s16((int16_t *)v1253, v493); - vst1_s16((int16_t *)v1262, v501); - int16x4_t v413 = 
vqmovn_s32(vcvtq_n_s32_f32(v275, 15)); - int16x4_t v421 = vqmovn_s32(vcvtq_n_s32_f32(v390, 15)); - int16x4_t v429 = vqmovn_s32(vcvtq_n_s32_f32(v276, 15)); - int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v391, 15)); - int16x4_t v461 = vqmovn_s32(vcvtq_n_s32_f32(v279, 15)); - int16x4_t v469 = vqmovn_s32(vcvtq_n_s32_f32(v394, 15)); - int16x4_t v477 = vqmovn_s32(vcvtq_n_s32_f32(v278, 15)); - int16x4_t v485 = vqmovn_s32(vcvtq_n_s32_f32(v393, 15)); - int16x4_t v509 = vqmovn_s32(vcvtq_n_s32_f32(v277, 15)); - int16x4_t v517 = vqmovn_s32(vcvtq_n_s32_f32(v392, 15)); - int16x4_t v525 = vqmovn_s32(vcvtq_n_s32_f32(v274, 15)); - int16x4_t v533 = vqmovn_s32(vcvtq_n_s32_f32(v389, 15)); - vst1_s16((int16_t *)v1163, v413); - vst1_s16((int16_t *)v1172, v421); - vst1_s16((int16_t *)v1181, v429); - vst1_s16((int16_t *)v1190, v437); - vst1_s16((int16_t *)v1217, v461); - vst1_s16((int16_t *)v1226, v469); - vst1_s16((int16_t *)v1235, v477); - vst1_s16((int16_t *)v1244, v485); - vst1_s16((int16_t *)v1271, v509); - vst1_s16((int16_t *)v1280, v517); - vst1_s16((int16_t *)v1289, v525); - vst1_s16((int16_t *)v1298, v533); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v539 * 2; j < howmany; j += 1) { - float32x2_t v616 = v5[istride]; - float v784 = -5.0000000000000000e-01F; - float v795 = -1.4999999999999998e+00F; - float v798 = 8.6602540378443871e-01F; - float v799 = -8.6602540378443871e-01F; - float v806 = 7.6604444311897801e-01F; - float v810 = 9.3969262078590832e-01F; - float v814 = -1.7364817766693039e-01F; - float v817 = 6.4278760968653925e-01F; - float v818 = -6.4278760968653925e-01F; - float v824 = -3.4202014332566888e-01F; - float v825 = 3.4202014332566888e-01F; - float v831 = 9.8480775301220802e-01F; - float v832 = -9.8480775301220802e-01F; - float32x2_t v834 = (float32x2_t){v4, v4}; - float32x2_t v551 = v5[0]; - float32x2_t v785 = (float32x2_t){v784, v784}; - float32x2_t v796 = (float32x2_t){v795, v795}; - float32x2_t v800 = (float32x2_t){v798, v799}; - float32x2_t v807 = 
(float32x2_t){v806, v806}; - float32x2_t v811 = (float32x2_t){v810, v810}; - float32x2_t v815 = (float32x2_t){v814, v814}; - float32x2_t v819 = (float32x2_t){v817, v818}; - float32x2_t v826 = (float32x2_t){v824, v825}; - float32x2_t v833 = (float32x2_t){v831, v832}; - float32x2_t v556 = v5[istride * 9]; - float32x2_t v563 = v5[istride * 2]; - float32x2_t v568 = v5[istride * 11]; - float32x2_t v575 = v5[istride * 4]; - float32x2_t v580 = v5[istride * 13]; - float32x2_t v587 = v5[istride * 6]; - float32x2_t v592 = v5[istride * 15]; - float32x2_t v599 = v5[istride * 8]; - float32x2_t v604 = v5[istride * 17]; - float32x2_t v611 = v5[istride * 10]; - float32x2_t v623 = v5[istride * 12]; - float32x2_t v628 = v5[istride * 3]; - float32x2_t v635 = v5[istride * 14]; - float32x2_t v640 = v5[istride * 5]; - float32x2_t v647 = v5[istride * 16]; - float32x2_t v652 = v5[istride * 7]; - float32x2_t v802 = vmul_f32(v834, v800); - float32x2_t v821 = vmul_f32(v834, v819); - float32x2_t v828 = vmul_f32(v834, v826); - float32x2_t v835 = vmul_f32(v834, v833); - float32x2_t v557 = vadd_f32(v551, v556); - float32x2_t v558 = vsub_f32(v551, v556); - float32x2_t v569 = vadd_f32(v563, v568); - float32x2_t v570 = vsub_f32(v563, v568); - float32x2_t v581 = vadd_f32(v575, v580); - float32x2_t v582 = vsub_f32(v575, v580); - float32x2_t v593 = vadd_f32(v587, v592); - float32x2_t v594 = vsub_f32(v587, v592); - float32x2_t v605 = vadd_f32(v599, v604); - float32x2_t v606 = vsub_f32(v599, v604); - float32x2_t v617 = vadd_f32(v611, v616); - float32x2_t v618 = vsub_f32(v611, v616); - float32x2_t v629 = vadd_f32(v623, v628); - float32x2_t v630 = vsub_f32(v623, v628); - float32x2_t v641 = vadd_f32(v635, v640); - float32x2_t v642 = vsub_f32(v635, v640); - float32x2_t v653 = vadd_f32(v647, v652); - float32x2_t v654 = vsub_f32(v647, v652); - float32x2_t v655 = vadd_f32(v569, v653); - float32x2_t v656 = vsub_f32(v569, v653); - float32x2_t v657 = vadd_f32(v641, v581); - float32x2_t v658 = vsub_f32(v641, 
v581); - float32x2_t v659 = vadd_f32(v593, v629); - float32x2_t v660 = vsub_f32(v593, v629); - float32x2_t v661 = vadd_f32(v605, v617); - float32x2_t v662 = vsub_f32(v605, v617); - float32x2_t v759 = vadd_f32(v570, v654); - float32x2_t v760 = vsub_f32(v570, v654); - float32x2_t v761 = vadd_f32(v642, v582); - float32x2_t v762 = vsub_f32(v642, v582); - float32x2_t v763 = vadd_f32(v594, v630); - float32x2_t v764 = vsub_f32(v594, v630); - float32x2_t v765 = vadd_f32(v606, v618); - float32x2_t v766 = vsub_f32(v606, v618); - float32x2_t v663 = vadd_f32(v655, v657); - float32x2_t v667 = vadd_f32(v656, v658); - float32x2_t v669 = vsub_f32(v655, v657); - float32x2_t v670 = vsub_f32(v657, v661); - float32x2_t v671 = vsub_f32(v661, v655); - float32x2_t v672 = vsub_f32(v656, v658); - float32x2_t v673 = vsub_f32(v658, v662); - float32x2_t v674 = vsub_f32(v662, v656); - float32x2_t v693 = vmul_f32(v659, v796); - float32x2_t v699 = vrev64_f32(v660); - float32x2_t v767 = vadd_f32(v759, v761); - float32x2_t v771 = vadd_f32(v760, v762); - float32x2_t v773 = vsub_f32(v759, v761); - float32x2_t v774 = vsub_f32(v761, v765); - float32x2_t v775 = vsub_f32(v765, v759); - float32x2_t v776 = vsub_f32(v760, v762); - float32x2_t v777 = vsub_f32(v762, v766); - float32x2_t v778 = vsub_f32(v766, v760); - float32x2_t v797 = vmul_f32(v763, v796); - float32x2_t v803 = vrev64_f32(v764); - float32x2_t v664 = vadd_f32(v663, v661); - float32x2_t v668 = vadd_f32(v667, v662); - float32x2_t v700 = vmul_f32(v699, v802); - float32x2_t v704 = vmul_f32(v669, v807); - float32x2_t v708 = vmul_f32(v670, v811); - float32x2_t v712 = vmul_f32(v671, v815); - float32x2_t v718 = vrev64_f32(v672); - float32x2_t v725 = vrev64_f32(v673); - float32x2_t v732 = vrev64_f32(v674); - float32x2_t v768 = vadd_f32(v767, v765); - float32x2_t v772 = vadd_f32(v771, v766); - float32x2_t v804 = vmul_f32(v803, v802); - float32x2_t v808 = vmul_f32(v773, v807); - float32x2_t v812 = vmul_f32(v774, v811); - float32x2_t v816 = 
vmul_f32(v775, v815); - float32x2_t v822 = vrev64_f32(v776); - float32x2_t v829 = vrev64_f32(v777); - float32x2_t v836 = vrev64_f32(v778); - float32x2_t v665 = vadd_f32(v664, v659); - float32x2_t v682 = vmul_f32(v664, v785); - float32x2_t v688 = vrev64_f32(v668); - float32x2_t v719 = vmul_f32(v718, v821); - float32x2_t v726 = vmul_f32(v725, v828); - float32x2_t v733 = vmul_f32(v732, v835); - float32x2_t v769 = vadd_f32(v768, v763); - float32x2_t v786 = vmul_f32(v768, v785); - float32x2_t v792 = vrev64_f32(v772); - float32x2_t v823 = vmul_f32(v822, v821); - float32x2_t v830 = vmul_f32(v829, v828); - float32x2_t v837 = vmul_f32(v836, v835); - float32x2_t v666 = vadd_f32(v665, v557); - float32x2_t v689 = vmul_f32(v688, v802); - float32x2_t v734 = vadd_f32(v682, v682); - float32x2_t v747 = vadd_f32(v700, v719); - float32x2_t v749 = vsub_f32(v700, v726); - float32x2_t v751 = vsub_f32(v700, v719); - float32x2_t v770 = vadd_f32(v769, v558); - float32x2_t v793 = vmul_f32(v792, v802); - float32x2_t v838 = vadd_f32(v786, v786); - float32x2_t v851 = vadd_f32(v804, v823); - float32x2_t v853 = vsub_f32(v804, v830); - float32x2_t v855 = vsub_f32(v804, v823); - float32x2_t v735 = vadd_f32(v734, v682); - float32x2_t v739 = vadd_f32(v666, v693); - float32x2_t v748 = vadd_f32(v747, v726); - float32x2_t v750 = vadd_f32(v749, v733); - float32x2_t v752 = vsub_f32(v751, v733); - float32x2_t v839 = vadd_f32(v838, v786); - float32x2_t v843 = vadd_f32(v770, v797); - float32x2_t v852 = vadd_f32(v851, v830); - float32x2_t v854 = vadd_f32(v853, v837); - float32x2_t v856 = vsub_f32(v855, v837); - int16x4_t v865 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v666, 15), (int32x2_t){0, 0})); - int16x4_t v871 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v770, 15), (int32x2_t){0, 0})); - float32x2_t v736 = vadd_f32(v666, v735); - float32x2_t v740 = vadd_f32(v739, v734); - float32x2_t v840 = vadd_f32(v770, v839); - float32x2_t v844 = vadd_f32(v843, v838); - v6[0] = 
vget_lane_s32(vreinterpret_s32_s16(v865), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v871), 0); - float32x2_t v737 = vadd_f32(v736, v689); - float32x2_t v738 = vsub_f32(v736, v689); - float32x2_t v741 = vadd_f32(v740, v704); - float32x2_t v743 = vsub_f32(v740, v708); - float32x2_t v745 = vsub_f32(v740, v704); - float32x2_t v841 = vadd_f32(v840, v793); - float32x2_t v842 = vsub_f32(v840, v793); - float32x2_t v845 = vadd_f32(v844, v808); - float32x2_t v847 = vsub_f32(v844, v812); - float32x2_t v849 = vsub_f32(v844, v808); - float32x2_t v742 = vadd_f32(v741, v708); - float32x2_t v744 = vadd_f32(v743, v712); - float32x2_t v746 = vsub_f32(v745, v712); - float32x2_t v846 = vadd_f32(v845, v812); - float32x2_t v848 = vadd_f32(v847, v816); - float32x2_t v850 = vsub_f32(v849, v816); - int16x4_t v901 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v738, 15), (int32x2_t){0, 0})); - int16x4_t v907 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v842, 15), (int32x2_t){0, 0})); - int16x4_t v937 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v737, 15), (int32x2_t){0, 0})); - int16x4_t v943 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v841, 15), (int32x2_t){0, 0})); - float32x2_t v753 = vadd_f32(v742, v748); - float32x2_t v754 = vsub_f32(v742, v748); - float32x2_t v755 = vadd_f32(v744, v750); - float32x2_t v756 = vsub_f32(v744, v750); - float32x2_t v757 = vadd_f32(v746, v752); - float32x2_t v758 = vsub_f32(v746, v752); - float32x2_t v857 = vadd_f32(v846, v852); - float32x2_t v858 = vsub_f32(v846, v852); - float32x2_t v859 = vadd_f32(v848, v854); - float32x2_t v860 = vsub_f32(v848, v854); - float32x2_t v861 = vadd_f32(v850, v856); - float32x2_t v862 = vsub_f32(v850, v856); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v901), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v907), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v937), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v943), 0); - int16x4_t v877 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v754, 15), (int32x2_t){0, 0})); - int16x4_t v883 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v858, 15), (int32x2_t){0, 0})); - int16x4_t v889 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v755, 15), (int32x2_t){0, 0})); - int16x4_t v895 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v859, 15), (int32x2_t){0, 0})); - int16x4_t v913 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v758, 15), (int32x2_t){0, 0})); - int16x4_t v919 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v862, 15), (int32x2_t){0, 0})); - int16x4_t v925 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v757, 15), (int32x2_t){0, 0})); - int16x4_t v931 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v861, 15), (int32x2_t){0, 0})); - int16x4_t v949 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v756, 15), (int32x2_t){0, 0})); - int16x4_t v955 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v860, 15), (int32x2_t){0, 0})); - int16x4_t v961 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v753, 15), (int32x2_t){0, 0})); - int16x4_t v967 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v857, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v877), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v883), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v889), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v895), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v913), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v919), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v925), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v931), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v949), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v955), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v961), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v967), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v85 = v5[istride]; + float v253 = -5.0000000000000000e-01F; + float 
v264 = -1.4999999999999998e+00F; + float v267 = 8.6602540378443871e-01F; + float v268 = -8.6602540378443871e-01F; + float v275 = 7.6604444311897801e-01F; + float v279 = 9.3969262078590832e-01F; + float v283 = -1.7364817766693039e-01F; + float v286 = 6.4278760968653925e-01F; + float v287 = -6.4278760968653925e-01F; + float v293 = -3.4202014332566888e-01F; + float v294 = 3.4202014332566888e-01F; + float v300 = 9.8480775301220802e-01F; + float v301 = -9.8480775301220802e-01F; + float32x2_t v303 = (float32x2_t){v4, v4}; + float32x2_t v20 = v5[0]; + float32x2_t v254 = (float32x2_t){v253, v253}; + float32x2_t v265 = (float32x2_t){v264, v264}; + float32x2_t v269 = (float32x2_t){v267, v268}; + float32x2_t v276 = (float32x2_t){v275, v275}; + float32x2_t v280 = (float32x2_t){v279, v279}; + float32x2_t v284 = (float32x2_t){v283, v283}; + float32x2_t v288 = (float32x2_t){v286, v287}; + float32x2_t v295 = (float32x2_t){v293, v294}; + float32x2_t v302 = (float32x2_t){v300, v301}; + float32x2_t v25 = v5[istride * 9]; + float32x2_t v32 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 11]; + float32x2_t v44 = v5[istride * 4]; + float32x2_t v49 = v5[istride * 13]; + float32x2_t v56 = v5[istride * 6]; + float32x2_t v61 = v5[istride * 15]; + float32x2_t v68 = v5[istride * 8]; + float32x2_t v73 = v5[istride * 17]; + float32x2_t v80 = v5[istride * 10]; + float32x2_t v92 = v5[istride * 12]; + float32x2_t v97 = v5[istride * 3]; + float32x2_t v104 = v5[istride * 14]; + float32x2_t v109 = v5[istride * 5]; + float32x2_t v116 = v5[istride * 16]; + float32x2_t v121 = v5[istride * 7]; + float32x2_t v271 = vmul_f32(v303, v269); + float32x2_t v290 = vmul_f32(v303, v288); + float32x2_t v297 = vmul_f32(v303, v295); + float32x2_t v304 = vmul_f32(v303, v302); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + 
float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v68, v73); + float32x2_t v86 = vadd_f32(v80, v85); + float32x2_t v87 = vsub_f32(v80, v85); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v110 = vadd_f32(v104, v109); + float32x2_t v111 = vsub_f32(v104, v109); + float32x2_t v122 = vadd_f32(v116, v121); + float32x2_t v123 = vsub_f32(v116, v121); + float32x2_t v124 = vadd_f32(v38, v122); + float32x2_t v125 = vsub_f32(v38, v122); + float32x2_t v126 = vadd_f32(v110, v50); + float32x2_t v127 = vsub_f32(v110, v50); + float32x2_t v128 = vadd_f32(v62, v98); + float32x2_t v129 = vsub_f32(v62, v98); + float32x2_t v130 = vadd_f32(v74, v86); + float32x2_t v131 = vsub_f32(v74, v86); + float32x2_t v228 = vadd_f32(v39, v123); + float32x2_t v229 = vsub_f32(v39, v123); + float32x2_t v230 = vadd_f32(v111, v51); + float32x2_t v231 = vsub_f32(v111, v51); + float32x2_t v232 = vadd_f32(v63, v99); + float32x2_t v233 = vsub_f32(v63, v99); + float32x2_t v234 = vadd_f32(v75, v87); + float32x2_t v235 = vsub_f32(v75, v87); + float32x2_t v132 = vadd_f32(v124, v126); + float32x2_t v136 = vadd_f32(v125, v127); + float32x2_t v138 = vsub_f32(v124, v126); + float32x2_t v139 = vsub_f32(v126, v130); + float32x2_t v140 = vsub_f32(v130, v124); + float32x2_t v141 = vsub_f32(v125, v127); + float32x2_t v142 = vsub_f32(v127, v131); + float32x2_t v143 = vsub_f32(v131, v125); + float32x2_t v162 = vmul_f32(v128, v265); + float32x2_t v168 = vrev64_f32(v129); + float32x2_t v236 = vadd_f32(v228, v230); + float32x2_t v240 = vadd_f32(v229, v231); + float32x2_t v242 = vsub_f32(v228, v230); + float32x2_t v243 = vsub_f32(v230, v234); + float32x2_t v244 = vsub_f32(v234, v228); + float32x2_t v245 = vsub_f32(v229, v231); + float32x2_t v246 = vsub_f32(v231, v235); + float32x2_t v247 = vsub_f32(v235, v229); + float32x2_t v266 = vmul_f32(v232, v265); + float32x2_t v272 = 
vrev64_f32(v233); + float32x2_t v133 = vadd_f32(v132, v130); + float32x2_t v137 = vadd_f32(v136, v131); + float32x2_t v169 = vmul_f32(v168, v271); + float32x2_t v173 = vmul_f32(v138, v276); + float32x2_t v177 = vmul_f32(v139, v280); + float32x2_t v181 = vmul_f32(v140, v284); + float32x2_t v187 = vrev64_f32(v141); + float32x2_t v194 = vrev64_f32(v142); + float32x2_t v201 = vrev64_f32(v143); + float32x2_t v237 = vadd_f32(v236, v234); + float32x2_t v241 = vadd_f32(v240, v235); + float32x2_t v273 = vmul_f32(v272, v271); + float32x2_t v277 = vmul_f32(v242, v276); + float32x2_t v281 = vmul_f32(v243, v280); + float32x2_t v285 = vmul_f32(v244, v284); + float32x2_t v291 = vrev64_f32(v245); + float32x2_t v298 = vrev64_f32(v246); + float32x2_t v305 = vrev64_f32(v247); + float32x2_t v134 = vadd_f32(v133, v128); + float32x2_t v151 = vmul_f32(v133, v254); + float32x2_t v157 = vrev64_f32(v137); + float32x2_t v188 = vmul_f32(v187, v290); + float32x2_t v195 = vmul_f32(v194, v297); + float32x2_t v202 = vmul_f32(v201, v304); + float32x2_t v238 = vadd_f32(v237, v232); + float32x2_t v255 = vmul_f32(v237, v254); + float32x2_t v261 = vrev64_f32(v241); + float32x2_t v292 = vmul_f32(v291, v290); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v306 = vmul_f32(v305, v304); + float32x2_t v135 = vadd_f32(v134, v26); + float32x2_t v158 = vmul_f32(v157, v271); + float32x2_t v203 = vadd_f32(v151, v151); + float32x2_t v216 = vadd_f32(v169, v188); + float32x2_t v218 = vsub_f32(v169, v195); + float32x2_t v220 = vsub_f32(v169, v188); + float32x2_t v239 = vadd_f32(v238, v27); + float32x2_t v262 = vmul_f32(v261, v271); + float32x2_t v307 = vadd_f32(v255, v255); + float32x2_t v320 = vadd_f32(v273, v292); + float32x2_t v322 = vsub_f32(v273, v299); + float32x2_t v324 = vsub_f32(v273, v292); + float32x2_t v204 = vadd_f32(v203, v151); + float32x2_t v208 = vadd_f32(v135, v162); + float32x2_t v217 = vadd_f32(v216, v195); + float32x2_t v219 = vadd_f32(v218, v202); + float32x2_t v221 = vsub_f32(v220, 
v202); + float32x2_t v308 = vadd_f32(v307, v255); + float32x2_t v312 = vadd_f32(v239, v266); + float32x2_t v321 = vadd_f32(v320, v299); + float32x2_t v323 = vadd_f32(v322, v306); + float32x2_t v325 = vsub_f32(v324, v306); + int16x4_t v334 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v135, 15), (int32x2_t){0, 0})); + int16x4_t v340 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v239, 15), (int32x2_t){0, 0})); + float32x2_t v205 = vadd_f32(v135, v204); + float32x2_t v209 = vadd_f32(v208, v203); + float32x2_t v309 = vadd_f32(v239, v308); + float32x2_t v313 = vadd_f32(v312, v307); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v334), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v340), 0); + float32x2_t v206 = vadd_f32(v205, v158); + float32x2_t v207 = vsub_f32(v205, v158); + float32x2_t v210 = vadd_f32(v209, v173); + float32x2_t v212 = vsub_f32(v209, v177); + float32x2_t v214 = vsub_f32(v209, v173); + float32x2_t v310 = vadd_f32(v309, v262); + float32x2_t v311 = vsub_f32(v309, v262); + float32x2_t v314 = vadd_f32(v313, v277); + float32x2_t v316 = vsub_f32(v313, v281); + float32x2_t v318 = vsub_f32(v313, v277); + float32x2_t v211 = vadd_f32(v210, v177); + float32x2_t v213 = vadd_f32(v212, v181); + float32x2_t v215 = vsub_f32(v214, v181); + float32x2_t v315 = vadd_f32(v314, v281); + float32x2_t v317 = vadd_f32(v316, v285); + float32x2_t v319 = vsub_f32(v318, v285); + int16x4_t v370 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v207, 15), (int32x2_t){0, 0})); + int16x4_t v376 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v311, 15), (int32x2_t){0, 0})); + int16x4_t v406 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v206, 15), (int32x2_t){0, 0})); + int16x4_t v412 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v310, 15), (int32x2_t){0, 0})); + float32x2_t v222 = vadd_f32(v211, v217); + float32x2_t v223 = vsub_f32(v211, v217); + float32x2_t v224 = vadd_f32(v213, v219); + float32x2_t v225 = vsub_f32(v213, v219); + float32x2_t v226 = vadd_f32(v215, v221); + float32x2_t v227 = 
vsub_f32(v215, v221); + float32x2_t v326 = vadd_f32(v315, v321); + float32x2_t v327 = vsub_f32(v315, v321); + float32x2_t v328 = vadd_f32(v317, v323); + float32x2_t v329 = vsub_f32(v317, v323); + float32x2_t v330 = vadd_f32(v319, v325); + float32x2_t v331 = vsub_f32(v319, v325); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v370), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v376), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v406), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v412), 0); + int16x4_t v346 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v223, 15), (int32x2_t){0, 0})); + int16x4_t v352 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v327, 15), (int32x2_t){0, 0})); + int16x4_t v358 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v224, 15), (int32x2_t){0, 0})); + int16x4_t v364 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v328, 15), (int32x2_t){0, 0})); + int16x4_t v382 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v227, 15), (int32x2_t){0, 0})); + int16x4_t v388 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v331, 15), (int32x2_t){0, 0})); + int16x4_t v394 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v226, 15), (int32x2_t){0, 0})); + int16x4_t v400 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v330, 15), (int32x2_t){0, 0})); + int16x4_t v418 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v225, 15), (int32x2_t){0, 0})); + int16x4_t v424 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v329, 15), (int32x2_t){0, 0})); + int16x4_t v430 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v222, 15), (int32x2_t){0, 0})); + int16x4_t v436 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v326, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v346), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v352), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v358), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v364), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v382), 0); + v6[ostride * 13] = 
vget_lane_s32(vreinterpret_s32_s16(v388), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v394), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v400), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v418), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v424), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v430), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v436), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -9977,184 +6322,100 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v677)[0])); svfloat32_t v909 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v686)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v875), "w"(v877)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v875), "w"(v877)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v879), "w"(v881)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v879), "w"(v881)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v883), "w"(v885)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v883), "w"(v885)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v887), "w"(v889)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v887), "w"(v889)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v891), "w"(v893)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v891), "w"(v893)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v895), "w"(v897)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v895), "w"(v897)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v899), "w"(v901)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v899), "w"(v901)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v144) : "w"(v903), "w"(v905)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v903), "w"(v905)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v907), "w"(v909)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v907), "w"(v909)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v48), "w"(v160)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v48), "w"(v160)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v144), "w"(v64)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v144), "w"(v64)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v80), "w"(v128)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v80), "w"(v128)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v96), "w"(v112)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v96), "w"(v112)); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v49), "w"(v161)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v49), "w"(v161)); - svfloat32_t v274; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v145), "w"(v65)); - svfloat32_t v275; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v145), "w"(v65)); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v81), "w"(v129)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v81), "w"(v129)); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v97), "w"(v113)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v97), "w"(v113)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v162), "w"(v164)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v163), "w"(v165)); - svfloat32_t v176; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v162), "w"(v164)); - svfloat32_t v177; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v177) : "w"(v164), "w"(v168)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v168), "w"(v162)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v163), "w"(v165)); - svfloat32_t v180; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v165), "w"(v169)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v169), "w"(v163)); - svfloat32_t zero210; - asm volatile("mov %0.s, #0" : "=w"(zero210)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v875, v877); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v875, v877); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v879, v881); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v879, v881); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v883, v885); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v883, v885); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v887, v889); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v887, v889); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v891, v893); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v891, v893); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v895, v897); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v895, v897); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v899, v901); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v899, v901); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v903, v905); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v903, v905); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v907, v909); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v907, v909); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v48, v160); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v48, v160); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v144, v64); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v144, v64); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v96, v112); + 
svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v96, v112); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v49, v161); + svfloat32_t v273 = svsub_f32_x(svptrue_b32(), v49, v161); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v145, v65); + svfloat32_t v275 = svsub_f32_x(svptrue_b32(), v145, v65); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v97, v113); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v97, v113); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v164, v168); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v168, v162); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v165, v169); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v169, v163); + svfloat32_t zero210 = svdup_n_f32(0); svfloat32_t v210 = svcmla_f32_x(pred_full, zero210, v704, v167, 90); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v272), "w"(v274)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v273), "w"(v275)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v272), "w"(v274)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v274), "w"(v278)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v278), "w"(v272)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v273), "w"(v275)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v275), "w"(v279)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v279), "w"(v273)); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v284 = 
svadd_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v272, v274); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v274, v278); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v278, v272); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v275, v279); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v279, v273); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v704, v277, 90); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v170), "w"(v168)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v174), "w"(v169)); - svfloat32_t zero232; - asm volatile("mov %0.s, #0" : "=w"(zero232)); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v170, v168); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v174, v169); + svfloat32_t zero232 = svdup_n_f32(0); svfloat32_t v232 = svcmla_f32_x(pred_full, zero232, v708, v179, 90); - svfloat32_t zero239; - asm volatile("mov %0.s, #0" : "=w"(zero239)); + svfloat32_t zero239 = svdup_n_f32(0); svfloat32_t v239 = svcmla_f32_x(pred_full, zero239, v709, v180, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); + svfloat32_t zero246 = svdup_n_f32(0); svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v710, v181, 90); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v278)); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v279)); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v278); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v279); + svfloat32_t zero342 = svdup_n_f32(0); svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v708, v289, 90); - svfloat32_t zero349; - asm volatile("mov %0.s, #0" : "=w"(zero349)); + svfloat32_t zero349 = svdup_n_f32(0); svfloat32_t v349 = svcmla_f32_x(pred_full, zero349, v709, 
v290, 90); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v710, v291, 90); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v171), "w"(v166)); - svfloat32_t v191; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v171), "w"(v701)); - svfloat32_t zero198; - asm volatile("mov %0.s, #0" : "=w"(zero198)); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v171, v166); + svfloat32_t v191 = svmul_f32_x(svptrue_b32(), v171, v701); + svfloat32_t zero198 = svdup_n_f32(0); svfloat32_t v198 = svcmla_f32_x(pred_full, zero198, v704, v175, 90); - svfloat32_t v260; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v210), "w"(v232)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v210), "w"(v239)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v210), "w"(v232)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v281), "w"(v276)); - svfloat32_t v301; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v281), "w"(v701)); - svfloat32_t zero308; - asm volatile("mov %0.s, #0" : "=w"(zero308)); + svfloat32_t v260 = svadd_f32_x(svptrue_b32(), v210, v232); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v210, v239); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v210, v232); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v281, v276); + svfloat32_t v301 = svmul_f32_x(svptrue_b32(), v281, v701); + svfloat32_t zero308 = svdup_n_f32(0); svfloat32_t v308 = svcmla_f32_x(pred_full, zero308, v704, v285, 90); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v320), "w"(v342)); - svfloat32_t v372; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v320), "w"(v349)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v320), "w"(v342)); - svfloat32_t v173; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v172), "w"(v32)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v247) : "w"(v191), "w"(v191)); - svfloat32_t v261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v260), "w"(v239)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v262), "w"(v246)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v264), "w"(v246)); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v33)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v301), "w"(v301)); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v349)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v372), "w"(v356)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v374), "w"(v356)); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v320, v342); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v320, v349); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v320, v342); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v172, v32); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v191, v191); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v260, v239); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v262, v246); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v264, v246); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v33); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v301, v301); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v349); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v372, v356); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v374, v356); svfloat32_t v248 = svmla_f32_x(pred_full, v247, v171, v701); svfloat32_t v252 = svmla_f32_x(pred_full, v173, v166, v703); svfloat32_t v358 = svmla_f32_x(pred_full, v357, v281, v701); @@ -10169,27 +6430,19 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v283, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v249; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v173), "w"(v248)); - svfloat32_t v253; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v252), "w"(v247)); - svfloat32_t v359; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v283), "w"(v358)); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v362), "w"(v357)); + svfloat32_t v249 = svadd_f32_x(svptrue_b32(), v173, v248); + svfloat32_t v253 = svadd_f32_x(svptrue_b32(), v252, v247); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v283, v358); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v362, v357); svst1w_u64(pred_full, (unsigned *)(v718), svreinterpret_u64_s16(v384)); svst1w_u64(pred_full, (unsigned *)(v727), svreinterpret_u64_s16(v392)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v249), "w"(v198)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v249), "w"(v198)); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v249, v198); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v249, v198); svfloat32_t v254 = svmla_f32_x(pred_full, v253, v176, v705); svfloat32_t v256 = svmls_f32_x(pred_full, v253, v177, v706); svfloat32_t v258 = svmls_f32_x(pred_full, v253, v176, v705); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v359), "w"(v308)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v359), "w"(v308)); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v359, v308); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v359, v308); svfloat32_t v364 = svmla_f32_x(pred_full, v363, v286, v705); svfloat32_t v366 = svmls_f32_x(pred_full, v363, v287, v706); svfloat32_t v368 = svmls_f32_x(pred_full, v363, v286, v705); @@ -10219,30 +6472,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu18(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v360, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v266; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v266) 
: "w"(v255), "w"(v261)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v255), "w"(v261)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v257), "w"(v263)); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v257), "w"(v263)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v259), "w"(v265)); - svfloat32_t v271; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v259), "w"(v265)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v365), "w"(v371)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v365), "w"(v371)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v367), "w"(v373)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v367), "w"(v373)); - svfloat32_t v380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v369), "w"(v375)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v369), "w"(v375)); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v255, v261); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v257, v263); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v259, v265); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v365, v371); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v365, v371); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v367, v373); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v367, v373); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v369, v375); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v369, v375); svst1w_u64(pred_full, (unsigned *)(v772), svreinterpret_u64_s16(v432)); svst1w_u64(pred_full, (unsigned *)(v781), svreinterpret_u64_s16(v440)); svst1w_u64(pred_full, (unsigned *)(v826), svreinterpret_u64_s16(v480)); @@ -10333,934 +6574,439 @@ void 
armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v744 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v243 = -1.0555555555555556e+00F; - float v248 = 1.7752228513927079e-01F; - float v253 = -1.2820077502191529e-01F; - float v258 = 4.9321510117355499e-02F; - float v263 = 5.7611011491005903e-01F; - float v268 = -7.4996449655536279e-01F; - float v273 = -1.7385438164530381e-01F; - float v278 = -2.1729997561977314e+00F; - float v283 = -1.7021211726914738e+00F; - float v288 = 4.7087858350625778e-01F; - float v293 = -2.0239400846888440e+00F; - float v298 = 1.0551641201664090e-01F; - float v303 = 2.1294564967054850e+00F; - float v308 = -7.5087543897371167e-01F; - float v313 = 1.4812817695157160e-01F; - float v318 = 8.9900361592528333e-01F; - float v323 = -6.2148246772602778e-01F; - float v328 = -7.9869352098712687e-01F; - float v333 = -4.7339199623771833e-01F; - float v337 = -2.4216105241892630e-01F; - float v338 = 2.4216105241892630e-01F; - float v345 = -5.9368607967505101e-02F; - float v346 = 5.9368607967505101e-02F; - float v353 = 1.2578688255176201e-02F; - float v354 = -1.2578688255176201e-02F; - float v361 = -4.6789919712328903e-02F; - float v362 = 4.6789919712328903e-02F; - float v369 = -9.3750121913782358e-01F; - float v370 = 9.3750121913782358e-01F; - float v377 = -5.0111537043352902e-02F; - float v378 = 5.0111537043352902e-02F; - float v385 = -9.8761275618117661e-01F; - float v386 = 9.8761275618117661e-01F; - float v393 = -1.1745786501205959e+00F; - float v394 = 1.1745786501205959e+00F; - float v401 = 1.1114482296234993e+00F; - float v402 = -1.1114482296234993e+00F; - float v409 = 2.2860268797440955e+00F; - float v410 = -2.2860268797440955e+00F; - float v417 = 2.6420523257930939e-01F; - float v418 = -2.6420523257930939e-01F; - float v425 = 2.1981792779352136e+00F; - float v426 = 
-2.1981792779352136e+00F; - float v433 = 1.9339740453559042e+00F; - float v434 = -1.9339740453559042e+00F; - float v441 = -7.4825847091254893e-01F; - float v442 = 7.4825847091254893e-01F; - float v449 = -4.7820835642768872e-01F; - float v450 = 4.7820835642768872e-01F; - float v457 = 2.7005011448486022e-01F; - float v458 = -2.7005011448486022e-01F; - float v465 = -3.4642356159542270e-01F; - float v466 = 3.4642356159542270e-01F; - float v473 = -8.3485429360688279e-01F; - float v474 = 8.3485429360688279e-01F; - float v481 = -3.9375928506743518e-01F; - float v482 = 3.9375928506743518e-01F; - float32x2_t v484 = (float32x2_t){v4, v4}; - const float32x2_t *v1370 = &v5[istride]; - int32_t *v1552 = &v6[ostride]; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v20 = v5[istride]; + float v199 = -1.0555555555555556e+00F; + float v203 = 1.7752228513927079e-01F; + float v207 = -1.2820077502191529e-01F; + float v211 = 4.9321510117355499e-02F; + float v215 = 5.7611011491005903e-01F; + float v219 = -7.4996449655536279e-01F; + float v223 = -1.7385438164530381e-01F; + float v227 = -2.1729997561977314e+00F; + float v231 = -1.7021211726914738e+00F; + float v235 = 4.7087858350625778e-01F; + float v239 = -2.0239400846888440e+00F; + float v243 = 1.0551641201664090e-01F; + float v247 = 2.1294564967054850e+00F; + float v251 = -7.5087543897371167e-01F; + float v255 = 1.4812817695157160e-01F; + float v259 = 8.9900361592528333e-01F; + float v263 = -6.2148246772602778e-01F; + float v267 = -7.9869352098712687e-01F; + float v271 = -4.7339199623771833e-01F; + float v274 = -2.4216105241892630e-01F; + float v275 = 2.4216105241892630e-01F; + float v281 = -5.9368607967505101e-02F; + float v282 = 5.9368607967505101e-02F; + float v288 = 1.2578688255176201e-02F; + float v289 = -1.2578688255176201e-02F; + float v295 = -4.6789919712328903e-02F; + float v296 = 4.6789919712328903e-02F; + float v302 = -9.3750121913782358e-01F; + float v303 = 9.3750121913782358e-01F; + float v309 = 
-5.0111537043352902e-02F; + float v310 = 5.0111537043352902e-02F; + float v316 = -9.8761275618117661e-01F; + float v317 = 9.8761275618117661e-01F; + float v323 = -1.1745786501205959e+00F; + float v324 = 1.1745786501205959e+00F; + float v330 = 1.1114482296234993e+00F; + float v331 = -1.1114482296234993e+00F; + float v337 = 2.2860268797440955e+00F; + float v338 = -2.2860268797440955e+00F; + float v344 = 2.6420523257930939e-01F; + float v345 = -2.6420523257930939e-01F; + float v351 = 2.1981792779352136e+00F; + float v352 = -2.1981792779352136e+00F; + float v358 = 1.9339740453559042e+00F; + float v359 = -1.9339740453559042e+00F; + float v365 = -7.4825847091254893e-01F; + float v366 = 7.4825847091254893e-01F; + float v372 = -4.7820835642768872e-01F; + float v373 = 4.7820835642768872e-01F; + float v379 = 2.7005011448486022e-01F; + float v380 = -2.7005011448486022e-01F; + float v386 = -3.4642356159542270e-01F; + float v387 = 3.4642356159542270e-01F; + float v393 = -8.3485429360688279e-01F; + float v394 = 8.3485429360688279e-01F; + float v400 = -3.9375928506743518e-01F; + float v401 = 3.9375928506743518e-01F; + float32x2_t v403 = (float32x2_t){v4, v4}; + float32x2_t v144 = v5[0]; + float32x2_t v200 = (float32x2_t){v199, v199}; + float32x2_t v204 = (float32x2_t){v203, v203}; + float32x2_t v208 = (float32x2_t){v207, v207}; + float32x2_t v212 = (float32x2_t){v211, v211}; + float32x2_t v216 = (float32x2_t){v215, v215}; + float32x2_t v220 = (float32x2_t){v219, v219}; + float32x2_t v224 = (float32x2_t){v223, v223}; + float32x2_t v228 = (float32x2_t){v227, v227}; + float32x2_t v232 = (float32x2_t){v231, v231}; + float32x2_t v236 = (float32x2_t){v235, v235}; + float32x2_t v240 = (float32x2_t){v239, v239}; float32x2_t v244 = (float32x2_t){v243, v243}; - float32x2_t v249 = (float32x2_t){v248, v248}; - float32x2_t v254 = (float32x2_t){v253, v253}; - float32x2_t v259 = (float32x2_t){v258, v258}; + float32x2_t v248 = (float32x2_t){v247, v247}; + float32x2_t v252 = (float32x2_t){v251, 
v251}; + float32x2_t v256 = (float32x2_t){v255, v255}; + float32x2_t v260 = (float32x2_t){v259, v259}; float32x2_t v264 = (float32x2_t){v263, v263}; - float32x2_t v269 = (float32x2_t){v268, v268}; - float32x2_t v274 = (float32x2_t){v273, v273}; - float32x2_t v279 = (float32x2_t){v278, v278}; - float32x2_t v284 = (float32x2_t){v283, v283}; - float32x2_t v289 = (float32x2_t){v288, v288}; - float32x2_t v294 = (float32x2_t){v293, v293}; - float32x2_t v299 = (float32x2_t){v298, v298}; - float32x2_t v304 = (float32x2_t){v303, v303}; - float32x2_t v309 = (float32x2_t){v308, v308}; - float32x2_t v314 = (float32x2_t){v313, v313}; - float32x2_t v319 = (float32x2_t){v318, v318}; - float32x2_t v324 = (float32x2_t){v323, v323}; - float32x2_t v329 = (float32x2_t){v328, v328}; - float32x2_t v334 = (float32x2_t){v333, v333}; + float32x2_t v268 = (float32x2_t){v267, v267}; + float32x2_t v272 = (float32x2_t){v271, v271}; + float32x2_t v276 = (float32x2_t){v274, v275}; + float32x2_t v283 = (float32x2_t){v281, v282}; + float32x2_t v290 = (float32x2_t){v288, v289}; + float32x2_t v297 = (float32x2_t){v295, v296}; + float32x2_t v304 = (float32x2_t){v302, v303}; + float32x2_t v311 = (float32x2_t){v309, v310}; + float32x2_t v318 = (float32x2_t){v316, v317}; + float32x2_t v325 = (float32x2_t){v323, v324}; + float32x2_t v332 = (float32x2_t){v330, v331}; float32x2_t v339 = (float32x2_t){v337, v338}; - float32x2_t v347 = (float32x2_t){v345, v346}; - float32x2_t v355 = (float32x2_t){v353, v354}; - float32x2_t v363 = (float32x2_t){v361, v362}; - float32x2_t v371 = (float32x2_t){v369, v370}; - float32x2_t v379 = (float32x2_t){v377, v378}; - float32x2_t v387 = (float32x2_t){v385, v386}; + float32x2_t v346 = (float32x2_t){v344, v345}; + float32x2_t v353 = (float32x2_t){v351, v352}; + float32x2_t v360 = (float32x2_t){v358, v359}; + float32x2_t v367 = (float32x2_t){v365, v366}; + float32x2_t v374 = (float32x2_t){v372, v373}; + float32x2_t v381 = (float32x2_t){v379, v380}; + float32x2_t v388 = 
(float32x2_t){v386, v387}; float32x2_t v395 = (float32x2_t){v393, v394}; - float32x2_t v403 = (float32x2_t){v401, v402}; - float32x2_t v411 = (float32x2_t){v409, v410}; - float32x2_t v419 = (float32x2_t){v417, v418}; - float32x2_t v427 = (float32x2_t){v425, v426}; - float32x2_t v435 = (float32x2_t){v433, v434}; - float32x2_t v443 = (float32x2_t){v441, v442}; - float32x2_t v451 = (float32x2_t){v449, v450}; - float32x2_t v459 = (float32x2_t){v457, v458}; - float32x2_t v467 = (float32x2_t){v465, v466}; - float32x2_t v475 = (float32x2_t){v473, v474}; - float32x2_t v483 = (float32x2_t){v481, v482}; - const float32x2_t *v1533 = &v5[0]; - int32_t *v1543 = &v6[0]; - float32x4_t v1709 = vld1q_f32((const float32_t *)v1370); - float32x4_t v245 = vcombine_f32(v244, v244); - float32x4_t v250 = vcombine_f32(v249, v249); - float32x4_t v255 = vcombine_f32(v254, v254); - float32x4_t v260 = vcombine_f32(v259, v259); - float32x4_t v265 = vcombine_f32(v264, v264); - float32x4_t v270 = vcombine_f32(v269, v269); - float32x4_t v275 = vcombine_f32(v274, v274); - float32x4_t v280 = vcombine_f32(v279, v279); - float32x4_t v285 = vcombine_f32(v284, v284); - float32x4_t v290 = vcombine_f32(v289, v289); - float32x4_t v295 = vcombine_f32(v294, v294); - float32x4_t v300 = vcombine_f32(v299, v299); - float32x4_t v305 = vcombine_f32(v304, v304); - float32x4_t v310 = vcombine_f32(v309, v309); - float32x4_t v315 = vcombine_f32(v314, v314); - float32x4_t v320 = vcombine_f32(v319, v319); - float32x4_t v325 = vcombine_f32(v324, v324); - float32x4_t v330 = vcombine_f32(v329, v329); - float32x4_t v335 = vcombine_f32(v334, v334); - float32x2_t v341 = vmul_f32(v484, v339); - float32x2_t v349 = vmul_f32(v484, v347); - float32x2_t v357 = vmul_f32(v484, v355); - float32x2_t v365 = vmul_f32(v484, v363); - float32x2_t v373 = vmul_f32(v484, v371); - float32x2_t v381 = vmul_f32(v484, v379); - float32x2_t v389 = vmul_f32(v484, v387); - float32x2_t v397 = vmul_f32(v484, v395); - float32x2_t v405 = vmul_f32(v484, 
v403); - float32x2_t v413 = vmul_f32(v484, v411); - float32x2_t v421 = vmul_f32(v484, v419); - float32x2_t v429 = vmul_f32(v484, v427); - float32x2_t v437 = vmul_f32(v484, v435); - float32x2_t v445 = vmul_f32(v484, v443); - float32x2_t v453 = vmul_f32(v484, v451); - float32x2_t v461 = vmul_f32(v484, v459); - float32x2_t v469 = vmul_f32(v484, v467); - float32x2_t v477 = vmul_f32(v484, v475); - float32x2_t v485 = vmul_f32(v484, v483); - const float32x2_t *v1379 = &v5[istride * 18]; - const float32x2_t *v1388 = &v5[istride * 2]; - const float32x2_t *v1397 = &v5[istride * 17]; - const float32x2_t *v1406 = &v5[istride * 4]; - const float32x2_t *v1415 = &v5[istride * 15]; - const float32x2_t *v1424 = &v5[istride * 8]; - const float32x2_t *v1433 = &v5[istride * 11]; - const float32x2_t *v1442 = &v5[istride * 16]; - const float32x2_t *v1451 = &v5[istride * 3]; - const float32x2_t *v1460 = &v5[istride * 13]; - const float32x2_t *v1469 = &v5[istride * 6]; - const float32x2_t *v1478 = &v5[istride * 7]; - const float32x2_t *v1487 = &v5[istride * 12]; - const float32x2_t *v1496 = &v5[istride * 14]; - const float32x2_t *v1505 = &v5[istride * 5]; - const float32x2_t *v1514 = &v5[istride * 9]; - const float32x2_t *v1523 = &v5[istride * 10]; - int32_t *v1561 = &v6[ostride * 18]; - int32_t *v1570 = &v6[ostride * 2]; - int32_t *v1579 = &v6[ostride * 17]; - int32_t *v1588 = &v6[ostride * 3]; - int32_t *v1597 = &v6[ostride * 16]; - int32_t *v1606 = &v6[ostride * 4]; - int32_t *v1615 = &v6[ostride * 15]; - int32_t *v1624 = &v6[ostride * 5]; - int32_t *v1633 = &v6[ostride * 14]; - int32_t *v1642 = &v6[ostride * 6]; - int32_t *v1651 = &v6[ostride * 13]; - int32_t *v1660 = &v6[ostride * 7]; - int32_t *v1669 = &v6[ostride * 12]; - int32_t *v1678 = &v6[ostride * 8]; - int32_t *v1687 = &v6[ostride * 11]; - int32_t *v1696 = &v6[ostride * 9]; - int32_t *v1705 = &v6[ostride * 10]; - float32x4_t v1745 = vld1q_f32((const float32_t *)v1533); - float32x4_t v343 = vcombine_f32(v341, v341); - 
float32x4_t v351 = vcombine_f32(v349, v349); - float32x4_t v359 = vcombine_f32(v357, v357); - float32x4_t v367 = vcombine_f32(v365, v365); - float32x4_t v375 = vcombine_f32(v373, v373); - float32x4_t v383 = vcombine_f32(v381, v381); - float32x4_t v391 = vcombine_f32(v389, v389); - float32x4_t v399 = vcombine_f32(v397, v397); - float32x4_t v407 = vcombine_f32(v405, v405); - float32x4_t v415 = vcombine_f32(v413, v413); - float32x4_t v423 = vcombine_f32(v421, v421); - float32x4_t v431 = vcombine_f32(v429, v429); - float32x4_t v439 = vcombine_f32(v437, v437); - float32x4_t v447 = vcombine_f32(v445, v445); - float32x4_t v455 = vcombine_f32(v453, v453); - float32x4_t v463 = vcombine_f32(v461, v461); - float32x4_t v471 = vcombine_f32(v469, v469); - float32x4_t v479 = vcombine_f32(v477, v477); - float32x4_t v487 = vcombine_f32(v485, v485); - float32x4_t v1711 = vld1q_f32((const float32_t *)v1379); - float32x4_t v1713 = vld1q_f32((const float32_t *)v1388); - float32x4_t v1715 = vld1q_f32((const float32_t *)v1397); - float32x4_t v1717 = vld1q_f32((const float32_t *)v1406); - float32x4_t v1719 = vld1q_f32((const float32_t *)v1415); - float32x4_t v1721 = vld1q_f32((const float32_t *)v1424); - float32x4_t v1723 = vld1q_f32((const float32_t *)v1433); - float32x4_t v1725 = vld1q_f32((const float32_t *)v1442); - float32x4_t v1727 = vld1q_f32((const float32_t *)v1451); - float32x4_t v1729 = vld1q_f32((const float32_t *)v1460); - float32x4_t v1731 = vld1q_f32((const float32_t *)v1469); - float32x4_t v1733 = vld1q_f32((const float32_t *)v1478); - float32x4_t v1735 = vld1q_f32((const float32_t *)v1487); - float32x4_t v1737 = vld1q_f32((const float32_t *)v1496); - float32x4_t v1739 = vld1q_f32((const float32_t *)v1505); - float32x4_t v1741 = vld1q_f32((const float32_t *)v1514); - float32x4_t v1743 = vld1q_f32((const float32_t *)v1523); - float32x4_t v35 = vaddq_f32(v1709, v1711); - float32x4_t v36 = vsubq_f32(v1709, v1711); - float32x4_t v51 = vaddq_f32(v1713, v1715); - float32x4_t v52 
= vsubq_f32(v1715, v1713); - float32x4_t v67 = vaddq_f32(v1717, v1719); - float32x4_t v68 = vsubq_f32(v1717, v1719); - float32x4_t v83 = vaddq_f32(v1721, v1723); - float32x4_t v84 = vsubq_f32(v1723, v1721); - float32x4_t v99 = vaddq_f32(v1725, v1727); - float32x4_t v100 = vsubq_f32(v1725, v1727); - float32x4_t v115 = vaddq_f32(v1729, v1731); - float32x4_t v116 = vsubq_f32(v1731, v1729); - float32x4_t v131 = vaddq_f32(v1733, v1735); - float32x4_t v132 = vsubq_f32(v1733, v1735); - float32x4_t v147 = vaddq_f32(v1737, v1739); - float32x4_t v148 = vsubq_f32(v1739, v1737); - float32x4_t v163 = vaddq_f32(v1741, v1743); - float32x4_t v164 = vsubq_f32(v1741, v1743); - float32x4_t v165 = vsubq_f32(v35, v131); - float32x4_t v166 = vsubq_f32(v51, v147); - float32x4_t v167 = vsubq_f32(v67, v163); - float32x4_t v168 = vsubq_f32(v83, v131); - float32x4_t v169 = vsubq_f32(v99, v147); - float32x4_t v170 = vsubq_f32(v115, v163); - float32x4_t v171 = vaddq_f32(v35, v83); - float32x4_t v173 = vaddq_f32(v51, v99); - float32x4_t v175 = vaddq_f32(v67, v115); - float32x4_t v205 = vsubq_f32(v36, v132); - float32x4_t v206 = vsubq_f32(v52, v148); - float32x4_t v207 = vsubq_f32(v68, v164); - float32x4_t v208 = vsubq_f32(v84, v132); - float32x4_t v209 = vsubq_f32(v100, v148); - float32x4_t v210 = vsubq_f32(v116, v164); - float32x4_t v211 = vaddq_f32(v36, v84); - float32x4_t v213 = vaddq_f32(v52, v100); - float32x4_t v215 = vaddq_f32(v68, v116); - float32x4_t v172 = vaddq_f32(v171, v131); - float32x4_t v174 = vaddq_f32(v173, v147); - float32x4_t v176 = vaddq_f32(v175, v163); - float32x4_t v177 = vaddq_f32(v165, v167); - float32x4_t v178 = vaddq_f32(v168, v170); - float32x4_t v195 = vsubq_f32(v165, v168); - float32x4_t v196 = vsubq_f32(v167, v170); - float32x4_t v212 = vaddq_f32(v211, v132); - float32x4_t v214 = vaddq_f32(v213, v148); - float32x4_t v216 = vaddq_f32(v215, v164); - float32x4_t v217 = vaddq_f32(v205, v207); - float32x4_t v218 = vaddq_f32(v208, v210); - float32x4_t v227 = 
vsubq_f32(v205, v208); - float32x4_t v228 = vsubq_f32(v207, v210); - float32x4_t v281 = vmulq_f32(v168, v280); - float32x4_t v296 = vmulq_f32(v170, v295); - float32x4_t v306 = vmulq_f32(v167, v305); - float32x4_t v398 = vrev64q_f32(v208); - float32x4_t v414 = vrev64q_f32(v205); - float32x4_t v422 = vrev64q_f32(v210); - float32x4_t v438 = vrev64q_f32(v207); - float32x4_t v179 = vaddq_f32(v172, v174); - float32x4_t v189 = vaddq_f32(v178, v169); - float32x4_t v190 = vaddq_f32(v177, v166); - float32x4_t v192 = vsubq_f32(v178, v169); - float32x4_t v193 = vsubq_f32(v177, v166); - float32x4_t v197 = vsubq_f32(v165, v196); - float32x4_t v199 = vaddq_f32(v195, v170); - float32x4_t v202 = vsubq_f32(v172, v176); - float32x4_t v203 = vsubq_f32(v174, v176); - float32x4_t v219 = vaddq_f32(v212, v214); - float32x4_t v221 = vaddq_f32(v218, v209); - float32x4_t v222 = vaddq_f32(v217, v206); - float32x4_t v224 = vsubq_f32(v218, v209); - float32x4_t v225 = vsubq_f32(v217, v206); - float32x4_t v229 = vsubq_f32(v205, v228); - float32x4_t v231 = vaddq_f32(v227, v210); - float32x4_t v234 = vsubq_f32(v212, v216); - float32x4_t v235 = vsubq_f32(v214, v216); - float32x4_t v286 = vmulq_f32(v195, v285); - float32x4_t v301 = vmulq_f32(v196, v300); - float32x4_t v400 = vmulq_f32(v398, v399); - float32x4_t v406 = vrev64q_f32(v227); - float32x4_t v424 = vmulq_f32(v422, v423); - float32x4_t v430 = vrev64q_f32(v228); - float32x4_t v440 = vmulq_f32(v438, v439); - float32x4_t v180 = vaddq_f32(v179, v176); - float32x4_t v191 = vsubq_f32(v190, v189); - float32x4_t v194 = vsubq_f32(v193, v192); - float32x4_t v198 = vsubq_f32(v197, v169); - float32x4_t v200 = vsubq_f32(v199, v166); - float32x4_t v204 = vaddq_f32(v202, v203); - float32x4_t v220 = vaddq_f32(v219, v216); - float32x4_t v223 = vsubq_f32(v222, v221); - float32x4_t v226 = vsubq_f32(v225, v224); - float32x4_t v230 = vsubq_f32(v229, v209); - float32x4_t v232 = vsubq_f32(v231, v206); - float32x4_t v236 = vaddq_f32(v234, v235); - float32x4_t v251 = 
vmulq_f32(v189, v250); - float32x4_t v256 = vmulq_f32(v190, v255); - float32x4_t v266 = vmulq_f32(v192, v265); - float32x4_t v271 = vmulq_f32(v193, v270); - float32x4_t v326 = vmulq_f32(v202, v325); - float32x4_t v331 = vmulq_f32(v203, v330); - float32x4_t v350 = vrev64q_f32(v221); - float32x4_t v358 = vrev64q_f32(v222); - float32x4_t v374 = vrev64q_f32(v224); - float32x4_t v382 = vrev64q_f32(v225); - float32x4_t v408 = vmulq_f32(v406, v407); - float32x4_t v432 = vmulq_f32(v430, v431); - float32x4_t v470 = vrev64q_f32(v234); - float32x4_t v478 = vrev64q_f32(v235); - float32x4_t v188 = vaddq_f32(v1745, v180); - float32x4_t v201 = vsubq_f32(v198, v200); - float32x4_t v233 = vsubq_f32(v230, v232); - float32x4_t v246 = vmulq_f32(v180, v245); - float32x4_t v261 = vmulq_f32(v191, v260); - float32x4_t v276 = vmulq_f32(v194, v275); - float32x4_t v311 = vmulq_f32(v198, v310); - float32x4_t v316 = vmulq_f32(v200, v315); - float32x4_t v336 = vmulq_f32(v204, v335); - float32x4_t v342 = vrev64q_f32(v220); - float32x4_t v352 = vmulq_f32(v350, v351); - float32x4_t v360 = vmulq_f32(v358, v359); - float32x4_t v366 = vrev64q_f32(v223); - float32x4_t v376 = vmulq_f32(v374, v375); - float32x4_t v384 = vmulq_f32(v382, v383); - float32x4_t v390 = vrev64q_f32(v226); - float32x4_t v446 = vrev64q_f32(v230); - float32x4_t v454 = vrev64q_f32(v232); - float32x4_t v472 = vmulq_f32(v470, v471); - float32x4_t v480 = vmulq_f32(v478, v479); - float32x4_t v486 = vrev64q_f32(v236); - float32x4_t v489 = vaddq_f32(v251, v256); - float32x4_t v490 = vaddq_f32(v266, v271); - float32x4_t v321 = vmulq_f32(v201, v320); - float32x4_t v344 = vmulq_f32(v342, v343); - float32x4_t v368 = vmulq_f32(v366, v367); - float32x4_t v392 = vmulq_f32(v390, v391); - float32x4_t v448 = vmulq_f32(v446, v447); - float32x4_t v456 = vmulq_f32(v454, v455); - float32x4_t v462 = vrev64q_f32(v233); - float32x4_t v488 = vmulq_f32(v486, v487); - float32x4_t v492 = vaddq_f32(v489, v490); - float32x4_t v493 = vaddq_f32(v251, v261); - 
float32x4_t v494 = vaddq_f32(v266, v276); - float32x4_t v511 = vsubq_f32(v489, v490); - float32x4_t v513 = vsubq_f32(v326, v336); - float32x4_t v514 = vsubq_f32(v331, v336); - float32x4_t v515 = vaddq_f32(v246, v188); - float32x4_t v520 = vaddq_f32(v352, v360); - float32x4_t v521 = vaddq_f32(v376, v384); - int16x4_t v576 = vqmovn_s32(vcvtq_n_s32_f32(v188, 15)); - float32x4_t v464 = vmulq_f32(v462, v463); - float32x4_t v491 = vaddq_f32(v316, v321); - float32x4_t v495 = vaddq_f32(v311, v321); - float32x4_t v496 = vsubq_f32(v281, v492); - float32x4_t v497 = vaddq_f32(v493, v494); - float32x4_t v503 = vsubq_f32(v493, v494); - float32x4_t v508 = vaddq_f32(v492, v306); - float32x4_t v516 = vaddq_f32(v515, v513); - float32x4_t v517 = vsubq_f32(v515, v513); - float32x4_t v519 = vaddq_f32(v515, v514); - float32x4_t v523 = vaddq_f32(v520, v521); - float32x4_t v524 = vaddq_f32(v352, v368); - float32x4_t v525 = vaddq_f32(v376, v392); - float32x4_t v542 = vsubq_f32(v520, v521); - float32x4_t v544 = vsubq_f32(v472, v488); - float32x4_t v545 = vsubq_f32(v480, v488); - vst1_s16((int16_t *)v1543, v576); - float32x4_t v498 = vsubq_f32(v296, v495); - float32x4_t v499 = vaddq_f32(v286, v491); - float32x4_t v501 = vaddq_f32(v497, v301); - float32x4_t v504 = vaddq_f32(v503, v491); - float32x4_t v505 = vaddq_f32(v496, v497); - float32x4_t v512 = vaddq_f32(v511, v495); - float32x4_t v518 = vsubq_f32(v517, v514); - float32x4_t v522 = vaddq_f32(v456, v464); - float32x4_t v526 = vaddq_f32(v448, v464); - float32x4_t v527 = vsubq_f32(v400, v523); - float32x4_t v528 = vaddq_f32(v524, v525); - float32x4_t v534 = vsubq_f32(v524, v525); - float32x4_t v539 = vaddq_f32(v523, v440); - float32x4_t v546 = vaddq_f32(v344, v544); - float32x4_t v547 = vsubq_f32(v344, v544); - float32x4_t v549 = vaddq_f32(v344, v545); - float32x4_t v500 = vaddq_f32(v499, v496); - float32x4_t v502 = vaddq_f32(v501, v498); - float32x4_t v506 = vfmaq_f32(v505, v165, v290); - float32x4_t v509 = vaddq_f32(v508, v498); - 
float32x4_t v529 = vsubq_f32(v424, v526); - float32x4_t v530 = vaddq_f32(v408, v522); - float32x4_t v532 = vaddq_f32(v528, v432); - float32x4_t v535 = vaddq_f32(v534, v522); - float32x4_t v536 = vaddq_f32(v527, v528); - float32x4_t v543 = vaddq_f32(v542, v526); - float32x4_t v548 = vsubq_f32(v547, v545); - float32x4_t v554 = vsubq_f32(v512, v504); - float32x4_t v558 = vsubq_f32(v519, v512); - float32x4_t v561 = vaddq_f32(v504, v519); - float32x4_t v507 = vaddq_f32(v506, v495); - float32x4_t v510 = vaddq_f32(v509, v491); - float32x4_t v531 = vaddq_f32(v530, v527); - float32x4_t v533 = vaddq_f32(v532, v529); - float32x4_t v537 = vfmaq_f32(v536, v414, v415); - float32x4_t v540 = vaddq_f32(v539, v529); - float32x4_t v555 = vaddq_f32(v554, v519); - float32x4_t v559 = vaddq_f32(v500, v516); - float32x4_t v560 = vaddq_f32(v502, v518); - float32x4_t v566 = vsubq_f32(v543, v535); - float32x4_t v570 = vsubq_f32(v543, v549); - float32x4_t v573 = vaddq_f32(v535, v549); - float32x4_t v538 = vaddq_f32(v537, v526); - float32x4_t v541 = vaddq_f32(v540, v522); - float32x4_t v550 = vsubq_f32(v507, v500); - float32x4_t v552 = vsubq_f32(v510, v502); - float32x4_t v556 = vsubq_f32(v516, v507); - float32x4_t v557 = vsubq_f32(v518, v510); - float32x4_t v567 = vaddq_f32(v566, v549); - float32x4_t v571 = vaddq_f32(v531, v546); - float32x4_t v572 = vaddq_f32(v533, v548); - float32x4_t v600 = vsubq_f32(v561, v573); - float32x4_t v609 = vaddq_f32(v561, v573); - float32x4_t v618 = vaddq_f32(v558, v570); - float32x4_t v627 = vsubq_f32(v558, v570); - float32x4_t v551 = vaddq_f32(v550, v516); - float32x4_t v553 = vaddq_f32(v552, v518); - float32x4_t v562 = vsubq_f32(v538, v531); - float32x4_t v564 = vsubq_f32(v541, v533); - float32x4_t v568 = vsubq_f32(v546, v538); - float32x4_t v569 = vsubq_f32(v548, v541); - int16x4_t v603 = vqmovn_s32(vcvtq_n_s32_f32(v600, 15)); - int16x4_t v612 = vqmovn_s32(vcvtq_n_s32_f32(v609, 15)); - int16x4_t v621 = vqmovn_s32(vcvtq_n_s32_f32(v618, 15)); - int16x4_t v630 
= vqmovn_s32(vcvtq_n_s32_f32(v627, 15)); - float32x4_t v636 = vaddq_f32(v560, v572); - float32x4_t v645 = vsubq_f32(v560, v572); - float32x4_t v654 = vaddq_f32(v555, v567); - float32x4_t v663 = vsubq_f32(v555, v567); - float32x4_t v708 = vsubq_f32(v559, v571); - float32x4_t v717 = vaddq_f32(v559, v571); - float32x4_t v563 = vaddq_f32(v562, v546); - float32x4_t v565 = vaddq_f32(v564, v548); - int16x4_t v639 = vqmovn_s32(vcvtq_n_s32_f32(v636, 15)); - int16x4_t v648 = vqmovn_s32(vcvtq_n_s32_f32(v645, 15)); - int16x4_t v657 = vqmovn_s32(vcvtq_n_s32_f32(v654, 15)); - int16x4_t v666 = vqmovn_s32(vcvtq_n_s32_f32(v663, 15)); - float32x4_t v672 = vaddq_f32(v557, v569); - float32x4_t v681 = vsubq_f32(v557, v569); - float32x4_t v690 = vaddq_f32(v556, v568); - float32x4_t v699 = vsubq_f32(v556, v568); - int16x4_t v711 = vqmovn_s32(vcvtq_n_s32_f32(v708, 15)); - int16x4_t v720 = vqmovn_s32(vcvtq_n_s32_f32(v717, 15)); - vst1_s16((int16_t *)v1570, v603); - vst1_s16((int16_t *)v1579, v612); - vst1_s16((int16_t *)v1588, v621); - vst1_s16((int16_t *)v1597, v630); - float32x4_t v582 = vaddq_f32(v551, v563); - float32x4_t v591 = vsubq_f32(v551, v563); - int16x4_t v675 = vqmovn_s32(vcvtq_n_s32_f32(v672, 15)); - int16x4_t v684 = vqmovn_s32(vcvtq_n_s32_f32(v681, 15)); - int16x4_t v693 = vqmovn_s32(vcvtq_n_s32_f32(v690, 15)); - int16x4_t v702 = vqmovn_s32(vcvtq_n_s32_f32(v699, 15)); - float32x4_t v726 = vaddq_f32(v553, v565); - float32x4_t v735 = vsubq_f32(v553, v565); - vst1_s16((int16_t *)v1606, v639); - vst1_s16((int16_t *)v1615, v648); - vst1_s16((int16_t *)v1624, v657); - vst1_s16((int16_t *)v1633, v666); - vst1_s16((int16_t *)v1678, v711); - vst1_s16((int16_t *)v1687, v720); - int16x4_t v585 = vqmovn_s32(vcvtq_n_s32_f32(v582, 15)); - int16x4_t v594 = vqmovn_s32(vcvtq_n_s32_f32(v591, 15)); - int16x4_t v729 = vqmovn_s32(vcvtq_n_s32_f32(v726, 15)); - int16x4_t v738 = vqmovn_s32(vcvtq_n_s32_f32(v735, 15)); - vst1_s16((int16_t *)v1642, v675); - vst1_s16((int16_t *)v1651, v684); - 
vst1_s16((int16_t *)v1660, v693); - vst1_s16((int16_t *)v1669, v702); - vst1_s16((int16_t *)v1552, v585); - vst1_s16((int16_t *)v1561, v594); - vst1_s16((int16_t *)v1696, v729); - vst1_s16((int16_t *)v1705, v738); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v744 * 2; j < howmany; j += 1) { - float32x2_t v756 = v5[istride]; - float v935 = -1.0555555555555556e+00F; - float v939 = 1.7752228513927079e-01F; - float v943 = -1.2820077502191529e-01F; - float v947 = 4.9321510117355499e-02F; - float v951 = 5.7611011491005903e-01F; - float v955 = -7.4996449655536279e-01F; - float v959 = -1.7385438164530381e-01F; - float v963 = -2.1729997561977314e+00F; - float v967 = -1.7021211726914738e+00F; - float v971 = 4.7087858350625778e-01F; - float v975 = -2.0239400846888440e+00F; - float v979 = 1.0551641201664090e-01F; - float v983 = 2.1294564967054850e+00F; - float v987 = -7.5087543897371167e-01F; - float v991 = 1.4812817695157160e-01F; - float v995 = 8.9900361592528333e-01F; - float v999 = -6.2148246772602778e-01F; - float v1003 = -7.9869352098712687e-01F; - float v1007 = -4.7339199623771833e-01F; - float v1010 = -2.4216105241892630e-01F; - float v1011 = 2.4216105241892630e-01F; - float v1017 = -5.9368607967505101e-02F; - float v1018 = 5.9368607967505101e-02F; - float v1024 = 1.2578688255176201e-02F; - float v1025 = -1.2578688255176201e-02F; - float v1031 = -4.6789919712328903e-02F; - float v1032 = 4.6789919712328903e-02F; - float v1038 = -9.3750121913782358e-01F; - float v1039 = 9.3750121913782358e-01F; - float v1045 = -5.0111537043352902e-02F; - float v1046 = 5.0111537043352902e-02F; - float v1052 = -9.8761275618117661e-01F; - float v1053 = 9.8761275618117661e-01F; - float v1059 = -1.1745786501205959e+00F; - float v1060 = 1.1745786501205959e+00F; - float v1066 = 1.1114482296234993e+00F; - float v1067 = -1.1114482296234993e+00F; - float v1073 = 2.2860268797440955e+00F; - float v1074 = -2.2860268797440955e+00F; - float v1080 = 2.6420523257930939e-01F; - float v1081 = 
-2.6420523257930939e-01F; - float v1087 = 2.1981792779352136e+00F; - float v1088 = -2.1981792779352136e+00F; - float v1094 = 1.9339740453559042e+00F; - float v1095 = -1.9339740453559042e+00F; - float v1101 = -7.4825847091254893e-01F; - float v1102 = 7.4825847091254893e-01F; - float v1108 = -4.7820835642768872e-01F; - float v1109 = 4.7820835642768872e-01F; - float v1115 = 2.7005011448486022e-01F; - float v1116 = -2.7005011448486022e-01F; - float v1122 = -3.4642356159542270e-01F; - float v1123 = 3.4642356159542270e-01F; - float v1129 = -8.3485429360688279e-01F; - float v1130 = 8.3485429360688279e-01F; - float v1136 = -3.9375928506743518e-01F; - float v1137 = 3.9375928506743518e-01F; - float32x2_t v1139 = (float32x2_t){v4, v4}; - float32x2_t v880 = v5[0]; - float32x2_t v936 = (float32x2_t){v935, v935}; - float32x2_t v940 = (float32x2_t){v939, v939}; - float32x2_t v944 = (float32x2_t){v943, v943}; - float32x2_t v948 = (float32x2_t){v947, v947}; - float32x2_t v952 = (float32x2_t){v951, v951}; - float32x2_t v956 = (float32x2_t){v955, v955}; - float32x2_t v960 = (float32x2_t){v959, v959}; - float32x2_t v964 = (float32x2_t){v963, v963}; - float32x2_t v968 = (float32x2_t){v967, v967}; - float32x2_t v972 = (float32x2_t){v971, v971}; - float32x2_t v976 = (float32x2_t){v975, v975}; - float32x2_t v980 = (float32x2_t){v979, v979}; - float32x2_t v984 = (float32x2_t){v983, v983}; - float32x2_t v988 = (float32x2_t){v987, v987}; - float32x2_t v992 = (float32x2_t){v991, v991}; - float32x2_t v996 = (float32x2_t){v995, v995}; - float32x2_t v1000 = (float32x2_t){v999, v999}; - float32x2_t v1004 = (float32x2_t){v1003, v1003}; - float32x2_t v1008 = (float32x2_t){v1007, v1007}; - float32x2_t v1012 = (float32x2_t){v1010, v1011}; - float32x2_t v1019 = (float32x2_t){v1017, v1018}; - float32x2_t v1026 = (float32x2_t){v1024, v1025}; - float32x2_t v1033 = (float32x2_t){v1031, v1032}; - float32x2_t v1040 = (float32x2_t){v1038, v1039}; - float32x2_t v1047 = (float32x2_t){v1045, v1046}; - 
float32x2_t v1054 = (float32x2_t){v1052, v1053}; - float32x2_t v1061 = (float32x2_t){v1059, v1060}; - float32x2_t v1068 = (float32x2_t){v1066, v1067}; - float32x2_t v1075 = (float32x2_t){v1073, v1074}; - float32x2_t v1082 = (float32x2_t){v1080, v1081}; - float32x2_t v1089 = (float32x2_t){v1087, v1088}; - float32x2_t v1096 = (float32x2_t){v1094, v1095}; - float32x2_t v1103 = (float32x2_t){v1101, v1102}; - float32x2_t v1110 = (float32x2_t){v1108, v1109}; - float32x2_t v1117 = (float32x2_t){v1115, v1116}; - float32x2_t v1124 = (float32x2_t){v1122, v1123}; - float32x2_t v1131 = (float32x2_t){v1129, v1130}; - float32x2_t v1138 = (float32x2_t){v1136, v1137}; - float32x2_t v761 = v5[istride * 18]; - float32x2_t v768 = v5[istride * 2]; - float32x2_t v773 = v5[istride * 17]; - float32x2_t v780 = v5[istride * 4]; - float32x2_t v785 = v5[istride * 15]; - float32x2_t v792 = v5[istride * 8]; - float32x2_t v797 = v5[istride * 11]; - float32x2_t v804 = v5[istride * 16]; - float32x2_t v809 = v5[istride * 3]; - float32x2_t v816 = v5[istride * 13]; - float32x2_t v821 = v5[istride * 6]; - float32x2_t v828 = v5[istride * 7]; - float32x2_t v833 = v5[istride * 12]; - float32x2_t v840 = v5[istride * 14]; - float32x2_t v845 = v5[istride * 5]; - float32x2_t v852 = v5[istride * 9]; - float32x2_t v857 = v5[istride * 10]; - float32x2_t v1014 = vmul_f32(v1139, v1012); - float32x2_t v1021 = vmul_f32(v1139, v1019); - float32x2_t v1028 = vmul_f32(v1139, v1026); - float32x2_t v1035 = vmul_f32(v1139, v1033); - float32x2_t v1042 = vmul_f32(v1139, v1040); - float32x2_t v1049 = vmul_f32(v1139, v1047); - float32x2_t v1056 = vmul_f32(v1139, v1054); - float32x2_t v1063 = vmul_f32(v1139, v1061); - float32x2_t v1070 = vmul_f32(v1139, v1068); - float32x2_t v1077 = vmul_f32(v1139, v1075); - float32x2_t v1084 = vmul_f32(v1139, v1082); - float32x2_t v1091 = vmul_f32(v1139, v1089); - float32x2_t v1098 = vmul_f32(v1139, v1096); - float32x2_t v1105 = vmul_f32(v1139, v1103); - float32x2_t v1112 = vmul_f32(v1139, 
v1110); - float32x2_t v1119 = vmul_f32(v1139, v1117); - float32x2_t v1126 = vmul_f32(v1139, v1124); - float32x2_t v1133 = vmul_f32(v1139, v1131); - float32x2_t v1140 = vmul_f32(v1139, v1138); - float32x2_t v762 = vadd_f32(v756, v761); - float32x2_t v763 = vsub_f32(v756, v761); - float32x2_t v774 = vadd_f32(v768, v773); - float32x2_t v775 = vsub_f32(v773, v768); - float32x2_t v786 = vadd_f32(v780, v785); - float32x2_t v787 = vsub_f32(v780, v785); - float32x2_t v798 = vadd_f32(v792, v797); - float32x2_t v799 = vsub_f32(v797, v792); - float32x2_t v810 = vadd_f32(v804, v809); - float32x2_t v811 = vsub_f32(v804, v809); - float32x2_t v822 = vadd_f32(v816, v821); - float32x2_t v823 = vsub_f32(v821, v816); - float32x2_t v834 = vadd_f32(v828, v833); - float32x2_t v835 = vsub_f32(v828, v833); - float32x2_t v846 = vadd_f32(v840, v845); - float32x2_t v847 = vsub_f32(v845, v840); - float32x2_t v858 = vadd_f32(v852, v857); - float32x2_t v859 = vsub_f32(v852, v857); - float32x2_t v860 = vsub_f32(v762, v834); - float32x2_t v861 = vsub_f32(v774, v846); - float32x2_t v862 = vsub_f32(v786, v858); - float32x2_t v863 = vsub_f32(v798, v834); - float32x2_t v864 = vsub_f32(v810, v846); - float32x2_t v865 = vsub_f32(v822, v858); - float32x2_t v866 = vadd_f32(v762, v798); - float32x2_t v868 = vadd_f32(v774, v810); - float32x2_t v870 = vadd_f32(v786, v822); - float32x2_t v898 = vsub_f32(v763, v835); - float32x2_t v899 = vsub_f32(v775, v847); - float32x2_t v900 = vsub_f32(v787, v859); - float32x2_t v901 = vsub_f32(v799, v835); - float32x2_t v902 = vsub_f32(v811, v847); - float32x2_t v903 = vsub_f32(v823, v859); - float32x2_t v904 = vadd_f32(v763, v799); - float32x2_t v906 = vadd_f32(v775, v811); - float32x2_t v908 = vadd_f32(v787, v823); - float32x2_t v867 = vadd_f32(v866, v834); - float32x2_t v869 = vadd_f32(v868, v846); - float32x2_t v871 = vadd_f32(v870, v858); - float32x2_t v872 = vadd_f32(v860, v862); - float32x2_t v873 = vadd_f32(v863, v865); - float32x2_t v888 = vsub_f32(v860, v863); - 
float32x2_t v889 = vsub_f32(v862, v865); - float32x2_t v905 = vadd_f32(v904, v835); - float32x2_t v907 = vadd_f32(v906, v847); - float32x2_t v909 = vadd_f32(v908, v859); - float32x2_t v910 = vadd_f32(v898, v900); - float32x2_t v911 = vadd_f32(v901, v903); - float32x2_t v920 = vsub_f32(v898, v901); - float32x2_t v921 = vsub_f32(v900, v903); - float32x2_t v965 = vmul_f32(v863, v964); - float32x2_t v977 = vmul_f32(v865, v976); - float32x2_t v985 = vmul_f32(v862, v984); - float32x2_t v1064 = vrev64_f32(v901); - float32x2_t v1078 = vrev64_f32(v898); - float32x2_t v1085 = vrev64_f32(v903); - float32x2_t v1099 = vrev64_f32(v900); - float32x2_t v874 = vadd_f32(v867, v869); - float32x2_t v882 = vadd_f32(v873, v864); - float32x2_t v883 = vadd_f32(v872, v861); - float32x2_t v885 = vsub_f32(v873, v864); - float32x2_t v886 = vsub_f32(v872, v861); - float32x2_t v890 = vsub_f32(v860, v889); - float32x2_t v892 = vadd_f32(v888, v865); - float32x2_t v895 = vsub_f32(v867, v871); - float32x2_t v896 = vsub_f32(v869, v871); - float32x2_t v912 = vadd_f32(v905, v907); - float32x2_t v914 = vadd_f32(v911, v902); - float32x2_t v915 = vadd_f32(v910, v899); - float32x2_t v917 = vsub_f32(v911, v902); - float32x2_t v918 = vsub_f32(v910, v899); - float32x2_t v922 = vsub_f32(v898, v921); - float32x2_t v924 = vadd_f32(v920, v903); - float32x2_t v927 = vsub_f32(v905, v909); - float32x2_t v928 = vsub_f32(v907, v909); - float32x2_t v969 = vmul_f32(v888, v968); - float32x2_t v981 = vmul_f32(v889, v980); - float32x2_t v1065 = vmul_f32(v1064, v1063); - float32x2_t v1071 = vrev64_f32(v920); - float32x2_t v1086 = vmul_f32(v1085, v1084); - float32x2_t v1092 = vrev64_f32(v921); - float32x2_t v1100 = vmul_f32(v1099, v1098); - float32x2_t v875 = vadd_f32(v874, v871); - float32x2_t v884 = vsub_f32(v883, v882); - float32x2_t v887 = vsub_f32(v886, v885); - float32x2_t v891 = vsub_f32(v890, v864); - float32x2_t v893 = vsub_f32(v892, v861); - float32x2_t v897 = vadd_f32(v895, v896); - float32x2_t v913 = 
vadd_f32(v912, v909); - float32x2_t v916 = vsub_f32(v915, v914); - float32x2_t v919 = vsub_f32(v918, v917); - float32x2_t v923 = vsub_f32(v922, v902); - float32x2_t v925 = vsub_f32(v924, v899); - float32x2_t v929 = vadd_f32(v927, v928); - float32x2_t v941 = vmul_f32(v882, v940); - float32x2_t v945 = vmul_f32(v883, v944); - float32x2_t v953 = vmul_f32(v885, v952); - float32x2_t v957 = vmul_f32(v886, v956); - float32x2_t v1001 = vmul_f32(v895, v1000); - float32x2_t v1005 = vmul_f32(v896, v1004); - float32x2_t v1022 = vrev64_f32(v914); - float32x2_t v1029 = vrev64_f32(v915); - float32x2_t v1043 = vrev64_f32(v917); - float32x2_t v1050 = vrev64_f32(v918); - float32x2_t v1072 = vmul_f32(v1071, v1070); - float32x2_t v1093 = vmul_f32(v1092, v1091); - float32x2_t v1127 = vrev64_f32(v927); - float32x2_t v1134 = vrev64_f32(v928); - float32x2_t v881 = vadd_f32(v880, v875); - float32x2_t v894 = vsub_f32(v891, v893); - float32x2_t v926 = vsub_f32(v923, v925); - float32x2_t v937 = vmul_f32(v875, v936); - float32x2_t v949 = vmul_f32(v884, v948); - float32x2_t v961 = vmul_f32(v887, v960); - float32x2_t v989 = vmul_f32(v891, v988); - float32x2_t v993 = vmul_f32(v893, v992); - float32x2_t v1009 = vmul_f32(v897, v1008); - float32x2_t v1015 = vrev64_f32(v913); - float32x2_t v1023 = vmul_f32(v1022, v1021); - float32x2_t v1030 = vmul_f32(v1029, v1028); - float32x2_t v1036 = vrev64_f32(v916); - float32x2_t v1044 = vmul_f32(v1043, v1042); - float32x2_t v1051 = vmul_f32(v1050, v1049); - float32x2_t v1057 = vrev64_f32(v919); - float32x2_t v1106 = vrev64_f32(v923); - float32x2_t v1113 = vrev64_f32(v925); - float32x2_t v1128 = vmul_f32(v1127, v1126); - float32x2_t v1135 = vmul_f32(v1134, v1133); - float32x2_t v1141 = vrev64_f32(v929); - float32x2_t v1143 = vadd_f32(v941, v945); - float32x2_t v1144 = vadd_f32(v953, v957); - float32x2_t v997 = vmul_f32(v894, v996); - float32x2_t v1016 = vmul_f32(v1015, v1014); - float32x2_t v1037 = vmul_f32(v1036, v1035); - float32x2_t v1058 = vmul_f32(v1057, 
v1056); - float32x2_t v1107 = vmul_f32(v1106, v1105); - float32x2_t v1114 = vmul_f32(v1113, v1112); - float32x2_t v1120 = vrev64_f32(v926); - float32x2_t v1142 = vmul_f32(v1141, v1140); - float32x2_t v1146 = vadd_f32(v1143, v1144); - float32x2_t v1147 = vadd_f32(v941, v949); - float32x2_t v1148 = vadd_f32(v953, v961); - float32x2_t v1165 = vsub_f32(v1143, v1144); - float32x2_t v1167 = vsub_f32(v1001, v1009); - float32x2_t v1168 = vsub_f32(v1005, v1009); - float32x2_t v1169 = vadd_f32(v937, v881); - float32x2_t v1174 = vadd_f32(v1023, v1030); - float32x2_t v1175 = vadd_f32(v1044, v1051); - int16x4_t v1230 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v881, 15), (int32x2_t){0, 0})); - float32x2_t v1121 = vmul_f32(v1120, v1119); - float32x2_t v1145 = vadd_f32(v993, v997); - float32x2_t v1149 = vadd_f32(v989, v997); - float32x2_t v1150 = vsub_f32(v965, v1146); - float32x2_t v1151 = vadd_f32(v1147, v1148); - float32x2_t v1157 = vsub_f32(v1147, v1148); - float32x2_t v1162 = vadd_f32(v1146, v985); - float32x2_t v1170 = vadd_f32(v1169, v1167); - float32x2_t v1171 = vsub_f32(v1169, v1167); - float32x2_t v1173 = vadd_f32(v1169, v1168); - float32x2_t v1177 = vadd_f32(v1174, v1175); - float32x2_t v1178 = vadd_f32(v1023, v1037); - float32x2_t v1179 = vadd_f32(v1044, v1058); - float32x2_t v1196 = vsub_f32(v1174, v1175); - float32x2_t v1198 = vsub_f32(v1128, v1142); - float32x2_t v1199 = vsub_f32(v1135, v1142); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1230), 0); - float32x2_t v1152 = vsub_f32(v977, v1149); - float32x2_t v1153 = vadd_f32(v969, v1145); - float32x2_t v1155 = vadd_f32(v1151, v981); - float32x2_t v1158 = vadd_f32(v1157, v1145); - float32x2_t v1159 = vadd_f32(v1150, v1151); - float32x2_t v1166 = vadd_f32(v1165, v1149); - float32x2_t v1172 = vsub_f32(v1171, v1168); - float32x2_t v1176 = vadd_f32(v1114, v1121); - float32x2_t v1180 = vadd_f32(v1107, v1121); - float32x2_t v1181 = vsub_f32(v1065, v1177); - float32x2_t v1182 = vadd_f32(v1178, v1179); - float32x2_t v1188 = 
vsub_f32(v1178, v1179); - float32x2_t v1193 = vadd_f32(v1177, v1100); - float32x2_t v1200 = vadd_f32(v1016, v1198); - float32x2_t v1201 = vsub_f32(v1016, v1198); - float32x2_t v1203 = vadd_f32(v1016, v1199); - float32x2_t v1154 = vadd_f32(v1153, v1150); - float32x2_t v1156 = vadd_f32(v1155, v1152); - float32x2_t v1160 = vfma_f32(v1159, v860, v972); - float32x2_t v1163 = vadd_f32(v1162, v1152); - float32x2_t v1183 = vsub_f32(v1086, v1180); - float32x2_t v1184 = vadd_f32(v1072, v1176); - float32x2_t v1186 = vadd_f32(v1182, v1093); - float32x2_t v1189 = vadd_f32(v1188, v1176); - float32x2_t v1190 = vadd_f32(v1181, v1182); - float32x2_t v1197 = vadd_f32(v1196, v1180); - float32x2_t v1202 = vsub_f32(v1201, v1199); - float32x2_t v1208 = vsub_f32(v1166, v1158); - float32x2_t v1212 = vsub_f32(v1173, v1166); - float32x2_t v1215 = vadd_f32(v1158, v1173); - float32x2_t v1161 = vadd_f32(v1160, v1149); - float32x2_t v1164 = vadd_f32(v1163, v1145); - float32x2_t v1185 = vadd_f32(v1184, v1181); - float32x2_t v1187 = vadd_f32(v1186, v1183); - float32x2_t v1191 = vfma_f32(v1190, v1078, v1077); - float32x2_t v1194 = vadd_f32(v1193, v1183); - float32x2_t v1209 = vadd_f32(v1208, v1173); - float32x2_t v1213 = vadd_f32(v1154, v1170); - float32x2_t v1214 = vadd_f32(v1156, v1172); - float32x2_t v1220 = vsub_f32(v1197, v1189); - float32x2_t v1224 = vsub_f32(v1197, v1203); - float32x2_t v1227 = vadd_f32(v1189, v1203); - float32x2_t v1192 = vadd_f32(v1191, v1180); - float32x2_t v1195 = vadd_f32(v1194, v1176); - float32x2_t v1204 = vsub_f32(v1161, v1154); - float32x2_t v1206 = vsub_f32(v1164, v1156); - float32x2_t v1210 = vsub_f32(v1170, v1161); - float32x2_t v1211 = vsub_f32(v1172, v1164); - float32x2_t v1221 = vadd_f32(v1220, v1203); - float32x2_t v1225 = vadd_f32(v1185, v1200); - float32x2_t v1226 = vadd_f32(v1187, v1202); - float32x2_t v1248 = vsub_f32(v1215, v1227); - float32x2_t v1255 = vadd_f32(v1215, v1227); - float32x2_t v1262 = vadd_f32(v1212, v1224); - float32x2_t v1269 = 
vsub_f32(v1212, v1224); - float32x2_t v1205 = vadd_f32(v1204, v1170); - float32x2_t v1207 = vadd_f32(v1206, v1172); - float32x2_t v1216 = vsub_f32(v1192, v1185); - float32x2_t v1218 = vsub_f32(v1195, v1187); - float32x2_t v1222 = vsub_f32(v1200, v1192); - float32x2_t v1223 = vsub_f32(v1202, v1195); - int16x4_t v1251 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1248, 15), (int32x2_t){0, 0})); - int16x4_t v1258 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1255, 15), (int32x2_t){0, 0})); - int16x4_t v1265 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1262, 15), (int32x2_t){0, 0})); - int16x4_t v1272 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1269, 15), (int32x2_t){0, 0})); - float32x2_t v1276 = vadd_f32(v1214, v1226); - float32x2_t v1283 = vsub_f32(v1214, v1226); - float32x2_t v1290 = vadd_f32(v1209, v1221); - float32x2_t v1297 = vsub_f32(v1209, v1221); - float32x2_t v1332 = vsub_f32(v1213, v1225); - float32x2_t v1339 = vadd_f32(v1213, v1225); - float32x2_t v1217 = vadd_f32(v1216, v1200); - float32x2_t v1219 = vadd_f32(v1218, v1202); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1251), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1258), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1265), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1272), 0); - int16x4_t v1279 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1276, 15), (int32x2_t){0, 0})); - int16x4_t v1286 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1283, 15), (int32x2_t){0, 0})); - int16x4_t v1293 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1290, 15), (int32x2_t){0, 0})); - int16x4_t v1300 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1297, 15), (int32x2_t){0, 0})); - float32x2_t v1304 = vadd_f32(v1211, v1223); - float32x2_t v1311 = vsub_f32(v1211, v1223); - float32x2_t v1318 = vadd_f32(v1210, v1222); - float32x2_t v1325 = vsub_f32(v1210, v1222); - int16x4_t v1335 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1332, 15), (int32x2_t){0, 0})); - int16x4_t v1342 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1339, 15), (int32x2_t){0, 0})); - float32x2_t v1234 = vadd_f32(v1205, v1217); - float32x2_t v1241 = vsub_f32(v1205, v1217); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1279), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1286), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1293), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1300), 0); - int16x4_t v1307 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1304, 15), (int32x2_t){0, 0})); - int16x4_t v1314 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1311, 15), (int32x2_t){0, 0})); - int16x4_t v1321 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1318, 15), (int32x2_t){0, 0})); - int16x4_t v1328 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1325, 15), (int32x2_t){0, 0})); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1335), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1342), 0); - float32x2_t v1346 = vadd_f32(v1207, v1219); - float32x2_t v1353 = vsub_f32(v1207, v1219); - int16x4_t v1237 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1234, 15), (int32x2_t){0, 0})); - int16x4_t v1244 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1241, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1307), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1314), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1321), 0); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1328), 0); - int16x4_t v1349 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1346, 15), (int32x2_t){0, 0})); - int16x4_t v1356 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1353, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1237), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1244), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1349), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1356), 0); + float32x2_t v402 = (float32x2_t){v400, v401}; + 
float32x2_t v25 = v5[istride * 18]; + float32x2_t v32 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 17]; + float32x2_t v44 = v5[istride * 4]; + float32x2_t v49 = v5[istride * 15]; + float32x2_t v56 = v5[istride * 8]; + float32x2_t v61 = v5[istride * 11]; + float32x2_t v68 = v5[istride * 16]; + float32x2_t v73 = v5[istride * 3]; + float32x2_t v80 = v5[istride * 13]; + float32x2_t v85 = v5[istride * 6]; + float32x2_t v92 = v5[istride * 7]; + float32x2_t v97 = v5[istride * 12]; + float32x2_t v104 = v5[istride * 14]; + float32x2_t v109 = v5[istride * 5]; + float32x2_t v116 = v5[istride * 9]; + float32x2_t v121 = v5[istride * 10]; + float32x2_t v278 = vmul_f32(v403, v276); + float32x2_t v285 = vmul_f32(v403, v283); + float32x2_t v292 = vmul_f32(v403, v290); + float32x2_t v299 = vmul_f32(v403, v297); + float32x2_t v306 = vmul_f32(v403, v304); + float32x2_t v313 = vmul_f32(v403, v311); + float32x2_t v320 = vmul_f32(v403, v318); + float32x2_t v327 = vmul_f32(v403, v325); + float32x2_t v334 = vmul_f32(v403, v332); + float32x2_t v341 = vmul_f32(v403, v339); + float32x2_t v348 = vmul_f32(v403, v346); + float32x2_t v355 = vmul_f32(v403, v353); + float32x2_t v362 = vmul_f32(v403, v360); + float32x2_t v369 = vmul_f32(v403, v367); + float32x2_t v376 = vmul_f32(v403, v374); + float32x2_t v383 = vmul_f32(v403, v381); + float32x2_t v390 = vmul_f32(v403, v388); + float32x2_t v397 = vmul_f32(v403, v395); + float32x2_t v404 = vmul_f32(v403, v402); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v37, v32); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v61, v56); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v68, v73); + float32x2_t v86 = vadd_f32(v80, v85); + float32x2_t v87 = vsub_f32(v85, v80); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = 
vsub_f32(v92, v97); + float32x2_t v110 = vadd_f32(v104, v109); + float32x2_t v111 = vsub_f32(v109, v104); + float32x2_t v122 = vadd_f32(v116, v121); + float32x2_t v123 = vsub_f32(v116, v121); + float32x2_t v124 = vsub_f32(v26, v98); + float32x2_t v125 = vsub_f32(v38, v110); + float32x2_t v126 = vsub_f32(v50, v122); + float32x2_t v127 = vsub_f32(v62, v98); + float32x2_t v128 = vsub_f32(v74, v110); + float32x2_t v129 = vsub_f32(v86, v122); + float32x2_t v130 = vadd_f32(v26, v62); + float32x2_t v132 = vadd_f32(v38, v74); + float32x2_t v134 = vadd_f32(v50, v86); + float32x2_t v162 = vsub_f32(v27, v99); + float32x2_t v163 = vsub_f32(v39, v111); + float32x2_t v164 = vsub_f32(v51, v123); + float32x2_t v165 = vsub_f32(v63, v99); + float32x2_t v166 = vsub_f32(v75, v111); + float32x2_t v167 = vsub_f32(v87, v123); + float32x2_t v168 = vadd_f32(v27, v63); + float32x2_t v170 = vadd_f32(v39, v75); + float32x2_t v172 = vadd_f32(v51, v87); + float32x2_t v131 = vadd_f32(v130, v98); + float32x2_t v133 = vadd_f32(v132, v110); + float32x2_t v135 = vadd_f32(v134, v122); + float32x2_t v136 = vadd_f32(v124, v126); + float32x2_t v137 = vadd_f32(v127, v129); + float32x2_t v152 = vsub_f32(v124, v127); + float32x2_t v153 = vsub_f32(v126, v129); + float32x2_t v169 = vadd_f32(v168, v99); + float32x2_t v171 = vadd_f32(v170, v111); + float32x2_t v173 = vadd_f32(v172, v123); + float32x2_t v174 = vadd_f32(v162, v164); + float32x2_t v175 = vadd_f32(v165, v167); + float32x2_t v184 = vsub_f32(v162, v165); + float32x2_t v185 = vsub_f32(v164, v167); + float32x2_t v229 = vmul_f32(v127, v228); + float32x2_t v241 = vmul_f32(v129, v240); + float32x2_t v249 = vmul_f32(v126, v248); + float32x2_t v328 = vrev64_f32(v165); + float32x2_t v342 = vrev64_f32(v162); + float32x2_t v349 = vrev64_f32(v167); + float32x2_t v363 = vrev64_f32(v164); + float32x2_t v138 = vadd_f32(v131, v133); + float32x2_t v146 = vadd_f32(v137, v128); + float32x2_t v147 = vadd_f32(v136, v125); + float32x2_t v149 = vsub_f32(v137, v128); + 
float32x2_t v150 = vsub_f32(v136, v125); + float32x2_t v154 = vsub_f32(v124, v153); + float32x2_t v156 = vadd_f32(v152, v129); + float32x2_t v159 = vsub_f32(v131, v135); + float32x2_t v160 = vsub_f32(v133, v135); + float32x2_t v176 = vadd_f32(v169, v171); + float32x2_t v178 = vadd_f32(v175, v166); + float32x2_t v179 = vadd_f32(v174, v163); + float32x2_t v181 = vsub_f32(v175, v166); + float32x2_t v182 = vsub_f32(v174, v163); + float32x2_t v186 = vsub_f32(v162, v185); + float32x2_t v188 = vadd_f32(v184, v167); + float32x2_t v191 = vsub_f32(v169, v173); + float32x2_t v192 = vsub_f32(v171, v173); + float32x2_t v233 = vmul_f32(v152, v232); + float32x2_t v245 = vmul_f32(v153, v244); + float32x2_t v329 = vmul_f32(v328, v327); + float32x2_t v335 = vrev64_f32(v184); + float32x2_t v350 = vmul_f32(v349, v348); + float32x2_t v356 = vrev64_f32(v185); + float32x2_t v364 = vmul_f32(v363, v362); + float32x2_t v139 = vadd_f32(v138, v135); + float32x2_t v148 = vsub_f32(v147, v146); + float32x2_t v151 = vsub_f32(v150, v149); + float32x2_t v155 = vsub_f32(v154, v128); + float32x2_t v157 = vsub_f32(v156, v125); + float32x2_t v161 = vadd_f32(v159, v160); + float32x2_t v177 = vadd_f32(v176, v173); + float32x2_t v180 = vsub_f32(v179, v178); + float32x2_t v183 = vsub_f32(v182, v181); + float32x2_t v187 = vsub_f32(v186, v166); + float32x2_t v189 = vsub_f32(v188, v163); + float32x2_t v193 = vadd_f32(v191, v192); + float32x2_t v205 = vmul_f32(v146, v204); + float32x2_t v209 = vmul_f32(v147, v208); + float32x2_t v217 = vmul_f32(v149, v216); + float32x2_t v221 = vmul_f32(v150, v220); + float32x2_t v265 = vmul_f32(v159, v264); + float32x2_t v269 = vmul_f32(v160, v268); + float32x2_t v286 = vrev64_f32(v178); + float32x2_t v293 = vrev64_f32(v179); + float32x2_t v307 = vrev64_f32(v181); + float32x2_t v314 = vrev64_f32(v182); + float32x2_t v336 = vmul_f32(v335, v334); + float32x2_t v357 = vmul_f32(v356, v355); + float32x2_t v391 = vrev64_f32(v191); + float32x2_t v398 = vrev64_f32(v192); + 
float32x2_t v145 = vadd_f32(v144, v139); + float32x2_t v158 = vsub_f32(v155, v157); + float32x2_t v190 = vsub_f32(v187, v189); + float32x2_t v201 = vmul_f32(v139, v200); + float32x2_t v213 = vmul_f32(v148, v212); + float32x2_t v225 = vmul_f32(v151, v224); + float32x2_t v253 = vmul_f32(v155, v252); + float32x2_t v257 = vmul_f32(v157, v256); + float32x2_t v273 = vmul_f32(v161, v272); + float32x2_t v279 = vrev64_f32(v177); + float32x2_t v287 = vmul_f32(v286, v285); + float32x2_t v294 = vmul_f32(v293, v292); + float32x2_t v300 = vrev64_f32(v180); + float32x2_t v308 = vmul_f32(v307, v306); + float32x2_t v315 = vmul_f32(v314, v313); + float32x2_t v321 = vrev64_f32(v183); + float32x2_t v370 = vrev64_f32(v187); + float32x2_t v377 = vrev64_f32(v189); + float32x2_t v392 = vmul_f32(v391, v390); + float32x2_t v399 = vmul_f32(v398, v397); + float32x2_t v405 = vrev64_f32(v193); + float32x2_t v407 = vadd_f32(v205, v209); + float32x2_t v408 = vadd_f32(v217, v221); + float32x2_t v261 = vmul_f32(v158, v260); + float32x2_t v280 = vmul_f32(v279, v278); + float32x2_t v301 = vmul_f32(v300, v299); + float32x2_t v322 = vmul_f32(v321, v320); + float32x2_t v371 = vmul_f32(v370, v369); + float32x2_t v378 = vmul_f32(v377, v376); + float32x2_t v384 = vrev64_f32(v190); + float32x2_t v406 = vmul_f32(v405, v404); + float32x2_t v410 = vadd_f32(v407, v408); + float32x2_t v411 = vadd_f32(v205, v213); + float32x2_t v412 = vadd_f32(v217, v225); + float32x2_t v429 = vsub_f32(v407, v408); + float32x2_t v431 = vsub_f32(v265, v273); + float32x2_t v432 = vsub_f32(v269, v273); + float32x2_t v433 = vadd_f32(v201, v145); + float32x2_t v438 = vadd_f32(v287, v294); + float32x2_t v439 = vadd_f32(v308, v315); + int16x4_t v494 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v145, 15), (int32x2_t){0, 0})); + float32x2_t v385 = vmul_f32(v384, v383); + float32x2_t v409 = vadd_f32(v257, v261); + float32x2_t v413 = vadd_f32(v253, v261); + float32x2_t v414 = vsub_f32(v229, v410); + float32x2_t v415 = vadd_f32(v411, v412); + 
float32x2_t v421 = vsub_f32(v411, v412); + float32x2_t v426 = vadd_f32(v410, v249); + float32x2_t v434 = vadd_f32(v433, v431); + float32x2_t v435 = vsub_f32(v433, v431); + float32x2_t v437 = vadd_f32(v433, v432); + float32x2_t v441 = vadd_f32(v438, v439); + float32x2_t v442 = vadd_f32(v287, v301); + float32x2_t v443 = vadd_f32(v308, v322); + float32x2_t v460 = vsub_f32(v438, v439); + float32x2_t v462 = vsub_f32(v392, v406); + float32x2_t v463 = vsub_f32(v399, v406); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v494), 0); + float32x2_t v416 = vsub_f32(v241, v413); + float32x2_t v417 = vadd_f32(v233, v409); + float32x2_t v419 = vadd_f32(v415, v245); + float32x2_t v422 = vadd_f32(v421, v409); + float32x2_t v423 = vadd_f32(v414, v415); + float32x2_t v430 = vadd_f32(v429, v413); + float32x2_t v436 = vsub_f32(v435, v432); + float32x2_t v440 = vadd_f32(v378, v385); + float32x2_t v444 = vadd_f32(v371, v385); + float32x2_t v445 = vsub_f32(v329, v441); + float32x2_t v446 = vadd_f32(v442, v443); + float32x2_t v452 = vsub_f32(v442, v443); + float32x2_t v457 = vadd_f32(v441, v364); + float32x2_t v464 = vadd_f32(v280, v462); + float32x2_t v465 = vsub_f32(v280, v462); + float32x2_t v467 = vadd_f32(v280, v463); + float32x2_t v418 = vadd_f32(v417, v414); + float32x2_t v420 = vadd_f32(v419, v416); + float32x2_t v424 = vfma_f32(v423, v124, v236); + float32x2_t v427 = vadd_f32(v426, v416); + float32x2_t v447 = vsub_f32(v350, v444); + float32x2_t v448 = vadd_f32(v336, v440); + float32x2_t v450 = vadd_f32(v446, v357); + float32x2_t v453 = vadd_f32(v452, v440); + float32x2_t v454 = vadd_f32(v445, v446); + float32x2_t v461 = vadd_f32(v460, v444); + float32x2_t v466 = vsub_f32(v465, v463); + float32x2_t v472 = vsub_f32(v430, v422); + float32x2_t v476 = vsub_f32(v437, v430); + float32x2_t v479 = vadd_f32(v422, v437); + float32x2_t v425 = vadd_f32(v424, v413); + float32x2_t v428 = vadd_f32(v427, v409); + float32x2_t v449 = vadd_f32(v448, v445); + float32x2_t v451 = vadd_f32(v450, v447); + 
float32x2_t v455 = vfma_f32(v454, v342, v341); + float32x2_t v458 = vadd_f32(v457, v447); + float32x2_t v473 = vadd_f32(v472, v437); + float32x2_t v477 = vadd_f32(v418, v434); + float32x2_t v478 = vadd_f32(v420, v436); + float32x2_t v484 = vsub_f32(v461, v453); + float32x2_t v488 = vsub_f32(v461, v467); + float32x2_t v491 = vadd_f32(v453, v467); + float32x2_t v456 = vadd_f32(v455, v444); + float32x2_t v459 = vadd_f32(v458, v440); + float32x2_t v468 = vsub_f32(v425, v418); + float32x2_t v470 = vsub_f32(v428, v420); + float32x2_t v474 = vsub_f32(v434, v425); + float32x2_t v475 = vsub_f32(v436, v428); + float32x2_t v485 = vadd_f32(v484, v467); + float32x2_t v489 = vadd_f32(v449, v464); + float32x2_t v490 = vadd_f32(v451, v466); + float32x2_t v512 = vsub_f32(v479, v491); + float32x2_t v519 = vadd_f32(v479, v491); + float32x2_t v526 = vadd_f32(v476, v488); + float32x2_t v533 = vsub_f32(v476, v488); + float32x2_t v469 = vadd_f32(v468, v434); + float32x2_t v471 = vadd_f32(v470, v436); + float32x2_t v480 = vsub_f32(v456, v449); + float32x2_t v482 = vsub_f32(v459, v451); + float32x2_t v486 = vsub_f32(v464, v456); + float32x2_t v487 = vsub_f32(v466, v459); + int16x4_t v515 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v512, 15), (int32x2_t){0, 0})); + int16x4_t v522 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v519, 15), (int32x2_t){0, 0})); + int16x4_t v529 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v526, 15), (int32x2_t){0, 0})); + int16x4_t v536 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v533, 15), (int32x2_t){0, 0})); + float32x2_t v540 = vadd_f32(v478, v490); + float32x2_t v547 = vsub_f32(v478, v490); + float32x2_t v554 = vadd_f32(v473, v485); + float32x2_t v561 = vsub_f32(v473, v485); + float32x2_t v596 = vsub_f32(v477, v489); + float32x2_t v603 = vadd_f32(v477, v489); + float32x2_t v481 = vadd_f32(v480, v464); + float32x2_t v483 = vadd_f32(v482, v466); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v515), 0); + v6[ostride * 17] = 
vget_lane_s32(vreinterpret_s32_s16(v522), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v529), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v536), 0); + int16x4_t v543 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v540, 15), (int32x2_t){0, 0})); + int16x4_t v550 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v547, 15), (int32x2_t){0, 0})); + int16x4_t v557 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v554, 15), (int32x2_t){0, 0})); + int16x4_t v564 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v561, 15), (int32x2_t){0, 0})); + float32x2_t v568 = vadd_f32(v475, v487); + float32x2_t v575 = vsub_f32(v475, v487); + float32x2_t v582 = vadd_f32(v474, v486); + float32x2_t v589 = vsub_f32(v474, v486); + int16x4_t v599 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v596, 15), (int32x2_t){0, 0})); + int16x4_t v606 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v603, 15), (int32x2_t){0, 0})); + float32x2_t v498 = vadd_f32(v469, v481); + float32x2_t v505 = vsub_f32(v469, v481); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v543), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v550), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v557), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v564), 0); + int16x4_t v571 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v568, 15), (int32x2_t){0, 0})); + int16x4_t v578 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v575, 15), (int32x2_t){0, 0})); + int16x4_t v585 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v582, 15), (int32x2_t){0, 0})); + int16x4_t v592 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v589, 15), (int32x2_t){0, 0})); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v599), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v606), 0); + float32x2_t v610 = vadd_f32(v471, v483); + float32x2_t v617 = vsub_f32(v471, v483); + int16x4_t v501 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v498, 15), (int32x2_t){0, 0})); + int16x4_t v508 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v505, 
15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v571), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v578), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v585), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v592), 0); + int16x4_t v613 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v610, 15), (int32x2_t){0, 0})); + int16x4_t v620 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v617, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v501), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v508), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v613), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v620), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -11488,234 +7234,130 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v872)[0])); svfloat32_t v1140 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v881)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1106), "w"(v1108)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1106), "w"(v1108)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1110), "w"(v1112)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1112), "w"(v1110)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v1114), "w"(v1116)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v1114), "w"(v1116)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1118), "w"(v1120)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1120), "w"(v1118)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v1122), "w"(v1124)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v1122), "w"(v1124)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v1126), 
"w"(v1128)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v1128), "w"(v1126)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1130), "w"(v1132)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1130), "w"(v1132)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v1134), "w"(v1136)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v1136), "w"(v1134)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v1138), "w"(v1140)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v1138), "w"(v1140)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v32), "w"(v128)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v48), "w"(v144)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v64), "w"(v160)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v80), "w"(v128)); - svfloat32_t v166; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v96), "w"(v144)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v112), "w"(v160)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v32), "w"(v80)); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v48), "w"(v96)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v64), "w"(v112)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v33), "w"(v129)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v49), "w"(v145)); - svfloat32_t v204; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v65), "w"(v161)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v81), "w"(v129)); - svfloat32_t v206; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v97), "w"(v145)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v113), "w"(v161)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v208) : "w"(v33), "w"(v81)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v49), "w"(v97)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v65), "w"(v113)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v168), "w"(v128)); - svfloat32_t v171; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v170), "w"(v144)); - svfloat32_t v173; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v172), "w"(v160)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v162), "w"(v164)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v165), "w"(v167)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v162), "w"(v165)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v164), "w"(v167)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v129)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v210), "w"(v145)); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v212), "w"(v161)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v202), "w"(v204)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v205), "w"(v207)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v202), "w"(v205)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v204), "w"(v207)); - svfloat32_t zero389; - asm volatile("mov %0.s, #0" : "=w"(zero389)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1106, v1108); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1106, v1108); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1110, v1112); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1112, v1110); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v1114, v1116); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v1114, v1116); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1118, v1120); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1120, 
v1118); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v1122, v1124); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v1122, v1124); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v1126, v1128); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v1128, v1126); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1130, v1132); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1130, v1132); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v1134, v1136); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v1136, v1134); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v1138, v1140); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v1138, v1140); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v48, v144); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v64, v160); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v80, v128); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v112, v160); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v32, v80); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v48, v96); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v64, v112); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v49, v145); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v81, v129); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v113, v161); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v33, v81); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v49, v97); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v65, v113); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v168, v128); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v170, v144); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v172, v160); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v175 = 
svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v162, v165); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v164, v167); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v129); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v210, v145); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v212, v161); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v202, v204); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v205, v207); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v202, v205); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v204, v207); + svfloat32_t zero389 = svdup_n_f32(0); svfloat32_t v389 = svcmla_f32_x(pred_full, zero389, v921, v205, 90); - svfloat32_t zero410; - asm volatile("mov %0.s, #0" : "=w"(zero410)); + svfloat32_t zero410 = svdup_n_f32(0); svfloat32_t v410 = svcmla_f32_x(pred_full, zero410, v924, v207, 90); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v169), "w"(v171)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v175), "w"(v166)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v174), "w"(v163)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v175), "w"(v166)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v174), "w"(v163)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v162), "w"(v193)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v192), "w"(v167)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v169), "w"(v173)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v171), "w"(v173)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v209), "w"(v211)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v215), "w"(v206)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v214), "w"(v203)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : 
"w"(v215), "w"(v206)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v214), "w"(v203)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v202), "w"(v225)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v224), "w"(v207)); - svfloat32_t v231; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v209), "w"(v213)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v211), "w"(v213)); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v176), "w"(v173)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v187), "w"(v186)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v189)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v166)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v196), "w"(v163)); - svfloat32_t v201; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v199), "w"(v200)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v216), "w"(v213)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v219), "w"(v218)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v222), "w"(v221)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v226), "w"(v206)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v228), "w"(v203)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v231), "w"(v232)); - svfloat32_t v253; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v187), "w"(v897)); - svfloat32_t v268; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v190), "w"(v900)); - svfloat32_t zero347; - asm volatile("mov %0.s, #0" : "=w"(zero347)); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v169, v171); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v175, v166); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v174, v163); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), 
v175, v166); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v174, v163); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v162, v193); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v192, v167); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v169, v173); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v171, v173); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v209, v211); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v215, v206); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v214, v203); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v215, v206); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v214, v203); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v202, v225); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v224, v207); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v209, v213); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v211, v213); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v176, v173); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v187, v186); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v190, v189); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v194, v166); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v196, v163); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v216, v213); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v219, v218); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v222, v221); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v226, v206); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v228, v203); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v231, v232); + svfloat32_t v253 = svmul_f32_x(svptrue_b32(), v187, v897); + svfloat32_t v268 = svmul_f32_x(svptrue_b32(), v190, v900); + svfloat32_t zero347 = svdup_n_f32(0); svfloat32_t v347 = svcmla_f32_x(pred_full, zero347, v915, v218, 90); - svfloat32_t zero368; - asm volatile("mov %0.s, #0" : "=w"(zero368)); + svfloat32_t zero368 = svdup_n_f32(0); svfloat32_t v368 = svcmla_f32_x(pred_full, zero368, v918, 
v221, 90); - svfloat32_t zero452; - asm volatile("mov %0.s, #0" : "=w"(zero452)); + svfloat32_t zero452 = svdup_n_f32(0); svfloat32_t v452 = svcmla_f32_x(pred_full, zero452, v930, v231, 90); - svfloat32_t zero459; - asm volatile("mov %0.s, #0" : "=w"(zero459)); + svfloat32_t zero459 = svdup_n_f32(0); svfloat32_t v459 = svcmla_f32_x(pred_full, zero459, v931, v232, 90); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v1142), "w"(v177)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v195), "w"(v197)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v227), "w"(v229)); - svfloat32_t v258; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v188), "w"(v898)); - svfloat32_t v273; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v191), "w"(v901)); - svfloat32_t v333; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v201), "w"(v913)); - svfloat32_t zero340; - asm volatile("mov %0.s, #0" : "=w"(zero340)); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v1142, v177); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v227, v229); + svfloat32_t v258 = svmul_f32_x(svptrue_b32(), v188, v898); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v191, v901); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v201, v913); + svfloat32_t zero340 = svdup_n_f32(0); svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v914, v217, 90); - svfloat32_t zero466; - asm volatile("mov %0.s, #0" : "=w"(zero466)); + svfloat32_t zero466 = svdup_n_f32(0); svfloat32_t v466 = svcmla_f32_x(pred_full, zero466, v932, v233, 90); svfloat32_t v467 = svmla_f32_x(pred_full, v253, v186, v896); svfloat32_t v468 = svmla_f32_x(pred_full, v268, v189, v899); svfloat32_t v498 = svcmla_f32_x(pred_full, v347, v916, v219, 90); svfloat32_t v499 = svcmla_f32_x(pred_full, v368, v919, v222, 90); - svfloat32_t v318; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v198), "w"(v910)); - svfloat32_t zero445; - asm 
volatile("mov %0.s, #0" : "=w"(zero445)); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v198, v910); + svfloat32_t zero445 = svdup_n_f32(0); svfloat32_t v445 = svcmla_f32_x(pred_full, zero445, v929, v230, 90); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v467), "w"(v468)); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v467, v468); svfloat32_t v471 = svmla_f32_x(pred_full, v258, v186, v896); svfloat32_t v472 = svmla_f32_x(pred_full, v273, v189, v899); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v467), "w"(v468)); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v467, v468); svfloat32_t v491 = svnmls_f32_x(pred_full, v333, v199, v911); svfloat32_t v492 = svnmls_f32_x(pred_full, v333, v200, v912); svfloat32_t v493 = svmla_f32_x(pred_full, v185, v177, v895); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v498), "w"(v499)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v498, v499); svfloat32_t v502 = svcmla_f32_x(pred_full, v347, v917, v220, 90); svfloat32_t v503 = svcmla_f32_x(pred_full, v368, v920, v223, 90); - svfloat32_t v520; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v498), "w"(v499)); - svfloat32_t v522; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v452), "w"(v466)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v459), "w"(v466)); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v498, v499); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v452, v466); + svfloat32_t v523 = svsub_f32_x(svptrue_b32(), v459, v466); svint16_t v554 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v185, (float)(1ULL << 31ULL)))), @@ -11724,126 +7366,72 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, svfloat32_t v469 = svmla_f32_x(pred_full, v318, v197, v909); svfloat32_t v473 = svmla_f32_x(pred_full, v318, v195, v908); svfloat32_t v474 = svnmls_f32_x(pred_full, v470, v165, v902); - svfloat32_t v475; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v471), "w"(v472)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v471), "w"(v472)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v471, v472); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v471, v472); svfloat32_t v486 = svmla_f32_x(pred_full, v470, v164, v907); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v493), "w"(v491)); - svfloat32_t v495; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v493), "w"(v491)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v493), "w"(v492)); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v493, v491); + svfloat32_t v495 = svsub_f32_x(svptrue_b32(), v493, v491); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v493, v492); svfloat32_t v500 = svcmla_f32_x(pred_full, v445, v928, v229, 90); svfloat32_t v504 = svcmla_f32_x(pred_full, v445, v927, v227, 90); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v389), "w"(v501)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v502), "w"(v503)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v502), "w"(v503)); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v389, v501); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v502, v503); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v502, v503); svfloat32_t v517 = svcmla_f32_x(pred_full, v501, v926, v204, 90); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v340), "w"(v522)); - svfloat32_t v525; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v340), "w"(v522)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v340), "w"(v523)); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v340, v522); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v340, v522); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v340, v523); svst1w_u64(pred_full, (unsigned *)(v940), svreinterpret_u64_s16(v554)); svfloat32_t v476 = svnmls_f32_x(pred_full, v473, 
v167, v905); svfloat32_t v477 = svmla_f32_x(pred_full, v469, v192, v903); svfloat32_t v479 = svmla_f32_x(pred_full, v475, v193, v906); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v481), "w"(v469)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v474), "w"(v475)); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v489), "w"(v473)); - svfloat32_t v496; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v492)); - svfloat32_t v507; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v410), "w"(v504)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v481, v469); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v474, v475); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v489, v473); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v495, v492); + svfloat32_t v507 = svsub_f32_x(svptrue_b32(), v410, v504); svfloat32_t v508 = svcmla_f32_x(pred_full, v500, v922, v224, 90); svfloat32_t v510 = svcmla_f32_x(pred_full, v506, v925, v225, 90); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v500)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v505), "w"(v506)); - svfloat32_t v521; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v521) : "w"(v520), "w"(v504)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v525), "w"(v523)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v477), "w"(v474)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v479), "w"(v476)); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v500); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v505, v506); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v520, v504); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v525, v523); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v477, v474); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v479, v476); svfloat32_t v484 = svmla_f32_x(pred_full, v483, v162, v904); - svfloat32_t v487; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v487) : "w"(v486), "w"(v476)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v508), "w"(v505)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v510), "w"(v507)); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v486, v476); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v505); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v510, v507); svfloat32_t v515 = svcmla_f32_x(pred_full, v514, v923, v202, 90); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v517), "w"(v507)); - svfloat32_t v532; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v532) : "w"(v490), "w"(v482)); - svfloat32_t v536; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v497), "w"(v490)); - svfloat32_t v539; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v482), "w"(v497)); - svfloat32_t v544; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v544) : "w"(v521), "w"(v513)); - svfloat32_t v548; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v521), "w"(v527)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v513), "w"(v527)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v484), "w"(v473)); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v487), "w"(v469)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v515), "w"(v504)); - svfloat32_t v519; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v518), "w"(v500)); - svfloat32_t v533; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v533) : "w"(v532), "w"(v497)); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v478), "w"(v494)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v480), "w"(v496)); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v544), "w"(v527)); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v509), "w"(v524)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v511), "w"(v526)); - svfloat32_t v578; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v539), "w"(v551)); - svfloat32_t v587; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v539), "w"(v551)); - svfloat32_t v596; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v536), "w"(v548)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v536), "w"(v548)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v485), "w"(v478)); - svfloat32_t v530; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v488), "w"(v480)); - svfloat32_t v534; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v534) : "w"(v494), "w"(v485)); - svfloat32_t v535; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v496), "w"(v488)); - svfloat32_t v540; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v516), "w"(v509)); - svfloat32_t v542; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v542) : "w"(v519), "w"(v511)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v524), "w"(v516)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v526), "w"(v519)); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v517, v507); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v490, v482); + svfloat32_t v536 = svsub_f32_x(svptrue_b32(), v497, v490); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v482, v497); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v521, v513); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v521, v527); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v513, v527); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v484, v473); + svfloat32_t v488 = svadd_f32_x(svptrue_b32(), v487, v469); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v515, v504); + svfloat32_t v519 = svadd_f32_x(svptrue_b32(), v518, v500); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v532, v497); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v478, v494); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v480, v496); + svfloat32_t v545 = svadd_f32_x(svptrue_b32(), v544, v527); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), 
v509, v524); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v511, v526); + svfloat32_t v578 = svsub_f32_x(svptrue_b32(), v539, v551); + svfloat32_t v587 = svadd_f32_x(svptrue_b32(), v539, v551); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v536, v548); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v536, v548); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v485, v478); + svfloat32_t v530 = svsub_f32_x(svptrue_b32(), v488, v480); + svfloat32_t v534 = svsub_f32_x(svptrue_b32(), v494, v485); + svfloat32_t v535 = svsub_f32_x(svptrue_b32(), v496, v488); + svfloat32_t v540 = svsub_f32_x(svptrue_b32(), v516, v509); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v519, v511); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v524, v516); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v526, v519); svint16_t v581 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v578, (float)(1ULL << 31ULL)))), @@ -11864,26 +7452,16 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v605, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v538), "w"(v550)); - svfloat32_t v623; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v538), "w"(v550)); - svfloat32_t v632; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v632) : "w"(v533), "w"(v545)); - svfloat32_t v641; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v641) : "w"(v533), "w"(v545)); - svfloat32_t v686; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v686) : "w"(v537), "w"(v549)); - svfloat32_t v695; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v537), "w"(v549)); - svfloat32_t v529; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v528), "w"(v494)); - svfloat32_t v531; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v530), "w"(v496)); - svfloat32_t v541; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v541) : "w"(v540), "w"(v524)); - 
svfloat32_t v543; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v543) : "w"(v542), "w"(v526)); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v538, v550); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v538, v550); + svfloat32_t v632 = svadd_f32_x(svptrue_b32(), v533, v545); + svfloat32_t v641 = svsub_f32_x(svptrue_b32(), v533, v545); + svfloat32_t v686 = svsub_f32_x(svptrue_b32(), v537, v549); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v537, v549); + svfloat32_t v529 = svadd_f32_x(svptrue_b32(), v528, v494); + svfloat32_t v531 = svadd_f32_x(svptrue_b32(), v530, v496); + svfloat32_t v541 = svadd_f32_x(svptrue_b32(), v540, v524); + svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v542, v526); svint16_t v617 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v614, (float)(1ULL << 31ULL)))), @@ -11904,14 +7482,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v641, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v650; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v650) : "w"(v535), "w"(v547)); - svfloat32_t v659; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v535), "w"(v547)); - svfloat32_t v668; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v668) : "w"(v534), "w"(v546)); - svfloat32_t v677; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v677) : "w"(v534), "w"(v546)); + svfloat32_t v650 = svadd_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v659 = svsub_f32_x(svptrue_b32(), v535, v547); + svfloat32_t v668 = svadd_f32_x(svptrue_b32(), v534, v546); + svfloat32_t v677 = svsub_f32_x(svptrue_b32(), v534, v546); svint16_t v689 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v686, (float)(1ULL << 31ULL)))), @@ -11926,10 +7500,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, svst1w_u64(pred_full, (unsigned *)(v976), svreinterpret_u64_s16(v590)); 
svst1w_u64(pred_full, (unsigned *)(v985), svreinterpret_u64_s16(v599)); svst1w_u64(pred_full, (unsigned *)(v994), svreinterpret_u64_s16(v608)); - svfloat32_t v560; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v529), "w"(v541)); - svfloat32_t v569; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v529), "w"(v541)); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v529, v541); + svfloat32_t v569 = svsub_f32_x(svptrue_b32(), v529, v541); svint16_t v653 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v650, (float)(1ULL << 31ULL)))), @@ -11950,10 +7522,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu19(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v677, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v704; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v531), "w"(v543)); - svfloat32_t v713; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v713) : "w"(v531), "w"(v543)); + svfloat32_t v704 = svadd_f32_x(svptrue_b32(), v531, v543); + svfloat32_t v713 = svsub_f32_x(svptrue_b32(), v531, v543); svst1w_u64(pred_full, (unsigned *)(v1003), svreinterpret_u64_s16(v617)); svst1w_u64(pred_full, (unsigned *)(v1012), svreinterpret_u64_s16(v626)); svst1w_u64(pred_full, (unsigned *)(v1021), svreinterpret_u64_s16(v635)); @@ -12002,546 +7572,258 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v585 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v326 = 1.5388417685876268e+00F; - float v334 = 5.8778525229247325e-01F; - float v342 = 3.6327126400268028e-01F; - float v367 = 1.0000000000000000e+00F; - float v368 = -1.0000000000000000e+00F; - float v375 = -1.2500000000000000e+00F; - float v376 = 1.2500000000000000e+00F; - float v383 = 5.5901699437494745e-01F; - float v384 = 
-5.5901699437494745e-01F; - float32x2_t v386 = (float32x2_t){v4, v4}; - float v392 = -1.5388417685876268e+00F; - float v397 = -5.8778525229247325e-01F; - float v402 = -3.6327126400268028e-01F; - const float32x2_t *v1226 = &v5[istride]; - int32_t *v1290 = &v6[ostride]; - float32x2_t v318 = (float32x2_t){v375, v375}; - float32x2_t v323 = (float32x2_t){v383, v383}; - float32x2_t v328 = (float32x2_t){v326, v392}; - float32x2_t v336 = (float32x2_t){v334, v397}; - float32x2_t v344 = (float32x2_t){v342, v402}; - float32x2_t v369 = (float32x2_t){v367, v368}; - float32x2_t v377 = (float32x2_t){v375, v376}; - float32x2_t v385 = (float32x2_t){v383, v384}; - float32x2_t v393 = (float32x2_t){v392, v392}; - float32x2_t v398 = (float32x2_t){v397, v397}; - float32x2_t v403 = (float32x2_t){v402, v402}; - const float32x2_t *v1064 = &v5[0]; - int32_t *v1245 = &v6[0]; - float32x4_t v1456 = vld1q_f32((const float32_t *)v1226); - float32x4_t v319 = vcombine_f32(v318, v318); - float32x4_t v324 = vcombine_f32(v323, v323); - float32x2_t v330 = vmul_f32(v386, v328); - float32x2_t v338 = vmul_f32(v386, v336); - float32x2_t v346 = vmul_f32(v386, v344); - float32x2_t v371 = vmul_f32(v386, v369); - float32x2_t v379 = vmul_f32(v386, v377); - float32x2_t v387 = vmul_f32(v386, v385); - float32x4_t v394 = vcombine_f32(v393, v393); - float32x4_t v399 = vcombine_f32(v398, v398); - float32x4_t v404 = vcombine_f32(v403, v403); - const float32x2_t *v1073 = &v5[istride * 10]; - const float32x2_t *v1082 = &v5[istride * 5]; - const float32x2_t *v1091 = &v5[istride * 15]; - const float32x2_t *v1100 = &v5[istride * 4]; - const float32x2_t *v1109 = &v5[istride * 14]; - const float32x2_t *v1118 = &v5[istride * 9]; - const float32x2_t *v1127 = &v5[istride * 19]; - const float32x2_t *v1136 = &v5[istride * 8]; - const float32x2_t *v1145 = &v5[istride * 18]; - const float32x2_t *v1154 = &v5[istride * 13]; - const float32x2_t *v1163 = &v5[istride * 3]; - const float32x2_t *v1172 = &v5[istride * 12]; - const 
float32x2_t *v1181 = &v5[istride * 2]; - const float32x2_t *v1190 = &v5[istride * 17]; - const float32x2_t *v1199 = &v5[istride * 7]; - const float32x2_t *v1208 = &v5[istride * 16]; - const float32x2_t *v1217 = &v5[istride * 6]; - const float32x2_t *v1235 = &v5[istride * 11]; - int32_t *v1254 = &v6[ostride * 5]; - int32_t *v1263 = &v6[ostride * 10]; - int32_t *v1272 = &v6[ostride * 15]; - int32_t *v1281 = &v6[ostride * 16]; - int32_t *v1299 = &v6[ostride * 6]; - int32_t *v1308 = &v6[ostride * 11]; - int32_t *v1317 = &v6[ostride * 12]; - int32_t *v1326 = &v6[ostride * 17]; - int32_t *v1335 = &v6[ostride * 2]; - int32_t *v1344 = &v6[ostride * 7]; - int32_t *v1353 = &v6[ostride * 8]; - int32_t *v1362 = &v6[ostride * 13]; - int32_t *v1371 = &v6[ostride * 18]; - int32_t *v1380 = &v6[ostride * 3]; - int32_t *v1389 = &v6[ostride * 4]; - int32_t *v1398 = &v6[ostride * 9]; - int32_t *v1407 = &v6[ostride * 14]; - int32_t *v1416 = &v6[ostride * 19]; - float32x4_t v1420 = vld1q_f32((const float32_t *)v1064); - float32x4_t v332 = vcombine_f32(v330, v330); - float32x4_t v340 = vcombine_f32(v338, v338); - float32x4_t v348 = vcombine_f32(v346, v346); - float32x4_t v373 = vcombine_f32(v371, v371); - float32x4_t v381 = vcombine_f32(v379, v379); - float32x4_t v389 = vcombine_f32(v387, v387); - float32x4_t v1422 = vld1q_f32((const float32_t *)v1073); - float32x4_t v1424 = vld1q_f32((const float32_t *)v1082); - float32x4_t v1426 = vld1q_f32((const float32_t *)v1091); - float32x4_t v1428 = vld1q_f32((const float32_t *)v1100); - float32x4_t v1430 = vld1q_f32((const float32_t *)v1109); - float32x4_t v1432 = vld1q_f32((const float32_t *)v1118); - float32x4_t v1434 = vld1q_f32((const float32_t *)v1127); - float32x4_t v1436 = vld1q_f32((const float32_t *)v1136); - float32x4_t v1438 = vld1q_f32((const float32_t *)v1145); - float32x4_t v1440 = vld1q_f32((const float32_t *)v1154); - float32x4_t v1442 = vld1q_f32((const float32_t *)v1163); - float32x4_t v1444 = vld1q_f32((const float32_t 
*)v1172); - float32x4_t v1446 = vld1q_f32((const float32_t *)v1181); - float32x4_t v1448 = vld1q_f32((const float32_t *)v1190); - float32x4_t v1450 = vld1q_f32((const float32_t *)v1199); - float32x4_t v1452 = vld1q_f32((const float32_t *)v1208); - float32x4_t v1454 = vld1q_f32((const float32_t *)v1217); - float32x4_t v1458 = vld1q_f32((const float32_t *)v1235); - float32x4_t v35 = vaddq_f32(v1420, v1422); - float32x4_t v36 = vsubq_f32(v1420, v1422); - float32x4_t v51 = vaddq_f32(v1424, v1426); - float32x4_t v52 = vsubq_f32(v1424, v1426); - float32x4_t v69 = vaddq_f32(v1428, v1430); - float32x4_t v70 = vsubq_f32(v1428, v1430); - float32x4_t v85 = vaddq_f32(v1432, v1434); - float32x4_t v86 = vsubq_f32(v1432, v1434); - float32x4_t v103 = vaddq_f32(v1436, v1438); - float32x4_t v104 = vsubq_f32(v1436, v1438); - float32x4_t v119 = vaddq_f32(v1440, v1442); - float32x4_t v120 = vsubq_f32(v1440, v1442); - float32x4_t v137 = vaddq_f32(v1444, v1446); - float32x4_t v138 = vsubq_f32(v1444, v1446); - float32x4_t v153 = vaddq_f32(v1448, v1450); - float32x4_t v154 = vsubq_f32(v1448, v1450); - float32x4_t v171 = vaddq_f32(v1452, v1454); - float32x4_t v172 = vsubq_f32(v1452, v1454); - float32x4_t v187 = vaddq_f32(v1456, v1458); - float32x4_t v188 = vsubq_f32(v1456, v1458); - float32x4_t v53 = vaddq_f32(v35, v51); - float32x4_t v54 = vsubq_f32(v35, v51); - float32x4_t v87 = vaddq_f32(v69, v85); - float32x4_t v88 = vsubq_f32(v69, v85); - float32x4_t v121 = vaddq_f32(v103, v119); - float32x4_t v122 = vsubq_f32(v103, v119); - float32x4_t v155 = vaddq_f32(v137, v153); - float32x4_t v156 = vsubq_f32(v137, v153); - float32x4_t v189 = vaddq_f32(v171, v187); - float32x4_t v190 = vsubq_f32(v171, v187); - float32x4_t v303 = vaddq_f32(v70, v172); - float32x4_t v304 = vsubq_f32(v70, v172); - float32x4_t v305 = vaddq_f32(v138, v104); - float32x4_t v306 = vsubq_f32(v138, v104); - float32x4_t v359 = vaddq_f32(v86, v188); - float32x4_t v360 = vsubq_f32(v86, v188); - float32x4_t v361 = 
vaddq_f32(v154, v120); - float32x4_t v362 = vsubq_f32(v154, v120); - float32x4_t v191 = vaddq_f32(v87, v189); - float32x4_t v192 = vsubq_f32(v87, v189); - float32x4_t v193 = vaddq_f32(v155, v121); - float32x4_t v194 = vsubq_f32(v155, v121); - float32x4_t v247 = vaddq_f32(v88, v190); - float32x4_t v248 = vsubq_f32(v88, v190); - float32x4_t v249 = vaddq_f32(v156, v122); - float32x4_t v250 = vsubq_f32(v156, v122); - float32x4_t v307 = vaddq_f32(v303, v305); - float32x4_t v308 = vsubq_f32(v303, v305); - float32x4_t v309 = vaddq_f32(v304, v306); - float32x4_t v331 = vrev64q_f32(v304); - float32x4_t v347 = vrev64q_f32(v306); - float32x4_t v363 = vaddq_f32(v359, v361); - float32x4_t v364 = vsubq_f32(v359, v361); - float32x4_t v365 = vaddq_f32(v360, v362); - float32x4_t v395 = vmulq_f32(v360, v394); - float32x4_t v405 = vmulq_f32(v362, v404); - float32x4_t v195 = vaddq_f32(v191, v193); - float32x4_t v196 = vsubq_f32(v191, v193); - float32x4_t v197 = vaddq_f32(v192, v194); - float32x4_t v219 = vrev64q_f32(v192); - float32x4_t v235 = vrev64q_f32(v194); - float32x4_t v251 = vaddq_f32(v247, v249); - float32x4_t v252 = vsubq_f32(v247, v249); - float32x4_t v253 = vaddq_f32(v248, v250); - float32x4_t v275 = vrev64q_f32(v248); - float32x4_t v291 = vrev64q_f32(v250); - float32x4_t v310 = vaddq_f32(v307, v36); - float32x4_t v320 = vmulq_f32(v307, v319); - float32x4_t v325 = vmulq_f32(v308, v324); - float32x4_t v333 = vmulq_f32(v331, v332); - float32x4_t v339 = vrev64q_f32(v309); - float32x4_t v349 = vmulq_f32(v347, v348); - float32x4_t v366 = vaddq_f32(v363, v52); - float32x4_t v380 = vrev64q_f32(v363); - float32x4_t v388 = vrev64q_f32(v364); - float32x4_t v400 = vmulq_f32(v365, v399); - float32x4_t v198 = vaddq_f32(v195, v53); - float32x4_t v208 = vmulq_f32(v195, v319); - float32x4_t v213 = vmulq_f32(v196, v324); - float32x4_t v221 = vmulq_f32(v219, v332); - float32x4_t v227 = vrev64q_f32(v197); - float32x4_t v237 = vmulq_f32(v235, v348); - float32x4_t v254 = vaddq_f32(v251, v54); 
- float32x4_t v264 = vmulq_f32(v251, v319); - float32x4_t v269 = vmulq_f32(v252, v324); - float32x4_t v277 = vmulq_f32(v275, v332); - float32x4_t v283 = vrev64q_f32(v253); - float32x4_t v293 = vmulq_f32(v291, v348); - float32x4_t v341 = vmulq_f32(v339, v340); - float32x4_t v350 = vaddq_f32(v310, v320); - float32x4_t v372 = vrev64q_f32(v366); - float32x4_t v382 = vmulq_f32(v380, v381); - float32x4_t v390 = vmulq_f32(v388, v389); - float32x4_t v409 = vsubq_f32(v395, v400); - float32x4_t v410 = vaddq_f32(v400, v405); - float32x4_t v229 = vmulq_f32(v227, v340); - float32x4_t v238 = vaddq_f32(v198, v208); - float32x4_t v285 = vmulq_f32(v283, v340); - float32x4_t v294 = vaddq_f32(v254, v264); - float32x4_t v351 = vaddq_f32(v350, v325); - float32x4_t v352 = vsubq_f32(v350, v325); - float32x4_t v353 = vsubq_f32(v333, v341); - float32x4_t v354 = vaddq_f32(v341, v349); - float32x4_t v374 = vmulq_f32(v372, v373); - int16x4_t v419 = vqmovn_s32(vcvtq_n_s32_f32(v198, 15)); - int16x4_t v435 = vqmovn_s32(vcvtq_n_s32_f32(v254, 15)); - float32x4_t v239 = vaddq_f32(v238, v213); - float32x4_t v240 = vsubq_f32(v238, v213); - float32x4_t v241 = vsubq_f32(v221, v229); - float32x4_t v242 = vaddq_f32(v229, v237); - float32x4_t v295 = vaddq_f32(v294, v269); - float32x4_t v296 = vsubq_f32(v294, v269); - float32x4_t v297 = vsubq_f32(v277, v285); - float32x4_t v298 = vaddq_f32(v285, v293); - float32x4_t v355 = vaddq_f32(v351, v353); - float32x4_t v356 = vsubq_f32(v351, v353); - float32x4_t v357 = vaddq_f32(v352, v354); - float32x4_t v358 = vsubq_f32(v352, v354); - float32x4_t v406 = vaddq_f32(v374, v382); - float32x4_t v415 = vaddq_f32(v310, v374); - float32x4_t v416 = vsubq_f32(v310, v374); - vst1_s16((int16_t *)v1245, v419); - vst1_s16((int16_t *)v1263, v435); - float32x4_t v243 = vaddq_f32(v239, v241); - float32x4_t v244 = vsubq_f32(v239, v241); - float32x4_t v245 = vaddq_f32(v240, v242); - float32x4_t v246 = vsubq_f32(v240, v242); - float32x4_t v299 = vaddq_f32(v295, v297); - float32x4_t 
v300 = vsubq_f32(v295, v297); - float32x4_t v301 = vaddq_f32(v296, v298); - float32x4_t v302 = vsubq_f32(v296, v298); - float32x4_t v407 = vaddq_f32(v406, v390); - float32x4_t v408 = vsubq_f32(v406, v390); - int16x4_t v427 = vqmovn_s32(vcvtq_n_s32_f32(v416, 15)); - int16x4_t v443 = vqmovn_s32(vcvtq_n_s32_f32(v415, 15)); - float32x4_t v411 = vaddq_f32(v407, v409); - float32x4_t v412 = vsubq_f32(v407, v409); - float32x4_t v413 = vaddq_f32(v408, v410); - float32x4_t v414 = vsubq_f32(v408, v410); - int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v244, 15)); - int16x4_t v469 = vqmovn_s32(vcvtq_n_s32_f32(v300, 15)); - int16x4_t v487 = vqmovn_s32(vcvtq_n_s32_f32(v246, 15)); - int16x4_t v503 = vqmovn_s32(vcvtq_n_s32_f32(v302, 15)); - int16x4_t v521 = vqmovn_s32(vcvtq_n_s32_f32(v245, 15)); - int16x4_t v537 = vqmovn_s32(vcvtq_n_s32_f32(v301, 15)); - int16x4_t v555 = vqmovn_s32(vcvtq_n_s32_f32(v243, 15)); - int16x4_t v571 = vqmovn_s32(vcvtq_n_s32_f32(v299, 15)); - vst1_s16((int16_t *)v1254, v427); - vst1_s16((int16_t *)v1272, v443); - float32x4_t v449 = vaddq_f32(v356, v412); - float32x4_t v450 = vsubq_f32(v356, v412); - float32x4_t v483 = vaddq_f32(v358, v414); - float32x4_t v484 = vsubq_f32(v358, v414); - float32x4_t v517 = vaddq_f32(v357, v413); - float32x4_t v518 = vsubq_f32(v357, v413); - float32x4_t v551 = vaddq_f32(v355, v411); - float32x4_t v552 = vsubq_f32(v355, v411); - vst1_s16((int16_t *)v1281, v453); - vst1_s16((int16_t *)v1299, v469); - vst1_s16((int16_t *)v1317, v487); - vst1_s16((int16_t *)v1335, v503); - vst1_s16((int16_t *)v1353, v521); - vst1_s16((int16_t *)v1371, v537); - vst1_s16((int16_t *)v1389, v555); - vst1_s16((int16_t *)v1407, v571); - int16x4_t v461 = vqmovn_s32(vcvtq_n_s32_f32(v450, 15)); - int16x4_t v477 = vqmovn_s32(vcvtq_n_s32_f32(v449, 15)); - int16x4_t v495 = vqmovn_s32(vcvtq_n_s32_f32(v484, 15)); - int16x4_t v511 = vqmovn_s32(vcvtq_n_s32_f32(v483, 15)); - int16x4_t v529 = vqmovn_s32(vcvtq_n_s32_f32(v518, 15)); - int16x4_t v545 = 
vqmovn_s32(vcvtq_n_s32_f32(v517, 15)); - int16x4_t v563 = vqmovn_s32(vcvtq_n_s32_f32(v552, 15)); - int16x4_t v579 = vqmovn_s32(vcvtq_n_s32_f32(v551, 15)); - vst1_s16((int16_t *)v1290, v461); - vst1_s16((int16_t *)v1308, v477); - vst1_s16((int16_t *)v1326, v495); - vst1_s16((int16_t *)v1344, v511); - vst1_s16((int16_t *)v1362, v529); - vst1_s16((int16_t *)v1380, v545); - vst1_s16((int16_t *)v1398, v563); - vst1_s16((int16_t *)v1416, v579); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v585 * 2; j < howmany; j += 1) { - float32x2_t v713 = v5[istride]; - float v843 = 1.5388417685876268e+00F; - float v850 = 5.8778525229247325e-01F; - float v857 = 3.6327126400268028e-01F; - float v881 = 1.0000000000000000e+00F; - float v882 = -1.0000000000000000e+00F; - float v888 = -1.2500000000000000e+00F; - float v889 = 1.2500000000000000e+00F; - float v895 = 5.5901699437494745e-01F; - float v896 = -5.5901699437494745e-01F; - float32x2_t v898 = (float32x2_t){v4, v4}; - float v903 = -1.5388417685876268e+00F; - float v907 = -5.8778525229247325e-01F; - float v911 = -3.6327126400268028e-01F; - float32x2_t v597 = v5[0]; - float32x2_t v837 = (float32x2_t){v888, v888}; - float32x2_t v841 = (float32x2_t){v895, v895}; - float32x2_t v845 = (float32x2_t){v843, v903}; - float32x2_t v852 = (float32x2_t){v850, v907}; - float32x2_t v859 = (float32x2_t){v857, v911}; - float32x2_t v883 = (float32x2_t){v881, v882}; - float32x2_t v890 = (float32x2_t){v888, v889}; - float32x2_t v897 = (float32x2_t){v895, v896}; - float32x2_t v904 = (float32x2_t){v903, v903}; - float32x2_t v908 = (float32x2_t){v907, v907}; - float32x2_t v912 = (float32x2_t){v911, v911}; - float32x2_t v602 = v5[istride * 10]; - float32x2_t v609 = v5[istride * 5]; - float32x2_t v614 = v5[istride * 15]; - float32x2_t v623 = v5[istride * 4]; - float32x2_t v628 = v5[istride * 14]; - float32x2_t v635 = v5[istride * 9]; - float32x2_t v640 = v5[istride * 19]; - float32x2_t v649 = v5[istride * 8]; - float32x2_t v654 = v5[istride * 18]; - 
float32x2_t v661 = v5[istride * 13]; - float32x2_t v666 = v5[istride * 3]; - float32x2_t v675 = v5[istride * 12]; - float32x2_t v680 = v5[istride * 2]; - float32x2_t v687 = v5[istride * 17]; - float32x2_t v692 = v5[istride * 7]; - float32x2_t v701 = v5[istride * 16]; - float32x2_t v706 = v5[istride * 6]; - float32x2_t v718 = v5[istride * 11]; - float32x2_t v847 = vmul_f32(v898, v845); - float32x2_t v854 = vmul_f32(v898, v852); - float32x2_t v861 = vmul_f32(v898, v859); - float32x2_t v885 = vmul_f32(v898, v883); - float32x2_t v892 = vmul_f32(v898, v890); - float32x2_t v899 = vmul_f32(v898, v897); - float32x2_t v603 = vadd_f32(v597, v602); - float32x2_t v604 = vsub_f32(v597, v602); - float32x2_t v615 = vadd_f32(v609, v614); - float32x2_t v616 = vsub_f32(v609, v614); - float32x2_t v629 = vadd_f32(v623, v628); - float32x2_t v630 = vsub_f32(v623, v628); - float32x2_t v641 = vadd_f32(v635, v640); - float32x2_t v642 = vsub_f32(v635, v640); - float32x2_t v655 = vadd_f32(v649, v654); - float32x2_t v656 = vsub_f32(v649, v654); - float32x2_t v667 = vadd_f32(v661, v666); - float32x2_t v668 = vsub_f32(v661, v666); - float32x2_t v681 = vadd_f32(v675, v680); - float32x2_t v682 = vsub_f32(v675, v680); - float32x2_t v693 = vadd_f32(v687, v692); - float32x2_t v694 = vsub_f32(v687, v692); - float32x2_t v707 = vadd_f32(v701, v706); - float32x2_t v708 = vsub_f32(v701, v706); - float32x2_t v719 = vadd_f32(v713, v718); - float32x2_t v720 = vsub_f32(v713, v718); - float32x2_t v617 = vadd_f32(v603, v615); - float32x2_t v618 = vsub_f32(v603, v615); - float32x2_t v643 = vadd_f32(v629, v641); - float32x2_t v644 = vsub_f32(v629, v641); - float32x2_t v669 = vadd_f32(v655, v667); - float32x2_t v670 = vsub_f32(v655, v667); - float32x2_t v695 = vadd_f32(v681, v693); - float32x2_t v696 = vsub_f32(v681, v693); - float32x2_t v721 = vadd_f32(v707, v719); - float32x2_t v722 = vsub_f32(v707, v719); - float32x2_t v823 = vadd_f32(v630, v708); - float32x2_t v824 = vsub_f32(v630, v708); - float32x2_t v825 = 
vadd_f32(v682, v656); - float32x2_t v826 = vsub_f32(v682, v656); - float32x2_t v873 = vadd_f32(v642, v720); - float32x2_t v874 = vsub_f32(v642, v720); - float32x2_t v875 = vadd_f32(v694, v668); - float32x2_t v876 = vsub_f32(v694, v668); - float32x2_t v723 = vadd_f32(v643, v721); - float32x2_t v724 = vsub_f32(v643, v721); - float32x2_t v725 = vadd_f32(v695, v669); - float32x2_t v726 = vsub_f32(v695, v669); - float32x2_t v773 = vadd_f32(v644, v722); - float32x2_t v774 = vsub_f32(v644, v722); - float32x2_t v775 = vadd_f32(v696, v670); - float32x2_t v776 = vsub_f32(v696, v670); - float32x2_t v827 = vadd_f32(v823, v825); - float32x2_t v828 = vsub_f32(v823, v825); - float32x2_t v829 = vadd_f32(v824, v826); - float32x2_t v848 = vrev64_f32(v824); - float32x2_t v862 = vrev64_f32(v826); - float32x2_t v877 = vadd_f32(v873, v875); - float32x2_t v878 = vsub_f32(v873, v875); - float32x2_t v879 = vadd_f32(v874, v876); - float32x2_t v905 = vmul_f32(v874, v904); - float32x2_t v913 = vmul_f32(v876, v912); - float32x2_t v727 = vadd_f32(v723, v725); - float32x2_t v728 = vsub_f32(v723, v725); - float32x2_t v729 = vadd_f32(v724, v726); - float32x2_t v748 = vrev64_f32(v724); - float32x2_t v762 = vrev64_f32(v726); - float32x2_t v777 = vadd_f32(v773, v775); - float32x2_t v778 = vsub_f32(v773, v775); - float32x2_t v779 = vadd_f32(v774, v776); - float32x2_t v798 = vrev64_f32(v774); - float32x2_t v812 = vrev64_f32(v776); - float32x2_t v830 = vadd_f32(v827, v604); - float32x2_t v838 = vmul_f32(v827, v837); - float32x2_t v842 = vmul_f32(v828, v841); - float32x2_t v849 = vmul_f32(v848, v847); - float32x2_t v855 = vrev64_f32(v829); - float32x2_t v863 = vmul_f32(v862, v861); - float32x2_t v880 = vadd_f32(v877, v616); - float32x2_t v893 = vrev64_f32(v877); - float32x2_t v900 = vrev64_f32(v878); - float32x2_t v909 = vmul_f32(v879, v908); - float32x2_t v730 = vadd_f32(v727, v617); - float32x2_t v738 = vmul_f32(v727, v837); - float32x2_t v742 = vmul_f32(v728, v841); - float32x2_t v749 = vmul_f32(v748, 
v847); - float32x2_t v755 = vrev64_f32(v729); - float32x2_t v763 = vmul_f32(v762, v861); - float32x2_t v780 = vadd_f32(v777, v618); - float32x2_t v788 = vmul_f32(v777, v837); - float32x2_t v792 = vmul_f32(v778, v841); - float32x2_t v799 = vmul_f32(v798, v847); - float32x2_t v805 = vrev64_f32(v779); - float32x2_t v813 = vmul_f32(v812, v861); - float32x2_t v856 = vmul_f32(v855, v854); - float32x2_t v864 = vadd_f32(v830, v838); - float32x2_t v886 = vrev64_f32(v880); - float32x2_t v894 = vmul_f32(v893, v892); - float32x2_t v901 = vmul_f32(v900, v899); - float32x2_t v917 = vsub_f32(v905, v909); - float32x2_t v918 = vadd_f32(v909, v913); - float32x2_t v756 = vmul_f32(v755, v854); - float32x2_t v764 = vadd_f32(v730, v738); - float32x2_t v806 = vmul_f32(v805, v854); - float32x2_t v814 = vadd_f32(v780, v788); - float32x2_t v865 = vadd_f32(v864, v842); - float32x2_t v866 = vsub_f32(v864, v842); - float32x2_t v867 = vsub_f32(v849, v856); - float32x2_t v868 = vadd_f32(v856, v863); - float32x2_t v887 = vmul_f32(v886, v885); - int16x4_t v927 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v730, 15), (int32x2_t){0, 0})); - int16x4_t v939 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v780, 15), (int32x2_t){0, 0})); - float32x2_t v765 = vadd_f32(v764, v742); - float32x2_t v766 = vsub_f32(v764, v742); - float32x2_t v767 = vsub_f32(v749, v756); - float32x2_t v768 = vadd_f32(v756, v763); - float32x2_t v815 = vadd_f32(v814, v792); - float32x2_t v816 = vsub_f32(v814, v792); - float32x2_t v817 = vsub_f32(v799, v806); - float32x2_t v818 = vadd_f32(v806, v813); - float32x2_t v869 = vadd_f32(v865, v867); - float32x2_t v870 = vsub_f32(v865, v867); - float32x2_t v871 = vadd_f32(v866, v868); - float32x2_t v872 = vsub_f32(v866, v868); - float32x2_t v914 = vadd_f32(v887, v894); - float32x2_t v923 = vadd_f32(v830, v887); - float32x2_t v924 = vsub_f32(v830, v887); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v927), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v939), 0); - float32x2_t v769 = 
vadd_f32(v765, v767); - float32x2_t v770 = vsub_f32(v765, v767); - float32x2_t v771 = vadd_f32(v766, v768); - float32x2_t v772 = vsub_f32(v766, v768); - float32x2_t v819 = vadd_f32(v815, v817); - float32x2_t v820 = vsub_f32(v815, v817); - float32x2_t v821 = vadd_f32(v816, v818); - float32x2_t v822 = vsub_f32(v816, v818); - float32x2_t v915 = vadd_f32(v914, v901); - float32x2_t v916 = vsub_f32(v914, v901); - int16x4_t v933 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v924, 15), (int32x2_t){0, 0})); - int16x4_t v945 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v923, 15), (int32x2_t){0, 0})); - float32x2_t v919 = vadd_f32(v915, v917); - float32x2_t v920 = vsub_f32(v915, v917); - float32x2_t v921 = vadd_f32(v916, v918); - float32x2_t v922 = vsub_f32(v916, v918); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v933), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v945), 0); - int16x4_t v953 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v770, 15), (int32x2_t){0, 0})); - int16x4_t v965 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v820, 15), (int32x2_t){0, 0})); - int16x4_t v979 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v772, 15), (int32x2_t){0, 0})); - int16x4_t v991 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v822, 15), (int32x2_t){0, 0})); - int16x4_t v1005 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v771, 15), (int32x2_t){0, 0})); - int16x4_t v1017 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v821, 15), (int32x2_t){0, 0})); - int16x4_t v1031 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v769, 15), (int32x2_t){0, 0})); - int16x4_t v1043 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v819, 15), (int32x2_t){0, 0})); - float32x2_t v949 = vadd_f32(v870, v920); - float32x2_t v950 = vsub_f32(v870, v920); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v953), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v965), 0); - float32x2_t v975 = vadd_f32(v872, v922); - float32x2_t v976 = vsub_f32(v872, v922); - v6[ostride * 12] = 
vget_lane_s32(vreinterpret_s32_s16(v979), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v991), 0); - float32x2_t v1001 = vadd_f32(v871, v921); - float32x2_t v1002 = vsub_f32(v871, v921); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1005), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1017), 0); - float32x2_t v1027 = vadd_f32(v869, v919); - float32x2_t v1028 = vsub_f32(v869, v919); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1031), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1043), 0); - int16x4_t v959 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v950, 15), (int32x2_t){0, 0})); - int16x4_t v971 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v949, 15), (int32x2_t){0, 0})); - int16x4_t v985 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v976, 15), (int32x2_t){0, 0})); - int16x4_t v997 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v975, 15), (int32x2_t){0, 0})); - int16x4_t v1011 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1002, 15), (int32x2_t){0, 0})); - int16x4_t v1023 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1001, 15), (int32x2_t){0, 0})); - int16x4_t v1037 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1028, 15), (int32x2_t){0, 0})); - int16x4_t v1049 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1027, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v959), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v971), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v985), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v997), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1011), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1023), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1037), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1049), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v136 = v5[istride]; + float v266 = 1.5388417685876268e+00F; + float v273 = 5.8778525229247325e-01F; + float 
v280 = 3.6327126400268028e-01F; + float v304 = 1.0000000000000000e+00F; + float v305 = -1.0000000000000000e+00F; + float v311 = -1.2500000000000000e+00F; + float v312 = 1.2500000000000000e+00F; + float v318 = 5.5901699437494745e-01F; + float v319 = -5.5901699437494745e-01F; + float32x2_t v321 = (float32x2_t){v4, v4}; + float v326 = -1.5388417685876268e+00F; + float v330 = -5.8778525229247325e-01F; + float v334 = -3.6327126400268028e-01F; + float32x2_t v20 = v5[0]; + float32x2_t v260 = (float32x2_t){v311, v311}; + float32x2_t v264 = (float32x2_t){v318, v318}; + float32x2_t v268 = (float32x2_t){v266, v326}; + float32x2_t v275 = (float32x2_t){v273, v330}; + float32x2_t v282 = (float32x2_t){v280, v334}; + float32x2_t v306 = (float32x2_t){v304, v305}; + float32x2_t v313 = (float32x2_t){v311, v312}; + float32x2_t v320 = (float32x2_t){v318, v319}; + float32x2_t v327 = (float32x2_t){v326, v326}; + float32x2_t v331 = (float32x2_t){v330, v330}; + float32x2_t v335 = (float32x2_t){v334, v334}; + float32x2_t v25 = v5[istride * 10]; + float32x2_t v32 = v5[istride * 5]; + float32x2_t v37 = v5[istride * 15]; + float32x2_t v46 = v5[istride * 4]; + float32x2_t v51 = v5[istride * 14]; + float32x2_t v58 = v5[istride * 9]; + float32x2_t v63 = v5[istride * 19]; + float32x2_t v72 = v5[istride * 8]; + float32x2_t v77 = v5[istride * 18]; + float32x2_t v84 = v5[istride * 13]; + float32x2_t v89 = v5[istride * 3]; + float32x2_t v98 = v5[istride * 12]; + float32x2_t v103 = v5[istride * 2]; + float32x2_t v110 = v5[istride * 17]; + float32x2_t v115 = v5[istride * 7]; + float32x2_t v124 = v5[istride * 16]; + float32x2_t v129 = v5[istride * 6]; + float32x2_t v141 = v5[istride * 11]; + float32x2_t v270 = vmul_f32(v321, v268); + float32x2_t v277 = vmul_f32(v321, v275); + float32x2_t v284 = vmul_f32(v321, v282); + float32x2_t v308 = vmul_f32(v321, v306); + float32x2_t v315 = vmul_f32(v321, v313); + float32x2_t v322 = vmul_f32(v321, v320); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = 
vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v52 = vadd_f32(v46, v51); + float32x2_t v53 = vsub_f32(v46, v51); + float32x2_t v64 = vadd_f32(v58, v63); + float32x2_t v65 = vsub_f32(v58, v63); + float32x2_t v78 = vadd_f32(v72, v77); + float32x2_t v79 = vsub_f32(v72, v77); + float32x2_t v90 = vadd_f32(v84, v89); + float32x2_t v91 = vsub_f32(v84, v89); + float32x2_t v104 = vadd_f32(v98, v103); + float32x2_t v105 = vsub_f32(v98, v103); + float32x2_t v116 = vadd_f32(v110, v115); + float32x2_t v117 = vsub_f32(v110, v115); + float32x2_t v130 = vadd_f32(v124, v129); + float32x2_t v131 = vsub_f32(v124, v129); + float32x2_t v142 = vadd_f32(v136, v141); + float32x2_t v143 = vsub_f32(v136, v141); + float32x2_t v40 = vadd_f32(v26, v38); + float32x2_t v41 = vsub_f32(v26, v38); + float32x2_t v66 = vadd_f32(v52, v64); + float32x2_t v67 = vsub_f32(v52, v64); + float32x2_t v92 = vadd_f32(v78, v90); + float32x2_t v93 = vsub_f32(v78, v90); + float32x2_t v118 = vadd_f32(v104, v116); + float32x2_t v119 = vsub_f32(v104, v116); + float32x2_t v144 = vadd_f32(v130, v142); + float32x2_t v145 = vsub_f32(v130, v142); + float32x2_t v246 = vadd_f32(v53, v131); + float32x2_t v247 = vsub_f32(v53, v131); + float32x2_t v248 = vadd_f32(v105, v79); + float32x2_t v249 = vsub_f32(v105, v79); + float32x2_t v296 = vadd_f32(v65, v143); + float32x2_t v297 = vsub_f32(v65, v143); + float32x2_t v298 = vadd_f32(v117, v91); + float32x2_t v299 = vsub_f32(v117, v91); + float32x2_t v146 = vadd_f32(v66, v144); + float32x2_t v147 = vsub_f32(v66, v144); + float32x2_t v148 = vadd_f32(v118, v92); + float32x2_t v149 = vsub_f32(v118, v92); + float32x2_t v196 = vadd_f32(v67, v145); + float32x2_t v197 = vsub_f32(v67, v145); + float32x2_t v198 = vadd_f32(v119, v93); + float32x2_t v199 = vsub_f32(v119, v93); + float32x2_t v250 = vadd_f32(v246, v248); + float32x2_t v251 = vsub_f32(v246, v248); + float32x2_t v252 = vadd_f32(v247, v249); + float32x2_t v271 = 
vrev64_f32(v247); + float32x2_t v285 = vrev64_f32(v249); + float32x2_t v300 = vadd_f32(v296, v298); + float32x2_t v301 = vsub_f32(v296, v298); + float32x2_t v302 = vadd_f32(v297, v299); + float32x2_t v328 = vmul_f32(v297, v327); + float32x2_t v336 = vmul_f32(v299, v335); + float32x2_t v150 = vadd_f32(v146, v148); + float32x2_t v151 = vsub_f32(v146, v148); + float32x2_t v152 = vadd_f32(v147, v149); + float32x2_t v171 = vrev64_f32(v147); + float32x2_t v185 = vrev64_f32(v149); + float32x2_t v200 = vadd_f32(v196, v198); + float32x2_t v201 = vsub_f32(v196, v198); + float32x2_t v202 = vadd_f32(v197, v199); + float32x2_t v221 = vrev64_f32(v197); + float32x2_t v235 = vrev64_f32(v199); + float32x2_t v253 = vadd_f32(v250, v27); + float32x2_t v261 = vmul_f32(v250, v260); + float32x2_t v265 = vmul_f32(v251, v264); + float32x2_t v272 = vmul_f32(v271, v270); + float32x2_t v278 = vrev64_f32(v252); + float32x2_t v286 = vmul_f32(v285, v284); + float32x2_t v303 = vadd_f32(v300, v39); + float32x2_t v316 = vrev64_f32(v300); + float32x2_t v323 = vrev64_f32(v301); + float32x2_t v332 = vmul_f32(v302, v331); + float32x2_t v153 = vadd_f32(v150, v40); + float32x2_t v161 = vmul_f32(v150, v260); + float32x2_t v165 = vmul_f32(v151, v264); + float32x2_t v172 = vmul_f32(v171, v270); + float32x2_t v178 = vrev64_f32(v152); + float32x2_t v186 = vmul_f32(v185, v284); + float32x2_t v203 = vadd_f32(v200, v41); + float32x2_t v211 = vmul_f32(v200, v260); + float32x2_t v215 = vmul_f32(v201, v264); + float32x2_t v222 = vmul_f32(v221, v270); + float32x2_t v228 = vrev64_f32(v202); + float32x2_t v236 = vmul_f32(v235, v284); + float32x2_t v279 = vmul_f32(v278, v277); + float32x2_t v287 = vadd_f32(v253, v261); + float32x2_t v309 = vrev64_f32(v303); + float32x2_t v317 = vmul_f32(v316, v315); + float32x2_t v324 = vmul_f32(v323, v322); + float32x2_t v340 = vsub_f32(v328, v332); + float32x2_t v341 = vadd_f32(v332, v336); + float32x2_t v179 = vmul_f32(v178, v277); + float32x2_t v187 = vadd_f32(v153, v161); + 
float32x2_t v229 = vmul_f32(v228, v277); + float32x2_t v237 = vadd_f32(v203, v211); + float32x2_t v288 = vadd_f32(v287, v265); + float32x2_t v289 = vsub_f32(v287, v265); + float32x2_t v290 = vsub_f32(v272, v279); + float32x2_t v291 = vadd_f32(v279, v286); + float32x2_t v310 = vmul_f32(v309, v308); + int16x4_t v350 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v153, 15), (int32x2_t){0, 0})); + int16x4_t v362 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v203, 15), (int32x2_t){0, 0})); + float32x2_t v188 = vadd_f32(v187, v165); + float32x2_t v189 = vsub_f32(v187, v165); + float32x2_t v190 = vsub_f32(v172, v179); + float32x2_t v191 = vadd_f32(v179, v186); + float32x2_t v238 = vadd_f32(v237, v215); + float32x2_t v239 = vsub_f32(v237, v215); + float32x2_t v240 = vsub_f32(v222, v229); + float32x2_t v241 = vadd_f32(v229, v236); + float32x2_t v292 = vadd_f32(v288, v290); + float32x2_t v293 = vsub_f32(v288, v290); + float32x2_t v294 = vadd_f32(v289, v291); + float32x2_t v295 = vsub_f32(v289, v291); + float32x2_t v337 = vadd_f32(v310, v317); + float32x2_t v346 = vadd_f32(v253, v310); + float32x2_t v347 = vsub_f32(v253, v310); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v350), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v362), 0); + float32x2_t v192 = vadd_f32(v188, v190); + float32x2_t v193 = vsub_f32(v188, v190); + float32x2_t v194 = vadd_f32(v189, v191); + float32x2_t v195 = vsub_f32(v189, v191); + float32x2_t v242 = vadd_f32(v238, v240); + float32x2_t v243 = vsub_f32(v238, v240); + float32x2_t v244 = vadd_f32(v239, v241); + float32x2_t v245 = vsub_f32(v239, v241); + float32x2_t v338 = vadd_f32(v337, v324); + float32x2_t v339 = vsub_f32(v337, v324); + int16x4_t v356 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v347, 15), (int32x2_t){0, 0})); + int16x4_t v368 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v346, 15), (int32x2_t){0, 0})); + float32x2_t v342 = vadd_f32(v338, v340); + float32x2_t v343 = vsub_f32(v338, v340); + float32x2_t v344 = vadd_f32(v339, v341); + 
float32x2_t v345 = vsub_f32(v339, v341); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v356), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v368), 0); + int16x4_t v376 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v193, 15), (int32x2_t){0, 0})); + int16x4_t v388 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v243, 15), (int32x2_t){0, 0})); + int16x4_t v402 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v195, 15), (int32x2_t){0, 0})); + int16x4_t v414 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v245, 15), (int32x2_t){0, 0})); + int16x4_t v428 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v194, 15), (int32x2_t){0, 0})); + int16x4_t v440 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v244, 15), (int32x2_t){0, 0})); + int16x4_t v454 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v192, 15), (int32x2_t){0, 0})); + int16x4_t v466 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v242, 15), (int32x2_t){0, 0})); + float32x2_t v372 = vadd_f32(v293, v343); + float32x2_t v373 = vsub_f32(v293, v343); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v376), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v388), 0); + float32x2_t v398 = vadd_f32(v295, v345); + float32x2_t v399 = vsub_f32(v295, v345); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v402), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v414), 0); + float32x2_t v424 = vadd_f32(v294, v344); + float32x2_t v425 = vsub_f32(v294, v344); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v428), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v440), 0); + float32x2_t v450 = vadd_f32(v292, v342); + float32x2_t v451 = vsub_f32(v292, v342); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v454), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v466), 0); + int16x4_t v382 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v373, 15), (int32x2_t){0, 0})); + int16x4_t v394 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v372, 15), (int32x2_t){0, 0})); + int16x4_t v408 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v399, 15), (int32x2_t){0, 0})); + int16x4_t v420 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v398, 15), (int32x2_t){0, 0})); + int16x4_t v434 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v425, 15), (int32x2_t){0, 0})); + int16x4_t v446 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v424, 15), (int32x2_t){0, 0})); + int16x4_t v460 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v451, 15), (int32x2_t){0, 0})); + int16x4_t v472 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v450, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v382), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v394), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v408), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v420), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v434), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v446), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v460), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v472), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -12705,179 +7987,102 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v730)[0])); svfloat32_t v995 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v748)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v957), "w"(v959)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v957), "w"(v959)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v961), "w"(v963)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v961), "w"(v963)); - svfloat32_t v66; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v66) : "w"(v965), "w"(v967)); - svfloat32_t v67; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v67) : "w"(v965), "w"(v967)); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v969), "w"(v971)); - svfloat32_t v83; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v83) : "w"(v969), "w"(v971)); - svfloat32_t v100; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v973), "w"(v975)); - svfloat32_t v101; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v973), "w"(v975)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v977), "w"(v979)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v977), "w"(v979)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v981), "w"(v983)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v981), "w"(v983)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v985), "w"(v987)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v985), "w"(v987)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v989), "w"(v991)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v989), "w"(v991)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v993), "w"(v995)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v993), "w"(v995)); - svfloat32_t v50; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v50) : "w"(v32), "w"(v48)); - svfloat32_t v51; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v32), "w"(v48)); - svfloat32_t v84; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v66), "w"(v82)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v66), "w"(v82)); - svfloat32_t v118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v100), "w"(v116)); - svfloat32_t v119; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v100), "w"(v116)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v134), "w"(v150)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v134), "w"(v150)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v168), "w"(v184)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v168), "w"(v184)); - svfloat32_t v294; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v67), "w"(v169)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v67), "w"(v169)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v135), "w"(v101)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v135), "w"(v101)); - svfloat32_t v347; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v347) : "w"(v83), "w"(v185)); - svfloat32_t v348; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v83), "w"(v185)); - svfloat32_t v349; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v151), "w"(v117)); - svfloat32_t v350; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v151), "w"(v117)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v84), "w"(v186)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v84), "w"(v186)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v152), "w"(v118)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v152), "w"(v118)); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v85), "w"(v187)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v85), "w"(v187)); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v153), "w"(v119)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v153), "w"(v119)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v294), "w"(v296)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v294), "w"(v296)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v295), "w"(v297)); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v957, v959); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v957, v959); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v961, v963); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v961, v963); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), 
v965, v967); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v965, v967); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v969, v971); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v969, v971); + svfloat32_t v100 = svadd_f32_x(svptrue_b32(), v973, v975); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v973, v975); + svfloat32_t v116 = svadd_f32_x(svptrue_b32(), v977, v979); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v977, v979); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v981, v983); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v981, v983); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v985, v987); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v985, v987); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v989, v991); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v989, v991); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v993, v995); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v993, v995); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v66, v82); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v100, v116); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v134, v150); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v168, v184); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v67, v169); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v67, v169); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v135, v101); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v135, v101); + svfloat32_t v347 = svadd_f32_x(svptrue_b32(), v83, v185); + svfloat32_t v348 = svsub_f32_x(svptrue_b32(), v83, v185); + svfloat32_t v349 = svadd_f32_x(svptrue_b32(), v151, v117); + svfloat32_t v350 = svsub_f32_x(svptrue_b32(), v151, 
v117); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v84, v186); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v84, v186); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v152, v118); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v152, v118); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v85, v187); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v85, v187); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v153, v119); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v153, v119); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v295, v297); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v766, v295, 90); - svfloat32_t v351; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v347), "w"(v349)); - svfloat32_t v352; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v347), "w"(v349)); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v348), "w"(v350)); - svfloat32_t v390; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v350), "w"(v774)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v188), "w"(v190)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v188), "w"(v190)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v189), "w"(v191)); - svfloat32_t zero217; - asm volatile("mov %0.s, #0" : "=w"(zero217)); + svfloat32_t v351 = svadd_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v352 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v348, v350); + svfloat32_t v390 = svmul_f32_x(svptrue_b32(), v350, v774); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v189, v191); + svfloat32_t zero217 = svdup_n_f32(0); svfloat32_t v217 = svcmla_f32_x(pred_full, zero217, v766, 
v189, 90); - svfloat32_t v245; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v241), "w"(v243)); - svfloat32_t v246; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v241), "w"(v243)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v242), "w"(v244)); - svfloat32_t zero270; - asm volatile("mov %0.s, #0" : "=w"(zero270)); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v241, v243); + svfloat32_t v246 = svsub_f32_x(svptrue_b32(), v241, v243); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v242, v244); + svfloat32_t zero270 = svdup_n_f32(0); svfloat32_t v270 = svcmla_f32_x(pred_full, zero270, v766, v242, 90); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v298), "w"(v33)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v298, v33); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v767, v300, 90); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v351), "w"(v49)); - svfloat32_t zero375; - asm volatile("mov %0.s, #0" : "=w"(zero375)); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v351, v49); + svfloat32_t zero375 = svdup_n_f32(0); svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v771, v352, 90); - svfloat32_t v385; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v353), "w"(v773)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v192), "w"(v50)); - svfloat32_t zero224; - asm volatile("mov %0.s, #0" : "=w"(zero224)); + svfloat32_t v385 = svmul_f32_x(svptrue_b32(), v353, v773); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v192, v50); + svfloat32_t zero224 = svdup_n_f32(0); svfloat32_t v224 = svcmla_f32_x(pred_full, zero224, v767, v194, 90); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v245), "w"(v51)); - svfloat32_t zero277; - asm volatile("mov %0.s, #0" : "=w"(zero277)); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v245, v51); + svfloat32_t 
zero277 = svdup_n_f32(0); svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v767, v247, 90); svfloat32_t v338 = svmla_f32_x(pred_full, v301, v298, v764); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v323), "w"(v330)); + svfloat32_t v341 = svsub_f32_x(svptrue_b32(), v323, v330); svfloat32_t v342 = svcmla_f32_x(pred_full, v330, v768, v297, 90); - svfloat32_t zero361; - asm volatile("mov %0.s, #0" : "=w"(zero361)); + svfloat32_t zero361 = svdup_n_f32(0); svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v769, v354, 90); svfloat32_t v394 = svnmls_f32_x(pred_full, v385, v348, v772); svfloat32_t v395 = svmla_f32_x(pred_full, v390, v353, v773); svfloat32_t v232 = svmla_f32_x(pred_full, v195, v192, v764); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v217), "w"(v224)); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v217, v224); svfloat32_t v236 = svcmla_f32_x(pred_full, v224, v768, v191, 90); svfloat32_t v285 = svmla_f32_x(pred_full, v248, v245, v764); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v270), "w"(v277)); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v270, v277); svfloat32_t v289 = svcmla_f32_x(pred_full, v277, v768, v244, 90); svfloat32_t v339 = svmla_f32_x(pred_full, v338, v299, v765); svfloat32_t v340 = svmls_f32_x(pred_full, v338, v299, v765); svfloat32_t v391 = svcmla_f32_x(pred_full, v361, v770, v351, 90); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v301), "w"(v361)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v301), "w"(v361)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v301, v361); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v301, v361); svint16_t v404 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v195, (float)(1ULL << 31ULL)))), @@ -12892,18 +8097,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, svfloat32_t v234 = svmls_f32_x(pred_full, v232, 
v193, v765); svfloat32_t v286 = svmla_f32_x(pred_full, v285, v246, v765); svfloat32_t v287 = svmls_f32_x(pred_full, v285, v246, v765); - svfloat32_t v343; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v339), "w"(v341)); - svfloat32_t v344; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v339), "w"(v341)); - svfloat32_t v345; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v345) : "w"(v340), "w"(v342)); - svfloat32_t v346; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v346) : "w"(v340), "w"(v342)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v375)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v391), "w"(v375)); + svfloat32_t v343 = svadd_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v344 = svsub_f32_x(svptrue_b32(), v339, v341); + svfloat32_t v345 = svadd_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v346 = svsub_f32_x(svptrue_b32(), v340, v342); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v391, v375); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v391, v375); svint16_t v412 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v401, (float)(1ULL << 31ULL)))), @@ -12916,36 +8115,22 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v782), svreinterpret_u64_s16(v404)); svst1w_u64(pred_full, (unsigned *)(v800), svreinterpret_u64_s16(v420)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v233), "w"(v235)); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v233), "w"(v235)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v234), "w"(v236)); - svfloat32_t v240; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v234), "w"(v236)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v286), "w"(v288)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v286), 
"w"(v288)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v287), "w"(v289)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v287), "w"(v289)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v392), "w"(v394)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v392), "w"(v394)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v393), "w"(v395)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v393), "w"(v395)); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v396 = svadd_f32_x(svptrue_b32(), v392, v394); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v392, v394); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v393, v395); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v393, v395); svst1w_u64(pred_full, (unsigned *)(v791), svreinterpret_u64_s16(v412)); svst1w_u64(pred_full, (unsigned *)(v809), svreinterpret_u64_s16(v428)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v344), "w"(v397)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v344), "w"(v397)); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v344, v397); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v344, v397); svint16_t v438 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v238, (float)(1ULL << 31ULL)))), @@ -12956,10 +8141,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v291, 
(float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v468; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v346), "w"(v399)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v346), "w"(v399)); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v346, v399); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v346, v399); svint16_t v472 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v240, (float)(1ULL << 31ULL)))), @@ -12970,10 +8153,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v293, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v345), "w"(v398)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v345), "w"(v398)); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v345, v398); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v345, v398); svint16_t v506 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v239, (float)(1ULL << 31ULL)))), @@ -12984,10 +8165,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu20(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v292, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v536; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v343), "w"(v396)); - svfloat32_t v537; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v343), "w"(v396)); + svfloat32_t v536 = svadd_f32_x(svptrue_b32(), v343, v396); + svfloat32_t v537 = svsub_f32_x(svptrue_b32(), v343, v396); svint16_t v540 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v237, (float)(1ULL << 31ULL)))), @@ -13068,764 +8247,359 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const 
armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v660 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v212 = -1.1666666666666665e+00F; - float v217 = 7.9015646852540022e-01F; - float v222 = 5.5854267289647742e-02F; - float v227 = 7.3430220123575241e-01F; - float v231 = 4.4095855184409838e-01F; - float v232 = -4.4095855184409838e-01F; - float v239 = 3.4087293062393137e-01F; - float v240 = -3.4087293062393137e-01F; - float v247 = -5.3396936033772524e-01F; - float v248 = 5.3396936033772524e-01F; - float v255 = 8.7484229096165667e-01F; - float v256 = -8.7484229096165667e-01F; - float v300 = -1.4999999999999998e+00F; - float v305 = 1.7499999999999996e+00F; - float v310 = -1.1852347027881001e+00F; - float v315 = -8.3781400934471603e-02F; - float v320 = -1.1014533018536286e+00F; - float v324 = -6.6143782776614746e-01F; - float v325 = 6.6143782776614746e-01F; - float v332 = -5.1130939593589697e-01F; - float v333 = 5.1130939593589697e-01F; - float v340 = 8.0095404050658769e-01F; - float v341 = -8.0095404050658769e-01F; - float v348 = -1.3122634364424848e+00F; - float v349 = 1.3122634364424848e+00F; - float v392 = 8.6602540378443871e-01F; - float v393 = -8.6602540378443871e-01F; - float v400 = -1.0103629710818451e+00F; - float v401 = 1.0103629710818451e+00F; - float v408 = 6.8429557470759583e-01F; - float v409 = -6.8429557470759583e-01F; - float v416 = 4.8371214382601155e-02F; - float v417 = -4.8371214382601155e-02F; - float v424 = 6.3592436032499466e-01F; - float v425 = -6.3592436032499466e-01F; - float32x2_t v427 = (float32x2_t){v4, v4}; - float v433 = -3.8188130791298663e-01F; - float v438 = -2.9520461738277515e-01F; - float v443 = 4.6243103089499693e-01F; - float v448 = -7.5763564827777208e-01F; - const float32x2_t *v1342 = &v5[istride]; - int32_t *v1433 = &v6[ostride]; - float32x2_t v213 = (float32x2_t){v212, v212}; - float32x2_t v218 = 
(float32x2_t){v217, v217}; - float32x2_t v223 = (float32x2_t){v222, v222}; - float32x2_t v228 = (float32x2_t){v227, v227}; - float32x2_t v233 = (float32x2_t){v231, v232}; - float32x2_t v241 = (float32x2_t){v239, v240}; - float32x2_t v249 = (float32x2_t){v247, v248}; - float32x2_t v257 = (float32x2_t){v255, v256}; - float32x2_t v301 = (float32x2_t){v300, v300}; - float32x2_t v306 = (float32x2_t){v305, v305}; - float32x2_t v311 = (float32x2_t){v310, v310}; - float32x2_t v316 = (float32x2_t){v315, v315}; - float32x2_t v321 = (float32x2_t){v320, v320}; - float32x2_t v326 = (float32x2_t){v324, v325}; - float32x2_t v334 = (float32x2_t){v332, v333}; - float32x2_t v342 = (float32x2_t){v340, v341}; + for (int j = 0; j < howmany; j += 1) { + float32x2_t v110 = v5[istride]; + float v164 = -1.1666666666666665e+00F; + float v168 = 7.9015646852540022e-01F; + float v172 = 5.5854267289647742e-02F; + float v176 = 7.3430220123575241e-01F; + float v179 = 4.4095855184409838e-01F; + float v180 = -4.4095855184409838e-01F; + float v186 = 3.4087293062393137e-01F; + float v187 = -3.4087293062393137e-01F; + float v193 = -5.3396936033772524e-01F; + float v194 = 5.3396936033772524e-01F; + float v200 = 8.7484229096165667e-01F; + float v201 = -8.7484229096165667e-01F; + float v244 = -1.4999999999999998e+00F; + float v248 = 1.7499999999999996e+00F; + float v252 = -1.1852347027881001e+00F; + float v256 = -8.3781400934471603e-02F; + float v260 = -1.1014533018536286e+00F; + float v263 = -6.6143782776614746e-01F; + float v264 = 6.6143782776614746e-01F; + float v270 = -5.1130939593589697e-01F; + float v271 = 5.1130939593589697e-01F; + float v277 = 8.0095404050658769e-01F; + float v278 = -8.0095404050658769e-01F; + float v284 = -1.3122634364424848e+00F; + float v285 = 1.3122634364424848e+00F; + float v327 = 8.6602540378443871e-01F; + float v328 = -8.6602540378443871e-01F; + float v334 = -1.0103629710818451e+00F; + float v335 = 1.0103629710818451e+00F; + float v341 = 6.8429557470759583e-01F; + float 
v342 = -6.8429557470759583e-01F; + float v348 = 4.8371214382601155e-02F; + float v349 = -4.8371214382601155e-02F; + float v355 = 6.3592436032499466e-01F; + float v356 = -6.3592436032499466e-01F; + float32x2_t v358 = (float32x2_t){v4, v4}; + float v363 = -3.8188130791298663e-01F; + float v367 = -2.9520461738277515e-01F; + float v371 = 4.6243103089499693e-01F; + float v375 = -7.5763564827777208e-01F; + float32x2_t v32 = v5[0]; + float32x2_t v165 = (float32x2_t){v164, v164}; + float32x2_t v169 = (float32x2_t){v168, v168}; + float32x2_t v173 = (float32x2_t){v172, v172}; + float32x2_t v177 = (float32x2_t){v176, v176}; + float32x2_t v181 = (float32x2_t){v179, v180}; + float32x2_t v188 = (float32x2_t){v186, v187}; + float32x2_t v195 = (float32x2_t){v193, v194}; + float32x2_t v202 = (float32x2_t){v200, v201}; + float32x2_t v245 = (float32x2_t){v244, v244}; + float32x2_t v249 = (float32x2_t){v248, v248}; + float32x2_t v253 = (float32x2_t){v252, v252}; + float32x2_t v257 = (float32x2_t){v256, v256}; + float32x2_t v261 = (float32x2_t){v260, v260}; + float32x2_t v265 = (float32x2_t){v263, v264}; + float32x2_t v272 = (float32x2_t){v270, v271}; + float32x2_t v279 = (float32x2_t){v277, v278}; + float32x2_t v286 = (float32x2_t){v284, v285}; + float32x2_t v329 = (float32x2_t){v327, v328}; + float32x2_t v336 = (float32x2_t){v334, v335}; + float32x2_t v343 = (float32x2_t){v341, v342}; float32x2_t v350 = (float32x2_t){v348, v349}; - float32x2_t v394 = (float32x2_t){v392, v393}; - float32x2_t v402 = (float32x2_t){v400, v401}; - float32x2_t v410 = (float32x2_t){v408, v409}; - float32x2_t v418 = (float32x2_t){v416, v417}; - float32x2_t v426 = (float32x2_t){v424, v425}; - float32x2_t v434 = (float32x2_t){v433, v433}; - float32x2_t v439 = (float32x2_t){v438, v438}; - float32x2_t v444 = (float32x2_t){v443, v443}; - float32x2_t v449 = (float32x2_t){v448, v448}; - const float32x2_t *v1225 = &v5[0]; - int32_t *v1397 = &v6[0]; - float32x4_t v1611 = vld1q_f32((const float32_t *)v1342); - 
float32x4_t v214 = vcombine_f32(v213, v213); - float32x4_t v219 = vcombine_f32(v218, v218); - float32x4_t v224 = vcombine_f32(v223, v223); - float32x4_t v229 = vcombine_f32(v228, v228); - float32x2_t v235 = vmul_f32(v427, v233); - float32x2_t v243 = vmul_f32(v427, v241); - float32x2_t v251 = vmul_f32(v427, v249); - float32x2_t v259 = vmul_f32(v427, v257); - float32x4_t v302 = vcombine_f32(v301, v301); - float32x4_t v307 = vcombine_f32(v306, v306); - float32x4_t v312 = vcombine_f32(v311, v311); - float32x4_t v317 = vcombine_f32(v316, v316); - float32x4_t v322 = vcombine_f32(v321, v321); - float32x2_t v328 = vmul_f32(v427, v326); - float32x2_t v336 = vmul_f32(v427, v334); - float32x2_t v344 = vmul_f32(v427, v342); - float32x2_t v352 = vmul_f32(v427, v350); - float32x2_t v396 = vmul_f32(v427, v394); - float32x2_t v404 = vmul_f32(v427, v402); - float32x2_t v412 = vmul_f32(v427, v410); - float32x2_t v420 = vmul_f32(v427, v418); - float32x2_t v428 = vmul_f32(v427, v426); - float32x4_t v435 = vcombine_f32(v434, v434); - float32x4_t v440 = vcombine_f32(v439, v439); - float32x4_t v445 = vcombine_f32(v444, v444); - float32x4_t v450 = vcombine_f32(v449, v449); - const float32x2_t *v1206 = &v5[istride * 7]; - const float32x2_t *v1215 = &v5[istride * 14]; - const float32x2_t *v1234 = &v5[istride * 10]; - const float32x2_t *v1243 = &v5[istride * 17]; - const float32x2_t *v1252 = &v5[istride * 3]; - const float32x2_t *v1261 = &v5[istride * 13]; - const float32x2_t *v1270 = &v5[istride * 20]; - const float32x2_t *v1279 = &v5[istride * 6]; - const float32x2_t *v1288 = &v5[istride * 16]; - const float32x2_t *v1297 = &v5[istride * 2]; - const float32x2_t *v1306 = &v5[istride * 9]; - const float32x2_t *v1315 = &v5[istride * 19]; - const float32x2_t *v1324 = &v5[istride * 5]; - const float32x2_t *v1333 = &v5[istride * 12]; - const float32x2_t *v1351 = &v5[istride * 8]; - const float32x2_t *v1360 = &v5[istride * 15]; - const float32x2_t *v1369 = &v5[istride * 4]; - const float32x2_t 
*v1378 = &v5[istride * 11]; - const float32x2_t *v1387 = &v5[istride * 18]; - int32_t *v1406 = &v6[ostride * 7]; - int32_t *v1415 = &v6[ostride * 14]; - int32_t *v1424 = &v6[ostride * 15]; - int32_t *v1442 = &v6[ostride * 8]; - int32_t *v1451 = &v6[ostride * 9]; - int32_t *v1460 = &v6[ostride * 16]; - int32_t *v1469 = &v6[ostride * 2]; - int32_t *v1478 = &v6[ostride * 3]; - int32_t *v1487 = &v6[ostride * 10]; - int32_t *v1496 = &v6[ostride * 17]; - int32_t *v1505 = &v6[ostride * 18]; - int32_t *v1514 = &v6[ostride * 4]; - int32_t *v1523 = &v6[ostride * 11]; - int32_t *v1532 = &v6[ostride * 12]; - int32_t *v1541 = &v6[ostride * 19]; - int32_t *v1550 = &v6[ostride * 5]; - int32_t *v1559 = &v6[ostride * 6]; - int32_t *v1568 = &v6[ostride * 13]; - int32_t *v1577 = &v6[ostride * 20]; - float32x4_t v1585 = vld1q_f32((const float32_t *)v1225); - float32x4_t v237 = vcombine_f32(v235, v235); - float32x4_t v245 = vcombine_f32(v243, v243); - float32x4_t v253 = vcombine_f32(v251, v251); - float32x4_t v261 = vcombine_f32(v259, v259); - float32x4_t v330 = vcombine_f32(v328, v328); - float32x4_t v338 = vcombine_f32(v336, v336); - float32x4_t v346 = vcombine_f32(v344, v344); - float32x4_t v354 = vcombine_f32(v352, v352); - float32x4_t v398 = vcombine_f32(v396, v396); - float32x4_t v406 = vcombine_f32(v404, v404); - float32x4_t v414 = vcombine_f32(v412, v412); - float32x4_t v422 = vcombine_f32(v420, v420); - float32x4_t v430 = vcombine_f32(v428, v428); - float32x4_t v1581 = vld1q_f32((const float32_t *)v1206); - float32x4_t v1583 = vld1q_f32((const float32_t *)v1215); - float32x4_t v1587 = vld1q_f32((const float32_t *)v1234); - float32x4_t v1589 = vld1q_f32((const float32_t *)v1243); - float32x4_t v1591 = vld1q_f32((const float32_t *)v1252); - float32x4_t v1593 = vld1q_f32((const float32_t *)v1261); - float32x4_t v1595 = vld1q_f32((const float32_t *)v1270); - float32x4_t v1597 = vld1q_f32((const float32_t *)v1279); - float32x4_t v1599 = vld1q_f32((const float32_t *)v1288); - 
float32x4_t v1601 = vld1q_f32((const float32_t *)v1297); - float32x4_t v1603 = vld1q_f32((const float32_t *)v1306); - float32x4_t v1605 = vld1q_f32((const float32_t *)v1315); - float32x4_t v1607 = vld1q_f32((const float32_t *)v1324); - float32x4_t v1609 = vld1q_f32((const float32_t *)v1333); - float32x4_t v1613 = vld1q_f32((const float32_t *)v1351); - float32x4_t v1615 = vld1q_f32((const float32_t *)v1360); - float32x4_t v1617 = vld1q_f32((const float32_t *)v1369); - float32x4_t v1619 = vld1q_f32((const float32_t *)v1378); - float32x4_t v1621 = vld1q_f32((const float32_t *)v1387); - float32x4_t v35 = vaddq_f32(v1581, v1583); - float32x4_t v36 = vsubq_f32(v1581, v1583); - float32x4_t v59 = vaddq_f32(v1587, v1589); - float32x4_t v60 = vsubq_f32(v1587, v1589); - float32x4_t v83 = vaddq_f32(v1593, v1595); - float32x4_t v84 = vsubq_f32(v1593, v1595); - float32x4_t v107 = vaddq_f32(v1599, v1601); - float32x4_t v108 = vsubq_f32(v1599, v1601); - float32x4_t v131 = vaddq_f32(v1605, v1607); - float32x4_t v132 = vsubq_f32(v1605, v1607); - float32x4_t v155 = vaddq_f32(v1611, v1613); - float32x4_t v156 = vsubq_f32(v1611, v1613); - float32x4_t v179 = vaddq_f32(v1617, v1619); - float32x4_t v180 = vsubq_f32(v1617, v1619); - float32x4_t v44 = vaddq_f32(v35, v1585); - float32x4_t v68 = vaddq_f32(v59, v1591); - float32x4_t v92 = vaddq_f32(v83, v1597); - float32x4_t v116 = vaddq_f32(v107, v1603); - float32x4_t v140 = vaddq_f32(v131, v1609); - float32x4_t v164 = vaddq_f32(v155, v1615); - float32x4_t v188 = vaddq_f32(v179, v1621); - float32x4_t v282 = vaddq_f32(v59, v179); - float32x4_t v283 = vsubq_f32(v59, v179); - float32x4_t v284 = vaddq_f32(v131, v107); - float32x4_t v285 = vsubq_f32(v131, v107); - float32x4_t v286 = vaddq_f32(v83, v155); - float32x4_t v287 = vsubq_f32(v83, v155); - float32x4_t v375 = vaddq_f32(v60, v180); - float32x4_t v376 = vsubq_f32(v60, v180); - float32x4_t v377 = vaddq_f32(v132, v108); - float32x4_t v378 = vsubq_f32(v132, v108); - float32x4_t v379 = 
vaddq_f32(v84, v156); - float32x4_t v380 = vsubq_f32(v84, v156); - float32x4_t v189 = vaddq_f32(v68, v188); - float32x4_t v190 = vsubq_f32(v68, v188); - float32x4_t v191 = vaddq_f32(v140, v116); - float32x4_t v192 = vsubq_f32(v140, v116); - float32x4_t v193 = vaddq_f32(v92, v164); - float32x4_t v194 = vsubq_f32(v92, v164); - float32x4_t v288 = vaddq_f32(v282, v284); - float32x4_t v291 = vsubq_f32(v282, v284); - float32x4_t v292 = vsubq_f32(v284, v286); - float32x4_t v293 = vsubq_f32(v286, v282); - float32x4_t v294 = vaddq_f32(v283, v285); - float32x4_t v296 = vsubq_f32(v283, v285); - float32x4_t v297 = vsubq_f32(v285, v287); - float32x4_t v298 = vsubq_f32(v287, v283); - float32x4_t v381 = vaddq_f32(v375, v377); - float32x4_t v384 = vsubq_f32(v375, v377); - float32x4_t v385 = vsubq_f32(v377, v379); - float32x4_t v386 = vsubq_f32(v379, v375); - float32x4_t v387 = vaddq_f32(v376, v378); - float32x4_t v389 = vsubq_f32(v376, v378); - float32x4_t v390 = vsubq_f32(v378, v380); - float32x4_t v391 = vsubq_f32(v380, v376); - float32x4_t v195 = vaddq_f32(v189, v191); - float32x4_t v198 = vsubq_f32(v189, v191); - float32x4_t v199 = vsubq_f32(v191, v193); - float32x4_t v200 = vsubq_f32(v193, v189); - float32x4_t v201 = vaddq_f32(v190, v192); - float32x4_t v203 = vsubq_f32(v190, v192); - float32x4_t v204 = vsubq_f32(v192, v194); - float32x4_t v205 = vsubq_f32(v194, v190); - float32x4_t v289 = vaddq_f32(v288, v286); - float32x4_t v295 = vaddq_f32(v294, v287); - float32x4_t v313 = vmulq_f32(v291, v312); - float32x4_t v318 = vmulq_f32(v292, v317); - float32x4_t v323 = vmulq_f32(v293, v322); - float32x4_t v337 = vrev64q_f32(v296); - float32x4_t v345 = vrev64q_f32(v297); - float32x4_t v353 = vrev64q_f32(v298); - float32x4_t v382 = vaddq_f32(v381, v379); - float32x4_t v388 = vaddq_f32(v387, v380); - float32x4_t v413 = vrev64q_f32(v384); - float32x4_t v421 = vrev64q_f32(v385); - float32x4_t v429 = vrev64q_f32(v386); - float32x4_t v441 = vmulq_f32(v389, v440); - float32x4_t v446 = 
vmulq_f32(v390, v445); - float32x4_t v451 = vmulq_f32(v391, v450); - float32x4_t v196 = vaddq_f32(v195, v193); - float32x4_t v202 = vaddq_f32(v201, v194); - float32x4_t v220 = vmulq_f32(v198, v219); - float32x4_t v225 = vmulq_f32(v199, v224); - float32x4_t v230 = vmulq_f32(v200, v229); - float32x4_t v244 = vrev64q_f32(v203); - float32x4_t v252 = vrev64q_f32(v204); - float32x4_t v260 = vrev64q_f32(v205); - float32x4_t v290 = vaddq_f32(v289, v35); - float32x4_t v308 = vmulq_f32(v289, v307); - float32x4_t v329 = vrev64q_f32(v295); - float32x4_t v339 = vmulq_f32(v337, v338); - float32x4_t v347 = vmulq_f32(v345, v346); - float32x4_t v355 = vmulq_f32(v353, v354); - float32x4_t v383 = vaddq_f32(v382, v36); - float32x4_t v405 = vrev64q_f32(v382); - float32x4_t v415 = vmulq_f32(v413, v414); - float32x4_t v423 = vmulq_f32(v421, v422); - float32x4_t v431 = vmulq_f32(v429, v430); - float32x4_t v436 = vmulq_f32(v388, v435); - float32x4_t v197 = vaddq_f32(v196, v44); - float32x4_t v215 = vmulq_f32(v196, v214); - float32x4_t v236 = vrev64q_f32(v202); - float32x4_t v246 = vmulq_f32(v244, v245); - float32x4_t v254 = vmulq_f32(v252, v253); - float32x4_t v262 = vmulq_f32(v260, v261); - float32x4_t v303 = vmulq_f32(v290, v302); - float32x4_t v331 = vmulq_f32(v329, v330); - float32x4_t v397 = vrev64q_f32(v383); - float32x4_t v407 = vmulq_f32(v405, v406); - float32x4_t v459 = vaddq_f32(v436, v441); - float32x4_t v461 = vsubq_f32(v436, v441); - float32x4_t v463 = vsubq_f32(v436, v446); - float32x4_t v238 = vmulq_f32(v236, v237); - float32x4_t v263 = vaddq_f32(v197, v215); - float32x4_t v356 = vaddq_f32(v303, v308); - float32x4_t v363 = vaddq_f32(v331, v339); - float32x4_t v365 = vsubq_f32(v331, v339); - float32x4_t v367 = vsubq_f32(v331, v347); - float32x4_t v399 = vmulq_f32(v397, v398); - float32x4_t v460 = vaddq_f32(v459, v446); - float32x4_t v462 = vsubq_f32(v461, v451); - float32x4_t v464 = vaddq_f32(v463, v451); - float32x4_t v471 = vaddq_f32(v197, v303); - int16x4_t v476 = 
vqmovn_s32(vcvtq_n_s32_f32(v197, 15)); - float32x4_t v264 = vaddq_f32(v263, v220); - float32x4_t v266 = vsubq_f32(v263, v220); - float32x4_t v268 = vsubq_f32(v263, v225); - float32x4_t v270 = vaddq_f32(v238, v246); - float32x4_t v272 = vsubq_f32(v238, v246); - float32x4_t v274 = vsubq_f32(v238, v254); - float32x4_t v357 = vaddq_f32(v356, v313); - float32x4_t v359 = vsubq_f32(v356, v313); - float32x4_t v361 = vsubq_f32(v356, v318); - float32x4_t v364 = vaddq_f32(v363, v347); - float32x4_t v366 = vsubq_f32(v365, v355); - float32x4_t v368 = vaddq_f32(v367, v355); - float32x4_t v452 = vaddq_f32(v399, v407); - float32x4_t v472 = vaddq_f32(v471, v399); - float32x4_t v473 = vsubq_f32(v471, v399); - vst1_s16((int16_t *)v1397, v476); - float32x4_t v265 = vaddq_f32(v264, v225); - float32x4_t v267 = vsubq_f32(v266, v230); - float32x4_t v269 = vaddq_f32(v268, v230); - float32x4_t v271 = vaddq_f32(v270, v254); - float32x4_t v273 = vsubq_f32(v272, v262); - float32x4_t v275 = vaddq_f32(v274, v262); - float32x4_t v358 = vaddq_f32(v357, v318); - float32x4_t v360 = vsubq_f32(v359, v323); - float32x4_t v362 = vaddq_f32(v361, v323); - float32x4_t v453 = vaddq_f32(v452, v415); - float32x4_t v455 = vsubq_f32(v452, v415); - float32x4_t v457 = vsubq_f32(v452, v423); - int16x4_t v484 = vqmovn_s32(vcvtq_n_s32_f32(v473, 15)); - int16x4_t v492 = vqmovn_s32(vcvtq_n_s32_f32(v472, 15)); - float32x4_t v276 = vaddq_f32(v265, v271); - float32x4_t v277 = vsubq_f32(v265, v271); - float32x4_t v278 = vaddq_f32(v267, v273); - float32x4_t v279 = vsubq_f32(v267, v273); - float32x4_t v280 = vaddq_f32(v269, v275); - float32x4_t v281 = vsubq_f32(v269, v275); - float32x4_t v369 = vaddq_f32(v358, v364); - float32x4_t v370 = vsubq_f32(v358, v364); - float32x4_t v371 = vaddq_f32(v360, v366); - float32x4_t v372 = vsubq_f32(v360, v366); - float32x4_t v373 = vaddq_f32(v362, v368); - float32x4_t v374 = vsubq_f32(v362, v368); - float32x4_t v454 = vaddq_f32(v453, v423); - float32x4_t v456 = vsubq_f32(v455, v431); - 
float32x4_t v458 = vaddq_f32(v457, v431); - vst1_s16((int16_t *)v1406, v484); - vst1_s16((int16_t *)v1415, v492); - float32x4_t v465 = vaddq_f32(v454, v460); - float32x4_t v466 = vsubq_f32(v454, v460); - float32x4_t v467 = vaddq_f32(v456, v462); - float32x4_t v468 = vsubq_f32(v456, v462); - float32x4_t v469 = vaddq_f32(v458, v464); - float32x4_t v470 = vsubq_f32(v458, v464); - float32x4_t v498 = vaddq_f32(v277, v370); - int16x4_t v503 = vqmovn_s32(vcvtq_n_s32_f32(v277, 15)); - float32x4_t v525 = vaddq_f32(v279, v372); - int16x4_t v530 = vqmovn_s32(vcvtq_n_s32_f32(v279, 15)); - float32x4_t v552 = vaddq_f32(v280, v373); - int16x4_t v557 = vqmovn_s32(vcvtq_n_s32_f32(v280, 15)); - float32x4_t v579 = vaddq_f32(v281, v374); - int16x4_t v584 = vqmovn_s32(vcvtq_n_s32_f32(v281, 15)); - float32x4_t v606 = vaddq_f32(v278, v371); - int16x4_t v611 = vqmovn_s32(vcvtq_n_s32_f32(v278, 15)); - float32x4_t v633 = vaddq_f32(v276, v369); - int16x4_t v638 = vqmovn_s32(vcvtq_n_s32_f32(v276, 15)); - float32x4_t v499 = vaddq_f32(v498, v466); - float32x4_t v500 = vsubq_f32(v498, v466); - float32x4_t v526 = vaddq_f32(v525, v468); - float32x4_t v527 = vsubq_f32(v525, v468); - float32x4_t v553 = vaddq_f32(v552, v469); - float32x4_t v554 = vsubq_f32(v552, v469); - float32x4_t v580 = vaddq_f32(v579, v470); - float32x4_t v581 = vsubq_f32(v579, v470); - float32x4_t v607 = vaddq_f32(v606, v467); - float32x4_t v608 = vsubq_f32(v606, v467); - float32x4_t v634 = vaddq_f32(v633, v465); - float32x4_t v635 = vsubq_f32(v633, v465); - vst1_s16((int16_t *)v1424, v503); - vst1_s16((int16_t *)v1451, v530); - vst1_s16((int16_t *)v1478, v557); - vst1_s16((int16_t *)v1505, v584); - vst1_s16((int16_t *)v1532, v611); - vst1_s16((int16_t *)v1559, v638); - int16x4_t v511 = vqmovn_s32(vcvtq_n_s32_f32(v500, 15)); - int16x4_t v519 = vqmovn_s32(vcvtq_n_s32_f32(v499, 15)); - int16x4_t v538 = vqmovn_s32(vcvtq_n_s32_f32(v527, 15)); - int16x4_t v546 = vqmovn_s32(vcvtq_n_s32_f32(v526, 15)); - int16x4_t v565 = 
vqmovn_s32(vcvtq_n_s32_f32(v554, 15)); - int16x4_t v573 = vqmovn_s32(vcvtq_n_s32_f32(v553, 15)); - int16x4_t v592 = vqmovn_s32(vcvtq_n_s32_f32(v581, 15)); - int16x4_t v600 = vqmovn_s32(vcvtq_n_s32_f32(v580, 15)); - int16x4_t v619 = vqmovn_s32(vcvtq_n_s32_f32(v608, 15)); - int16x4_t v627 = vqmovn_s32(vcvtq_n_s32_f32(v607, 15)); - int16x4_t v646 = vqmovn_s32(vcvtq_n_s32_f32(v635, 15)); - int16x4_t v654 = vqmovn_s32(vcvtq_n_s32_f32(v634, 15)); - vst1_s16((int16_t *)v1433, v511); - vst1_s16((int16_t *)v1442, v519); - vst1_s16((int16_t *)v1460, v538); - vst1_s16((int16_t *)v1469, v546); - vst1_s16((int16_t *)v1487, v565); - vst1_s16((int16_t *)v1496, v573); - vst1_s16((int16_t *)v1514, v592); - vst1_s16((int16_t *)v1523, v600); - vst1_s16((int16_t *)v1541, v619); - vst1_s16((int16_t *)v1550, v627); - vst1_s16((int16_t *)v1568, v646); - vst1_s16((int16_t *)v1577, v654); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v660 * 2; j < howmany; j += 1) { - float32x2_t v762 = v5[istride]; - float v816 = -1.1666666666666665e+00F; - float v820 = 7.9015646852540022e-01F; - float v824 = 5.5854267289647742e-02F; - float v828 = 7.3430220123575241e-01F; - float v831 = 4.4095855184409838e-01F; - float v832 = -4.4095855184409838e-01F; - float v838 = 3.4087293062393137e-01F; - float v839 = -3.4087293062393137e-01F; - float v845 = -5.3396936033772524e-01F; - float v846 = 5.3396936033772524e-01F; - float v852 = 8.7484229096165667e-01F; - float v853 = -8.7484229096165667e-01F; - float v896 = -1.4999999999999998e+00F; - float v900 = 1.7499999999999996e+00F; - float v904 = -1.1852347027881001e+00F; - float v908 = -8.3781400934471603e-02F; - float v912 = -1.1014533018536286e+00F; - float v915 = -6.6143782776614746e-01F; - float v916 = 6.6143782776614746e-01F; - float v922 = -5.1130939593589697e-01F; - float v923 = 5.1130939593589697e-01F; - float v929 = 8.0095404050658769e-01F; - float v930 = -8.0095404050658769e-01F; - float v936 = -1.3122634364424848e+00F; - float v937 = 
1.3122634364424848e+00F; - float v979 = 8.6602540378443871e-01F; - float v980 = -8.6602540378443871e-01F; - float v986 = -1.0103629710818451e+00F; - float v987 = 1.0103629710818451e+00F; - float v993 = 6.8429557470759583e-01F; - float v994 = -6.8429557470759583e-01F; - float v1000 = 4.8371214382601155e-02F; - float v1001 = -4.8371214382601155e-02F; - float v1007 = 6.3592436032499466e-01F; - float v1008 = -6.3592436032499466e-01F; - float32x2_t v1010 = (float32x2_t){v4, v4}; - float v1015 = -3.8188130791298663e-01F; - float v1019 = -2.9520461738277515e-01F; - float v1023 = 4.6243103089499693e-01F; - float v1027 = -7.5763564827777208e-01F; - float32x2_t v684 = v5[0]; - float32x2_t v817 = (float32x2_t){v816, v816}; - float32x2_t v821 = (float32x2_t){v820, v820}; - float32x2_t v825 = (float32x2_t){v824, v824}; - float32x2_t v829 = (float32x2_t){v828, v828}; - float32x2_t v833 = (float32x2_t){v831, v832}; - float32x2_t v840 = (float32x2_t){v838, v839}; - float32x2_t v847 = (float32x2_t){v845, v846}; - float32x2_t v854 = (float32x2_t){v852, v853}; - float32x2_t v897 = (float32x2_t){v896, v896}; - float32x2_t v901 = (float32x2_t){v900, v900}; - float32x2_t v905 = (float32x2_t){v904, v904}; - float32x2_t v909 = (float32x2_t){v908, v908}; - float32x2_t v913 = (float32x2_t){v912, v912}; - float32x2_t v917 = (float32x2_t){v915, v916}; - float32x2_t v924 = (float32x2_t){v922, v923}; - float32x2_t v931 = (float32x2_t){v929, v930}; - float32x2_t v938 = (float32x2_t){v936, v937}; - float32x2_t v981 = (float32x2_t){v979, v980}; - float32x2_t v988 = (float32x2_t){v986, v987}; - float32x2_t v995 = (float32x2_t){v993, v994}; - float32x2_t v1002 = (float32x2_t){v1000, v1001}; - float32x2_t v1009 = (float32x2_t){v1007, v1008}; - float32x2_t v1016 = (float32x2_t){v1015, v1015}; - float32x2_t v1020 = (float32x2_t){v1019, v1019}; - float32x2_t v1024 = (float32x2_t){v1023, v1023}; - float32x2_t v1028 = (float32x2_t){v1027, v1027}; - float32x2_t v672 = v5[istride * 7]; - float32x2_t v677 = 
v5[istride * 14]; - float32x2_t v690 = v5[istride * 10]; - float32x2_t v695 = v5[istride * 17]; - float32x2_t v702 = v5[istride * 3]; - float32x2_t v708 = v5[istride * 13]; - float32x2_t v713 = v5[istride * 20]; - float32x2_t v720 = v5[istride * 6]; - float32x2_t v726 = v5[istride * 16]; - float32x2_t v731 = v5[istride * 2]; - float32x2_t v738 = v5[istride * 9]; - float32x2_t v744 = v5[istride * 19]; - float32x2_t v749 = v5[istride * 5]; - float32x2_t v756 = v5[istride * 12]; - float32x2_t v767 = v5[istride * 8]; - float32x2_t v774 = v5[istride * 15]; - float32x2_t v780 = v5[istride * 4]; - float32x2_t v785 = v5[istride * 11]; - float32x2_t v792 = v5[istride * 18]; - float32x2_t v835 = vmul_f32(v1010, v833); - float32x2_t v842 = vmul_f32(v1010, v840); - float32x2_t v849 = vmul_f32(v1010, v847); - float32x2_t v856 = vmul_f32(v1010, v854); - float32x2_t v919 = vmul_f32(v1010, v917); - float32x2_t v926 = vmul_f32(v1010, v924); - float32x2_t v933 = vmul_f32(v1010, v931); - float32x2_t v940 = vmul_f32(v1010, v938); - float32x2_t v983 = vmul_f32(v1010, v981); - float32x2_t v990 = vmul_f32(v1010, v988); - float32x2_t v997 = vmul_f32(v1010, v995); - float32x2_t v1004 = vmul_f32(v1010, v1002); - float32x2_t v1011 = vmul_f32(v1010, v1009); - float32x2_t v678 = vadd_f32(v672, v677); - float32x2_t v679 = vsub_f32(v672, v677); - float32x2_t v696 = vadd_f32(v690, v695); - float32x2_t v697 = vsub_f32(v690, v695); - float32x2_t v714 = vadd_f32(v708, v713); - float32x2_t v715 = vsub_f32(v708, v713); - float32x2_t v732 = vadd_f32(v726, v731); - float32x2_t v733 = vsub_f32(v726, v731); - float32x2_t v750 = vadd_f32(v744, v749); - float32x2_t v751 = vsub_f32(v744, v749); - float32x2_t v768 = vadd_f32(v762, v767); - float32x2_t v769 = vsub_f32(v762, v767); - float32x2_t v786 = vadd_f32(v780, v785); - float32x2_t v787 = vsub_f32(v780, v785); - float32x2_t v685 = vadd_f32(v678, v684); - float32x2_t v703 = vadd_f32(v696, v702); - float32x2_t v721 = vadd_f32(v714, v720); - float32x2_t v739 
= vadd_f32(v732, v738); - float32x2_t v757 = vadd_f32(v750, v756); - float32x2_t v775 = vadd_f32(v768, v774); - float32x2_t v793 = vadd_f32(v786, v792); - float32x2_t v878 = vadd_f32(v696, v786); - float32x2_t v879 = vsub_f32(v696, v786); - float32x2_t v880 = vadd_f32(v750, v732); - float32x2_t v881 = vsub_f32(v750, v732); - float32x2_t v882 = vadd_f32(v714, v768); - float32x2_t v883 = vsub_f32(v714, v768); - float32x2_t v962 = vadd_f32(v697, v787); - float32x2_t v963 = vsub_f32(v697, v787); - float32x2_t v964 = vadd_f32(v751, v733); - float32x2_t v965 = vsub_f32(v751, v733); - float32x2_t v966 = vadd_f32(v715, v769); - float32x2_t v967 = vsub_f32(v715, v769); - float32x2_t v794 = vadd_f32(v703, v793); - float32x2_t v795 = vsub_f32(v703, v793); - float32x2_t v796 = vadd_f32(v757, v739); - float32x2_t v797 = vsub_f32(v757, v739); - float32x2_t v798 = vadd_f32(v721, v775); - float32x2_t v799 = vsub_f32(v721, v775); - float32x2_t v884 = vadd_f32(v878, v880); - float32x2_t v887 = vsub_f32(v878, v880); - float32x2_t v888 = vsub_f32(v880, v882); - float32x2_t v889 = vsub_f32(v882, v878); - float32x2_t v890 = vadd_f32(v879, v881); - float32x2_t v892 = vsub_f32(v879, v881); - float32x2_t v893 = vsub_f32(v881, v883); - float32x2_t v894 = vsub_f32(v883, v879); - float32x2_t v968 = vadd_f32(v962, v964); - float32x2_t v971 = vsub_f32(v962, v964); - float32x2_t v972 = vsub_f32(v964, v966); - float32x2_t v973 = vsub_f32(v966, v962); - float32x2_t v974 = vadd_f32(v963, v965); - float32x2_t v976 = vsub_f32(v963, v965); - float32x2_t v977 = vsub_f32(v965, v967); - float32x2_t v978 = vsub_f32(v967, v963); - float32x2_t v800 = vadd_f32(v794, v796); - float32x2_t v803 = vsub_f32(v794, v796); - float32x2_t v804 = vsub_f32(v796, v798); - float32x2_t v805 = vsub_f32(v798, v794); - float32x2_t v806 = vadd_f32(v795, v797); - float32x2_t v808 = vsub_f32(v795, v797); - float32x2_t v809 = vsub_f32(v797, v799); - float32x2_t v810 = vsub_f32(v799, v795); - float32x2_t v885 = vadd_f32(v884, 
v882); - float32x2_t v891 = vadd_f32(v890, v883); - float32x2_t v906 = vmul_f32(v887, v905); - float32x2_t v910 = vmul_f32(v888, v909); - float32x2_t v914 = vmul_f32(v889, v913); - float32x2_t v927 = vrev64_f32(v892); - float32x2_t v934 = vrev64_f32(v893); - float32x2_t v941 = vrev64_f32(v894); - float32x2_t v969 = vadd_f32(v968, v966); - float32x2_t v975 = vadd_f32(v974, v967); - float32x2_t v998 = vrev64_f32(v971); - float32x2_t v1005 = vrev64_f32(v972); - float32x2_t v1012 = vrev64_f32(v973); - float32x2_t v1021 = vmul_f32(v976, v1020); - float32x2_t v1025 = vmul_f32(v977, v1024); - float32x2_t v1029 = vmul_f32(v978, v1028); - float32x2_t v801 = vadd_f32(v800, v798); - float32x2_t v807 = vadd_f32(v806, v799); - float32x2_t v822 = vmul_f32(v803, v821); - float32x2_t v826 = vmul_f32(v804, v825); - float32x2_t v830 = vmul_f32(v805, v829); - float32x2_t v843 = vrev64_f32(v808); - float32x2_t v850 = vrev64_f32(v809); - float32x2_t v857 = vrev64_f32(v810); - float32x2_t v886 = vadd_f32(v885, v678); - float32x2_t v902 = vmul_f32(v885, v901); - float32x2_t v920 = vrev64_f32(v891); - float32x2_t v928 = vmul_f32(v927, v926); - float32x2_t v935 = vmul_f32(v934, v933); - float32x2_t v942 = vmul_f32(v941, v940); - float32x2_t v970 = vadd_f32(v969, v679); - float32x2_t v991 = vrev64_f32(v969); - float32x2_t v999 = vmul_f32(v998, v997); - float32x2_t v1006 = vmul_f32(v1005, v1004); - float32x2_t v1013 = vmul_f32(v1012, v1011); - float32x2_t v1017 = vmul_f32(v975, v1016); - float32x2_t v802 = vadd_f32(v801, v685); - float32x2_t v818 = vmul_f32(v801, v817); - float32x2_t v836 = vrev64_f32(v807); - float32x2_t v844 = vmul_f32(v843, v842); - float32x2_t v851 = vmul_f32(v850, v849); - float32x2_t v858 = vmul_f32(v857, v856); - float32x2_t v898 = vmul_f32(v886, v897); - float32x2_t v921 = vmul_f32(v920, v919); - float32x2_t v984 = vrev64_f32(v970); - float32x2_t v992 = vmul_f32(v991, v990); - float32x2_t v1037 = vadd_f32(v1017, v1021); - float32x2_t v1039 = vsub_f32(v1017, v1021); - 
float32x2_t v1041 = vsub_f32(v1017, v1025); - float32x2_t v837 = vmul_f32(v836, v835); - float32x2_t v859 = vadd_f32(v802, v818); - float32x2_t v943 = vadd_f32(v898, v902); - float32x2_t v950 = vadd_f32(v921, v928); - float32x2_t v952 = vsub_f32(v921, v928); - float32x2_t v954 = vsub_f32(v921, v935); - float32x2_t v985 = vmul_f32(v984, v983); - float32x2_t v1038 = vadd_f32(v1037, v1025); - float32x2_t v1040 = vsub_f32(v1039, v1029); - float32x2_t v1042 = vadd_f32(v1041, v1029); - float32x2_t v1049 = vadd_f32(v802, v898); - int16x4_t v1054 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v802, 15), (int32x2_t){0, 0})); - float32x2_t v860 = vadd_f32(v859, v822); - float32x2_t v862 = vsub_f32(v859, v822); - float32x2_t v864 = vsub_f32(v859, v826); - float32x2_t v866 = vadd_f32(v837, v844); - float32x2_t v868 = vsub_f32(v837, v844); - float32x2_t v870 = vsub_f32(v837, v851); - float32x2_t v944 = vadd_f32(v943, v906); - float32x2_t v946 = vsub_f32(v943, v906); - float32x2_t v948 = vsub_f32(v943, v910); - float32x2_t v951 = vadd_f32(v950, v935); - float32x2_t v953 = vsub_f32(v952, v942); - float32x2_t v955 = vadd_f32(v954, v942); - float32x2_t v1030 = vadd_f32(v985, v992); - float32x2_t v1050 = vadd_f32(v1049, v985); - float32x2_t v1051 = vsub_f32(v1049, v985); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1054), 0); - float32x2_t v861 = vadd_f32(v860, v826); - float32x2_t v863 = vsub_f32(v862, v830); - float32x2_t v865 = vadd_f32(v864, v830); - float32x2_t v867 = vadd_f32(v866, v851); - float32x2_t v869 = vsub_f32(v868, v858); - float32x2_t v871 = vadd_f32(v870, v858); - float32x2_t v945 = vadd_f32(v944, v910); - float32x2_t v947 = vsub_f32(v946, v914); - float32x2_t v949 = vadd_f32(v948, v914); - float32x2_t v1031 = vadd_f32(v1030, v999); - float32x2_t v1033 = vsub_f32(v1030, v999); - float32x2_t v1035 = vsub_f32(v1030, v1006); - int16x4_t v1060 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1051, 15), (int32x2_t){0, 0})); - int16x4_t v1066 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1050, 15), (int32x2_t){0, 0})); - float32x2_t v872 = vadd_f32(v861, v867); - float32x2_t v873 = vsub_f32(v861, v867); - float32x2_t v874 = vadd_f32(v863, v869); - float32x2_t v875 = vsub_f32(v863, v869); - float32x2_t v876 = vadd_f32(v865, v871); - float32x2_t v877 = vsub_f32(v865, v871); - float32x2_t v956 = vadd_f32(v945, v951); - float32x2_t v957 = vsub_f32(v945, v951); - float32x2_t v958 = vadd_f32(v947, v953); - float32x2_t v959 = vsub_f32(v947, v953); - float32x2_t v960 = vadd_f32(v949, v955); - float32x2_t v961 = vsub_f32(v949, v955); - float32x2_t v1032 = vadd_f32(v1031, v1006); - float32x2_t v1034 = vsub_f32(v1033, v1013); - float32x2_t v1036 = vadd_f32(v1035, v1013); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1060), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1066), 0); - float32x2_t v1043 = vadd_f32(v1032, v1038); - float32x2_t v1044 = vsub_f32(v1032, v1038); - float32x2_t v1045 = vadd_f32(v1034, v1040); - float32x2_t v1046 = vsub_f32(v1034, v1040); - float32x2_t v1047 = vadd_f32(v1036, v1042); - float32x2_t v1048 = vsub_f32(v1036, v1042); - float32x2_t v1070 = vadd_f32(v873, v957); - int16x4_t v1075 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v873, 15), (int32x2_t){0, 0})); - float32x2_t v1091 = vadd_f32(v875, v959); - int16x4_t v1096 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v875, 15), (int32x2_t){0, 0})); - float32x2_t v1112 = vadd_f32(v876, v960); - int16x4_t v1117 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v876, 15), (int32x2_t){0, 0})); - float32x2_t v1133 = vadd_f32(v877, v961); - int16x4_t v1138 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v877, 15), (int32x2_t){0, 0})); - float32x2_t v1154 = vadd_f32(v874, v958); - int16x4_t v1159 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v874, 15), (int32x2_t){0, 0})); - float32x2_t v1175 = vadd_f32(v872, v956); - int16x4_t v1180 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v872, 15), (int32x2_t){0, 0})); - float32x2_t v1071 = vadd_f32(v1070, 
v1044); - float32x2_t v1072 = vsub_f32(v1070, v1044); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1075), 0); - float32x2_t v1092 = vadd_f32(v1091, v1046); - float32x2_t v1093 = vsub_f32(v1091, v1046); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1096), 0); - float32x2_t v1113 = vadd_f32(v1112, v1047); - float32x2_t v1114 = vsub_f32(v1112, v1047); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1117), 0); - float32x2_t v1134 = vadd_f32(v1133, v1048); - float32x2_t v1135 = vsub_f32(v1133, v1048); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1138), 0); - float32x2_t v1155 = vadd_f32(v1154, v1045); - float32x2_t v1156 = vsub_f32(v1154, v1045); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1159), 0); - float32x2_t v1176 = vadd_f32(v1175, v1043); - float32x2_t v1177 = vsub_f32(v1175, v1043); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1180), 0); - int16x4_t v1081 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1072, 15), (int32x2_t){0, 0})); - int16x4_t v1087 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1071, 15), (int32x2_t){0, 0})); - int16x4_t v1102 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1093, 15), (int32x2_t){0, 0})); - int16x4_t v1108 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1092, 15), (int32x2_t){0, 0})); - int16x4_t v1123 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1114, 15), (int32x2_t){0, 0})); - int16x4_t v1129 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1113, 15), (int32x2_t){0, 0})); - int16x4_t v1144 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1135, 15), (int32x2_t){0, 0})); - int16x4_t v1150 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1134, 15), (int32x2_t){0, 0})); - int16x4_t v1165 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1156, 15), (int32x2_t){0, 0})); - int16x4_t v1171 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1155, 15), (int32x2_t){0, 0})); - int16x4_t v1186 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1177, 15), (int32x2_t){0, 0})); - int16x4_t v1192 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1176, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1081), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1087), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1102), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1108), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1123), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1129), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1144), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1150), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1165), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1171), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1186), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1192), 0); + float32x2_t v357 = (float32x2_t){v355, v356}; + float32x2_t v364 = (float32x2_t){v363, v363}; + float32x2_t v368 = (float32x2_t){v367, v367}; + float32x2_t v372 = (float32x2_t){v371, v371}; + float32x2_t v376 = (float32x2_t){v375, v375}; + float32x2_t v20 = v5[istride * 7]; + float32x2_t v25 = v5[istride * 14]; + float32x2_t v38 = v5[istride * 10]; + float32x2_t v43 = v5[istride * 17]; + float32x2_t v50 = v5[istride * 3]; + float32x2_t v56 = v5[istride * 13]; + float32x2_t v61 = v5[istride * 20]; + float32x2_t v68 = v5[istride * 6]; + float32x2_t v74 = v5[istride * 16]; + float32x2_t v79 = v5[istride * 2]; + float32x2_t v86 = v5[istride * 9]; + float32x2_t v92 = v5[istride * 19]; + float32x2_t v97 = v5[istride * 5]; + float32x2_t v104 = v5[istride * 12]; + float32x2_t v115 = v5[istride * 8]; + float32x2_t v122 = v5[istride * 15]; + float32x2_t v128 = v5[istride * 4]; + float32x2_t v133 = v5[istride * 11]; + float32x2_t v140 = v5[istride * 18]; + float32x2_t v183 = vmul_f32(v358, v181); + float32x2_t v190 = vmul_f32(v358, v188); + float32x2_t v197 = vmul_f32(v358, v195); + float32x2_t v204 = 
vmul_f32(v358, v202); + float32x2_t v267 = vmul_f32(v358, v265); + float32x2_t v274 = vmul_f32(v358, v272); + float32x2_t v281 = vmul_f32(v358, v279); + float32x2_t v288 = vmul_f32(v358, v286); + float32x2_t v331 = vmul_f32(v358, v329); + float32x2_t v338 = vmul_f32(v358, v336); + float32x2_t v345 = vmul_f32(v358, v343); + float32x2_t v352 = vmul_f32(v358, v350); + float32x2_t v359 = vmul_f32(v358, v357); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v44 = vadd_f32(v38, v43); + float32x2_t v45 = vsub_f32(v38, v43); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v80 = vadd_f32(v74, v79); + float32x2_t v81 = vsub_f32(v74, v79); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v116 = vadd_f32(v110, v115); + float32x2_t v117 = vsub_f32(v110, v115); + float32x2_t v134 = vadd_f32(v128, v133); + float32x2_t v135 = vsub_f32(v128, v133); + float32x2_t v33 = vadd_f32(v26, v32); + float32x2_t v51 = vadd_f32(v44, v50); + float32x2_t v69 = vadd_f32(v62, v68); + float32x2_t v87 = vadd_f32(v80, v86); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v123 = vadd_f32(v116, v122); + float32x2_t v141 = vadd_f32(v134, v140); + float32x2_t v226 = vadd_f32(v44, v134); + float32x2_t v227 = vsub_f32(v44, v134); + float32x2_t v228 = vadd_f32(v98, v80); + float32x2_t v229 = vsub_f32(v98, v80); + float32x2_t v230 = vadd_f32(v62, v116); + float32x2_t v231 = vsub_f32(v62, v116); + float32x2_t v310 = vadd_f32(v45, v135); + float32x2_t v311 = vsub_f32(v45, v135); + float32x2_t v312 = vadd_f32(v99, v81); + float32x2_t v313 = vsub_f32(v99, v81); + float32x2_t v314 = vadd_f32(v63, v117); + float32x2_t v315 = vsub_f32(v63, v117); + float32x2_t v142 = vadd_f32(v51, v141); + float32x2_t v143 = vsub_f32(v51, v141); + float32x2_t v144 = vadd_f32(v105, v87); + float32x2_t v145 = vsub_f32(v105, v87); + float32x2_t v146 = vadd_f32(v69, v123); + float32x2_t v147 = 
vsub_f32(v69, v123); + float32x2_t v232 = vadd_f32(v226, v228); + float32x2_t v235 = vsub_f32(v226, v228); + float32x2_t v236 = vsub_f32(v228, v230); + float32x2_t v237 = vsub_f32(v230, v226); + float32x2_t v238 = vadd_f32(v227, v229); + float32x2_t v240 = vsub_f32(v227, v229); + float32x2_t v241 = vsub_f32(v229, v231); + float32x2_t v242 = vsub_f32(v231, v227); + float32x2_t v316 = vadd_f32(v310, v312); + float32x2_t v319 = vsub_f32(v310, v312); + float32x2_t v320 = vsub_f32(v312, v314); + float32x2_t v321 = vsub_f32(v314, v310); + float32x2_t v322 = vadd_f32(v311, v313); + float32x2_t v324 = vsub_f32(v311, v313); + float32x2_t v325 = vsub_f32(v313, v315); + float32x2_t v326 = vsub_f32(v315, v311); + float32x2_t v148 = vadd_f32(v142, v144); + float32x2_t v151 = vsub_f32(v142, v144); + float32x2_t v152 = vsub_f32(v144, v146); + float32x2_t v153 = vsub_f32(v146, v142); + float32x2_t v154 = vadd_f32(v143, v145); + float32x2_t v156 = vsub_f32(v143, v145); + float32x2_t v157 = vsub_f32(v145, v147); + float32x2_t v158 = vsub_f32(v147, v143); + float32x2_t v233 = vadd_f32(v232, v230); + float32x2_t v239 = vadd_f32(v238, v231); + float32x2_t v254 = vmul_f32(v235, v253); + float32x2_t v258 = vmul_f32(v236, v257); + float32x2_t v262 = vmul_f32(v237, v261); + float32x2_t v275 = vrev64_f32(v240); + float32x2_t v282 = vrev64_f32(v241); + float32x2_t v289 = vrev64_f32(v242); + float32x2_t v317 = vadd_f32(v316, v314); + float32x2_t v323 = vadd_f32(v322, v315); + float32x2_t v346 = vrev64_f32(v319); + float32x2_t v353 = vrev64_f32(v320); + float32x2_t v360 = vrev64_f32(v321); + float32x2_t v369 = vmul_f32(v324, v368); + float32x2_t v373 = vmul_f32(v325, v372); + float32x2_t v377 = vmul_f32(v326, v376); + float32x2_t v149 = vadd_f32(v148, v146); + float32x2_t v155 = vadd_f32(v154, v147); + float32x2_t v170 = vmul_f32(v151, v169); + float32x2_t v174 = vmul_f32(v152, v173); + float32x2_t v178 = vmul_f32(v153, v177); + float32x2_t v191 = vrev64_f32(v156); + float32x2_t v198 = 
vrev64_f32(v157); + float32x2_t v205 = vrev64_f32(v158); + float32x2_t v234 = vadd_f32(v233, v26); + float32x2_t v250 = vmul_f32(v233, v249); + float32x2_t v268 = vrev64_f32(v239); + float32x2_t v276 = vmul_f32(v275, v274); + float32x2_t v283 = vmul_f32(v282, v281); + float32x2_t v290 = vmul_f32(v289, v288); + float32x2_t v318 = vadd_f32(v317, v27); + float32x2_t v339 = vrev64_f32(v317); + float32x2_t v347 = vmul_f32(v346, v345); + float32x2_t v354 = vmul_f32(v353, v352); + float32x2_t v361 = vmul_f32(v360, v359); + float32x2_t v365 = vmul_f32(v323, v364); + float32x2_t v150 = vadd_f32(v149, v33); + float32x2_t v166 = vmul_f32(v149, v165); + float32x2_t v184 = vrev64_f32(v155); + float32x2_t v192 = vmul_f32(v191, v190); + float32x2_t v199 = vmul_f32(v198, v197); + float32x2_t v206 = vmul_f32(v205, v204); + float32x2_t v246 = vmul_f32(v234, v245); + float32x2_t v269 = vmul_f32(v268, v267); + float32x2_t v332 = vrev64_f32(v318); + float32x2_t v340 = vmul_f32(v339, v338); + float32x2_t v385 = vadd_f32(v365, v369); + float32x2_t v387 = vsub_f32(v365, v369); + float32x2_t v389 = vsub_f32(v365, v373); + float32x2_t v185 = vmul_f32(v184, v183); + float32x2_t v207 = vadd_f32(v150, v166); + float32x2_t v291 = vadd_f32(v246, v250); + float32x2_t v298 = vadd_f32(v269, v276); + float32x2_t v300 = vsub_f32(v269, v276); + float32x2_t v302 = vsub_f32(v269, v283); + float32x2_t v333 = vmul_f32(v332, v331); + float32x2_t v386 = vadd_f32(v385, v373); + float32x2_t v388 = vsub_f32(v387, v377); + float32x2_t v390 = vadd_f32(v389, v377); + float32x2_t v397 = vadd_f32(v150, v246); + int16x4_t v402 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v150, 15), (int32x2_t){0, 0})); + float32x2_t v208 = vadd_f32(v207, v170); + float32x2_t v210 = vsub_f32(v207, v170); + float32x2_t v212 = vsub_f32(v207, v174); + float32x2_t v214 = vadd_f32(v185, v192); + float32x2_t v216 = vsub_f32(v185, v192); + float32x2_t v218 = vsub_f32(v185, v199); + float32x2_t v292 = vadd_f32(v291, v254); + float32x2_t v294 = 
vsub_f32(v291, v254); + float32x2_t v296 = vsub_f32(v291, v258); + float32x2_t v299 = vadd_f32(v298, v283); + float32x2_t v301 = vsub_f32(v300, v290); + float32x2_t v303 = vadd_f32(v302, v290); + float32x2_t v378 = vadd_f32(v333, v340); + float32x2_t v398 = vadd_f32(v397, v333); + float32x2_t v399 = vsub_f32(v397, v333); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v402), 0); + float32x2_t v209 = vadd_f32(v208, v174); + float32x2_t v211 = vsub_f32(v210, v178); + float32x2_t v213 = vadd_f32(v212, v178); + float32x2_t v215 = vadd_f32(v214, v199); + float32x2_t v217 = vsub_f32(v216, v206); + float32x2_t v219 = vadd_f32(v218, v206); + float32x2_t v293 = vadd_f32(v292, v258); + float32x2_t v295 = vsub_f32(v294, v262); + float32x2_t v297 = vadd_f32(v296, v262); + float32x2_t v379 = vadd_f32(v378, v347); + float32x2_t v381 = vsub_f32(v378, v347); + float32x2_t v383 = vsub_f32(v378, v354); + int16x4_t v408 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v399, 15), (int32x2_t){0, 0})); + int16x4_t v414 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v398, 15), (int32x2_t){0, 0})); + float32x2_t v220 = vadd_f32(v209, v215); + float32x2_t v221 = vsub_f32(v209, v215); + float32x2_t v222 = vadd_f32(v211, v217); + float32x2_t v223 = vsub_f32(v211, v217); + float32x2_t v224 = vadd_f32(v213, v219); + float32x2_t v225 = vsub_f32(v213, v219); + float32x2_t v304 = vadd_f32(v293, v299); + float32x2_t v305 = vsub_f32(v293, v299); + float32x2_t v306 = vadd_f32(v295, v301); + float32x2_t v307 = vsub_f32(v295, v301); + float32x2_t v308 = vadd_f32(v297, v303); + float32x2_t v309 = vsub_f32(v297, v303); + float32x2_t v380 = vadd_f32(v379, v354); + float32x2_t v382 = vsub_f32(v381, v361); + float32x2_t v384 = vadd_f32(v383, v361); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v408), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v414), 0); + float32x2_t v391 = vadd_f32(v380, v386); + float32x2_t v392 = vsub_f32(v380, v386); + float32x2_t v393 = vadd_f32(v382, v388); + float32x2_t 
v394 = vsub_f32(v382, v388); + float32x2_t v395 = vadd_f32(v384, v390); + float32x2_t v396 = vsub_f32(v384, v390); + float32x2_t v418 = vadd_f32(v221, v305); + int16x4_t v423 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v221, 15), (int32x2_t){0, 0})); + float32x2_t v439 = vadd_f32(v223, v307); + int16x4_t v444 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v223, 15), (int32x2_t){0, 0})); + float32x2_t v460 = vadd_f32(v224, v308); + int16x4_t v465 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v224, 15), (int32x2_t){0, 0})); + float32x2_t v481 = vadd_f32(v225, v309); + int16x4_t v486 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v225, 15), (int32x2_t){0, 0})); + float32x2_t v502 = vadd_f32(v222, v306); + int16x4_t v507 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v222, 15), (int32x2_t){0, 0})); + float32x2_t v523 = vadd_f32(v220, v304); + int16x4_t v528 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v220, 15), (int32x2_t){0, 0})); + float32x2_t v419 = vadd_f32(v418, v392); + float32x2_t v420 = vsub_f32(v418, v392); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v423), 0); + float32x2_t v440 = vadd_f32(v439, v394); + float32x2_t v441 = vsub_f32(v439, v394); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v444), 0); + float32x2_t v461 = vadd_f32(v460, v395); + float32x2_t v462 = vsub_f32(v460, v395); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v465), 0); + float32x2_t v482 = vadd_f32(v481, v396); + float32x2_t v483 = vsub_f32(v481, v396); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v486), 0); + float32x2_t v503 = vadd_f32(v502, v393); + float32x2_t v504 = vsub_f32(v502, v393); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v507), 0); + float32x2_t v524 = vadd_f32(v523, v391); + float32x2_t v525 = vsub_f32(v523, v391); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v528), 0); + int16x4_t v429 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v420, 15), (int32x2_t){0, 0})); + int16x4_t v435 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v419, 
15), (int32x2_t){0, 0})); + int16x4_t v450 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v441, 15), (int32x2_t){0, 0})); + int16x4_t v456 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v440, 15), (int32x2_t){0, 0})); + int16x4_t v471 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v462, 15), (int32x2_t){0, 0})); + int16x4_t v477 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v461, 15), (int32x2_t){0, 0})); + int16x4_t v492 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v483, 15), (int32x2_t){0, 0})); + int16x4_t v498 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v482, 15), (int32x2_t){0, 0})); + int16x4_t v513 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v504, 15), (int32x2_t){0, 0})); + int16x4_t v519 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v503, 15), (int32x2_t){0, 0})); + int16x4_t v534 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v525, 15), (int32x2_t){0, 0})); + int16x4_t v540 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v524, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v429), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v435), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v450), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v456), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v471), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v477), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v492), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v498), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v513), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v519), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v534), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v540), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -14035,215 +8809,119 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v822)[0])); svfloat32_t v1092 = 
svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v831)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1052), "w"(v1054)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1052), "w"(v1054)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v1058), "w"(v1060)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v1058), "w"(v1060)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1064), "w"(v1066)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1064), "w"(v1066)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v1070), "w"(v1072)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v1070), "w"(v1072)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1076), "w"(v1078)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1076), "w"(v1078)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v1082), "w"(v1084)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v1082), "w"(v1084)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1088), "w"(v1090)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v1088), "w"(v1090)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v1056)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v1062)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v1068)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v1074)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v1080)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v1086)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v176), "w"(v1092)); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : 
"w"(v56), "w"(v176)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v56), "w"(v176)); - svfloat32_t v277; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v128), "w"(v104)); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v128), "w"(v104)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v80), "w"(v152)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v80), "w"(v152)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v57), "w"(v177)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v57), "w"(v177)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v129), "w"(v105)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v129), "w"(v105)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v81), "w"(v153)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v81), "w"(v153)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v65), "w"(v185)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v65), "w"(v185)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v137), "w"(v113)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v137), "w"(v113)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v89), "w"(v161)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v89), "w"(v161)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v275), "w"(v277)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v275), "w"(v277)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v277), "w"(v279)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v279), "w"(v275)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v276), "w"(v278)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v289) : "w"(v276), "w"(v278)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v278), "w"(v280)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v280), "w"(v276)); - svfloat32_t v370; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v364), "w"(v366)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v364), "w"(v366)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v366), "w"(v368)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v368), "w"(v364)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v365), "w"(v367)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v365), "w"(v367)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v367), "w"(v369)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v369), "w"(v365)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v186), "w"(v188)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v186), "w"(v188)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v188), "w"(v190)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v190), "w"(v186)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v187), "w"(v189)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v187), "w"(v189)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v189), "w"(v191)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v191), "w"(v187)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v281), "w"(v279)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v280)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1052, v1054); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1052, v1054); + 
svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v1058, v1060); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v1058, v1060); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1064, v1066); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1064, v1066); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v1070, v1072); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v1070, v1072); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1076, v1078); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1076, v1078); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v1082, v1084); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v1082, v1084); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v1088, v1090); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v1088, v1090); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v1056); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v1062); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v1068); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v1074); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v1080); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v1086); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v176, v1092); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v56, v176); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v56, v176); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v128, v104); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v128, v104); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v80, v152); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v80, v152); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v57, v177); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v57, v177); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v129, v105); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v129, v105); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v81, v153); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v81, v153); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v65, v185); + svfloat32_t v187 = 
svsub_f32_x(svptrue_b32(), v65, v185); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v137, v113); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v137, v113); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v277, v279); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v279, v275); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v276, v278); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v278, v280); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v280, v276); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v368, v364); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v365, v367); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v186, v188); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v188, v190); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v190, v186); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v189, v191); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v191, v187); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v281, v279); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v280); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v849, v289, 90); - svfloat32_t zero337; 
- asm volatile("mov %0.s, #0" : "=w"(zero337)); + svfloat32_t zero337 = svdup_n_f32(0); svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v850, v290, 90); - svfloat32_t zero344; - asm volatile("mov %0.s, #0" : "=w"(zero344)); + svfloat32_t zero344 = svdup_n_f32(0); svfloat32_t v344 = svcmla_f32_x(pred_full, zero344, v851, v291, 90); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v370), "w"(v368)); - svfloat32_t v377; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v376), "w"(v369)); - svfloat32_t zero401; - asm volatile("mov %0.s, #0" : "=w"(zero401)); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v370, v368); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v376, v369); + svfloat32_t zero401 = svdup_n_f32(0); svfloat32_t v401 = svcmla_f32_x(pred_full, zero401, v854, v373, 90); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); + svfloat32_t zero408 = svdup_n_f32(0); svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v855, v374, 90); - svfloat32_t zero415; - asm volatile("mov %0.s, #0" : "=w"(zero415)); + svfloat32_t zero415 = svdup_n_f32(0); svfloat32_t v415 = svcmla_f32_x(pred_full, zero415, v856, v375, 90); - svfloat32_t v425; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v378), "w"(v858)); - svfloat32_t v430; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v379), "w"(v859)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v190)); - svfloat32_t v199; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v198), "w"(v191)); - svfloat32_t zero241; - asm volatile("mov %0.s, #0" : "=w"(zero241)); + svfloat32_t v425 = svmul_f32_x(svptrue_b32(), v378, v858); + svfloat32_t v430 = svmul_f32_x(svptrue_b32(), v379, v859); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v190); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v198, v191); + svfloat32_t zero241 = svdup_n_f32(0); svfloat32_t v241 = svcmla_f32_x(pred_full, zero241, v840, v200, 90); - svfloat32_t zero248; - asm volatile("mov %0.s, 
#0" : "=w"(zero248)); + svfloat32_t zero248 = svdup_n_f32(0); svfloat32_t v248 = svcmla_f32_x(pred_full, zero248, v841, v201, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v842, v202, 90); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v32)); - svfloat32_t v301; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v282), "w"(v844)); - svfloat32_t zero323; - asm volatile("mov %0.s, #0" : "=w"(zero323)); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v282, v32); + svfloat32_t v301 = svmul_f32_x(svptrue_b32(), v282, v844); + svfloat32_t zero323 = svdup_n_f32(0); svfloat32_t v323 = svcmla_f32_x(pred_full, zero323, v848, v288, 90); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v371), "w"(v33)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v193), "w"(v41)); - svfloat32_t zero234; - asm volatile("mov %0.s, #0" : "=w"(zero234)); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v371, v33); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v193, v41); + svfloat32_t zero234 = svdup_n_f32(0); svfloat32_t v234 = svcmla_f32_x(pred_full, zero234, v839, v199, 90); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v323), "w"(v330)); - svfloat32_t v354; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v323), "w"(v330)); - svfloat32_t v356; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v323), "w"(v337)); - svfloat32_t zero387; - asm volatile("mov %0.s, #0" : "=w"(zero387)); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v323, v330); + svfloat32_t v354 = svsub_f32_x(svptrue_b32(), v323, v330); + svfloat32_t v356 = svsub_f32_x(svptrue_b32(), v323, v337); + svfloat32_t zero387 = svdup_n_f32(0); svfloat32_t v387 = svcmla_f32_x(pred_full, zero387, v852, v372, 90); svfloat32_t v443 = svmla_f32_x(pred_full, v425, v377, v857); svfloat32_t v445 = svnmls_f32_x(pred_full, v425, 
v377, v857); svfloat32_t v447 = svnmls_f32_x(pred_full, v430, v377, v857); svfloat32_t v256 = svmla_f32_x(pred_full, v194, v193, v835); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v234), "w"(v241)); - svfloat32_t v265; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v234), "w"(v241)); - svfloat32_t v267; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v234), "w"(v248)); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v234, v241); + svfloat32_t v267 = svsub_f32_x(svptrue_b32(), v234, v248); svfloat32_t v345 = svmla_f32_x(pred_full, v301, v283, v843); - svfloat32_t v353; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v352), "w"(v337)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v354), "w"(v344)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v356), "w"(v344)); + svfloat32_t v353 = svadd_f32_x(svptrue_b32(), v352, v337); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v354, v344); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v356, v344); svfloat32_t v436 = svcmla_f32_x(pred_full, v387, v853, v371, 90); svfloat32_t v444 = svmla_f32_x(pred_full, v443, v379, v859); svfloat32_t v446 = svmls_f32_x(pred_full, v445, v380, v860); @@ -14257,25 +8935,17 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v257 = svmla_f32_x(pred_full, v256, v195, v836); svfloat32_t v259 = svmls_f32_x(pred_full, v256, v195, v836); svfloat32_t v261 = svmls_f32_x(pred_full, v256, v196, v837); - svfloat32_t v264; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v263), "w"(v248)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v265), "w"(v255)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v267), "w"(v255)); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v263, v248); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v265, v255); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), 
v267, v255); svfloat32_t v346 = svmla_f32_x(pred_full, v345, v284, v845); svfloat32_t v348 = svmls_f32_x(pred_full, v345, v284, v845); svfloat32_t v350 = svmls_f32_x(pred_full, v345, v285, v846); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v436), "w"(v401)); - svfloat32_t v439; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v436), "w"(v401)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v436), "w"(v408)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v455), "w"(v387)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v455), "w"(v387)); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v436, v401); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v436, v401); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v436, v408); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v455, v387); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v455, v387); svst1w_u64(pred_full, (unsigned *)(v868), svreinterpret_u64_s16(v460)); svfloat32_t v258 = svmla_f32_x(pred_full, v257, v196, v837); svfloat32_t v260 = svmls_f32_x(pred_full, v259, v197, v838); @@ -14283,12 +8953,9 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, svfloat32_t v347 = svmla_f32_x(pred_full, v346, v285, v846); svfloat32_t v349 = svmls_f32_x(pred_full, v348, v286, v847); svfloat32_t v351 = svmla_f32_x(pred_full, v350, v286, v847); - svfloat32_t v438; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v437), "w"(v408)); - svfloat32_t v440; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v439), "w"(v415)); - svfloat32_t v442; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v441), "w"(v415)); + svfloat32_t v438 = svadd_f32_x(svptrue_b32(), v437, v408); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v439, v415); + svfloat32_t v442 = svadd_f32_x(svptrue_b32(), v441, v415); svint16_t v468 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v457, (float)(1ULL << 
31ULL)))), @@ -14299,110 +8966,74 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu21(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v456, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v269; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v258), "w"(v264)); - svfloat32_t v270; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v258), "w"(v264)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v260), "w"(v266)); - svfloat32_t v272; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v260), "w"(v266)); - svfloat32_t v273; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v262), "w"(v268)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v262), "w"(v268)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v347), "w"(v353)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v347), "w"(v353)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v349), "w"(v355)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v349), "w"(v355)); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v351), "w"(v357)); - svfloat32_t v363; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v351), "w"(v357)); - svfloat32_t v449; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v438), "w"(v444)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v438), "w"(v444)); - svfloat32_t v451; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v440), "w"(v446)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v440), "w"(v446)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v442), "w"(v448)); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v442), "w"(v448)); + svfloat32_t v269 = svadd_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v270 = svsub_f32_x(svptrue_b32(), v258, v264); + svfloat32_t v271 = 
svadd_f32_x(svptrue_b32(), v260, v266); + svfloat32_t v272 = svsub_f32_x(svptrue_b32(), v260, v266); + svfloat32_t v273 = svadd_f32_x(svptrue_b32(), v262, v268); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v262, v268); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v347, v353); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v349, v355); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v351, v357); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v351, v357); + svfloat32_t v449 = svadd_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v438, v444); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v440, v446); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v440, v446); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v442, v448); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v442, v448); svst1w_u64(pred_full, (unsigned *)(v877), svreinterpret_u64_s16(v468)); svst1w_u64(pred_full, (unsigned *)(v886), svreinterpret_u64_s16(v476)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v270), "w"(v359)); + svfloat32_t v482 = svadd_f32_x(svptrue_b32(), v270, v359); svint16_t v487 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v270, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v272), "w"(v361)); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v272, v361); svint16_t v514 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v272, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v536; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v273), "w"(v362)); + svfloat32_t v536 = svadd_f32_x(svptrue_b32(), v273, v362); svint16_t 
v541 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v273, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v274), "w"(v363)); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v274, v363); svint16_t v568 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v274, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v271), "w"(v360)); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v271, v360); svint16_t v595 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v271, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v269), "w"(v358)); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v269, v358); svint16_t v622 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v269, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v482), "w"(v450)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v482), "w"(v450)); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v509), "w"(v452)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v509), "w"(v452)); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v536), "w"(v453)); - svfloat32_t v538; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v536), "w"(v453)); - svfloat32_t v564; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v563), "w"(v454)); - svfloat32_t v565; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v565) : 
"w"(v563), "w"(v454)); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v590), "w"(v451)); - svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v590), "w"(v451)); - svfloat32_t v618; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v617), "w"(v449)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v617), "w"(v449)); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v482, v450); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v482, v450); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v509, v452); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v509, v452); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v536, v453); + svfloat32_t v538 = svsub_f32_x(svptrue_b32(), v536, v453); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v563, v454); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v563, v454); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v451); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v590, v451); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v617, v449); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v617, v449); svst1w_u64(pred_full, (unsigned *)(v895), svreinterpret_u64_s16(v487)); svst1w_u64(pred_full, (unsigned *)(v922), svreinterpret_u64_s16(v514)); svst1w_u64(pred_full, (unsigned *)(v949), svreinterpret_u64_s16(v541)); @@ -14495,853 +9126,406 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v811 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v459 = 1.1000000000000001e+00F; - float v463 = 3.3166247903554003e-01F; - float v464 = -3.3166247903554003e-01F; - float v472 = 5.1541501300188641e-01F; - float v477 = 9.4125353283118118e-01F; - float v482 = 1.4143537075597825e+00F; - float v487 = 8.5949297361449750e-01F; - float v492 = 4.2314838273285138e-02F; - float v497 = 3.8639279888589606e-01F; - float v502 = 
5.1254589567200015e-01F; - float v507 = 1.0702757469471715e+00F; - float v512 = 5.5486073394528512e-01F; - float v516 = 1.2412944743900585e+00F; - float v517 = -1.2412944743900585e+00F; - float v524 = 2.0897833842005756e-01F; - float v525 = -2.0897833842005756e-01F; - float v532 = 3.7415717312460811e-01F; - float v533 = -3.7415717312460811e-01F; - float v540 = 4.9929922194110327e-02F; - float v541 = -4.9929922194110327e-02F; - float v548 = 6.5815896284539266e-01F; - float v549 = -6.5815896284539266e-01F; - float v556 = 6.3306543373877577e-01F; - float v557 = -6.3306543373877577e-01F; - float v564 = 1.0822460581641109e+00F; - float v565 = -1.0822460581641109e+00F; - float v572 = 8.1720737907134022e-01F; - float v573 = -8.1720737907134022e-01F; - float v580 = 4.2408709531871824e-01F; - float v581 = -4.2408709531871824e-01F; - float32x2_t v583 = (float32x2_t){v4, v4}; - const float32x2_t *v1607 = &v5[istride]; - int32_t *v1716 = &v6[ostride]; - float32x2_t v460 = (float32x2_t){v459, v459}; - float32x2_t v465 = (float32x2_t){v463, v464}; - float32x2_t v473 = (float32x2_t){v472, v472}; - float32x2_t v478 = (float32x2_t){v477, v477}; - float32x2_t v483 = (float32x2_t){v482, v482}; - float32x2_t v488 = (float32x2_t){v487, v487}; - float32x2_t v493 = (float32x2_t){v492, v492}; - float32x2_t v498 = (float32x2_t){v497, v497}; - float32x2_t v503 = (float32x2_t){v502, v502}; - float32x2_t v508 = (float32x2_t){v507, v507}; - float32x2_t v513 = (float32x2_t){v512, v512}; - float32x2_t v518 = (float32x2_t){v516, v517}; - float32x2_t v526 = (float32x2_t){v524, v525}; - float32x2_t v534 = (float32x2_t){v532, v533}; - float32x2_t v542 = (float32x2_t){v540, v541}; - float32x2_t v550 = (float32x2_t){v548, v549}; - float32x2_t v558 = (float32x2_t){v556, v557}; - float32x2_t v566 = (float32x2_t){v564, v565}; - float32x2_t v574 = (float32x2_t){v572, v573}; - float32x2_t v582 = (float32x2_t){v580, v581}; - const float32x2_t *v1490 = &v5[0]; - int32_t *v1689 = &v6[0]; - float32x4_t v1908 = 
vld1q_f32((const float32_t *)v1607); - float32x4_t v461 = vcombine_f32(v460, v460); - float32x2_t v467 = vmul_f32(v583, v465); - float32x4_t v474 = vcombine_f32(v473, v473); - float32x4_t v479 = vcombine_f32(v478, v478); - float32x4_t v484 = vcombine_f32(v483, v483); - float32x4_t v489 = vcombine_f32(v488, v488); - float32x4_t v494 = vcombine_f32(v493, v493); - float32x4_t v499 = vcombine_f32(v498, v498); - float32x4_t v504 = vcombine_f32(v503, v503); - float32x4_t v509 = vcombine_f32(v508, v508); - float32x4_t v514 = vcombine_f32(v513, v513); - float32x2_t v520 = vmul_f32(v583, v518); - float32x2_t v528 = vmul_f32(v583, v526); - float32x2_t v536 = vmul_f32(v583, v534); - float32x2_t v544 = vmul_f32(v583, v542); - float32x2_t v552 = vmul_f32(v583, v550); - float32x2_t v560 = vmul_f32(v583, v558); - float32x2_t v568 = vmul_f32(v583, v566); - float32x2_t v576 = vmul_f32(v583, v574); - float32x2_t v584 = vmul_f32(v583, v582); - const float32x2_t *v1499 = &v5[istride * 11]; - const float32x2_t *v1508 = &v5[istride * 2]; - const float32x2_t *v1517 = &v5[istride * 13]; - const float32x2_t *v1526 = &v5[istride * 4]; - const float32x2_t *v1535 = &v5[istride * 15]; - const float32x2_t *v1544 = &v5[istride * 6]; - const float32x2_t *v1553 = &v5[istride * 17]; - const float32x2_t *v1562 = &v5[istride * 8]; - const float32x2_t *v1571 = &v5[istride * 19]; - const float32x2_t *v1580 = &v5[istride * 10]; - const float32x2_t *v1589 = &v5[istride * 21]; - const float32x2_t *v1598 = &v5[istride * 12]; - const float32x2_t *v1616 = &v5[istride * 14]; - const float32x2_t *v1625 = &v5[istride * 3]; - const float32x2_t *v1634 = &v5[istride * 16]; - const float32x2_t *v1643 = &v5[istride * 5]; - const float32x2_t *v1652 = &v5[istride * 18]; - const float32x2_t *v1661 = &v5[istride * 7]; - const float32x2_t *v1670 = &v5[istride * 20]; - const float32x2_t *v1679 = &v5[istride * 9]; - int32_t *v1698 = &v6[ostride * 11]; - int32_t *v1707 = &v6[ostride * 12]; - int32_t *v1725 = &v6[ostride * 
2]; - int32_t *v1734 = &v6[ostride * 13]; - int32_t *v1743 = &v6[ostride * 14]; - int32_t *v1752 = &v6[ostride * 3]; - int32_t *v1761 = &v6[ostride * 4]; - int32_t *v1770 = &v6[ostride * 15]; - int32_t *v1779 = &v6[ostride * 16]; - int32_t *v1788 = &v6[ostride * 5]; - int32_t *v1797 = &v6[ostride * 6]; - int32_t *v1806 = &v6[ostride * 17]; - int32_t *v1815 = &v6[ostride * 18]; - int32_t *v1824 = &v6[ostride * 7]; - int32_t *v1833 = &v6[ostride * 8]; - int32_t *v1842 = &v6[ostride * 19]; - int32_t *v1851 = &v6[ostride * 20]; - int32_t *v1860 = &v6[ostride * 9]; - int32_t *v1869 = &v6[ostride * 10]; - int32_t *v1878 = &v6[ostride * 21]; - float32x4_t v1882 = vld1q_f32((const float32_t *)v1490); - float32x4_t v469 = vcombine_f32(v467, v467); - float32x4_t v522 = vcombine_f32(v520, v520); - float32x4_t v530 = vcombine_f32(v528, v528); - float32x4_t v538 = vcombine_f32(v536, v536); - float32x4_t v546 = vcombine_f32(v544, v544); - float32x4_t v554 = vcombine_f32(v552, v552); - float32x4_t v562 = vcombine_f32(v560, v560); - float32x4_t v570 = vcombine_f32(v568, v568); - float32x4_t v578 = vcombine_f32(v576, v576); - float32x4_t v586 = vcombine_f32(v584, v584); - float32x4_t v1884 = vld1q_f32((const float32_t *)v1499); - float32x4_t v1886 = vld1q_f32((const float32_t *)v1508); - float32x4_t v1888 = vld1q_f32((const float32_t *)v1517); - float32x4_t v1890 = vld1q_f32((const float32_t *)v1526); - float32x4_t v1892 = vld1q_f32((const float32_t *)v1535); - float32x4_t v1894 = vld1q_f32((const float32_t *)v1544); - float32x4_t v1896 = vld1q_f32((const float32_t *)v1553); - float32x4_t v1898 = vld1q_f32((const float32_t *)v1562); - float32x4_t v1900 = vld1q_f32((const float32_t *)v1571); - float32x4_t v1902 = vld1q_f32((const float32_t *)v1580); - float32x4_t v1904 = vld1q_f32((const float32_t *)v1589); - float32x4_t v1906 = vld1q_f32((const float32_t *)v1598); - float32x4_t v1910 = vld1q_f32((const float32_t *)v1616); - float32x4_t v1912 = vld1q_f32((const float32_t *)v1625); - 
float32x4_t v1914 = vld1q_f32((const float32_t *)v1634); - float32x4_t v1916 = vld1q_f32((const float32_t *)v1643); - float32x4_t v1918 = vld1q_f32((const float32_t *)v1652); - float32x4_t v1920 = vld1q_f32((const float32_t *)v1661); - float32x4_t v1922 = vld1q_f32((const float32_t *)v1670); - float32x4_t v1924 = vld1q_f32((const float32_t *)v1679); - float32x4_t v35 = vaddq_f32(v1882, v1884); - float32x4_t v36 = vsubq_f32(v1882, v1884); - float32x4_t v51 = vaddq_f32(v1886, v1888); - float32x4_t v52 = vsubq_f32(v1886, v1888); - float32x4_t v67 = vaddq_f32(v1890, v1892); - float32x4_t v68 = vsubq_f32(v1890, v1892); - float32x4_t v83 = vaddq_f32(v1894, v1896); - float32x4_t v84 = vsubq_f32(v1894, v1896); - float32x4_t v99 = vaddq_f32(v1898, v1900); - float32x4_t v100 = vsubq_f32(v1898, v1900); - float32x4_t v115 = vaddq_f32(v1902, v1904); - float32x4_t v116 = vsubq_f32(v1902, v1904); - float32x4_t v131 = vaddq_f32(v1906, v1908); - float32x4_t v132 = vsubq_f32(v1906, v1908); - float32x4_t v147 = vaddq_f32(v1910, v1912); - float32x4_t v148 = vsubq_f32(v1910, v1912); - float32x4_t v163 = vaddq_f32(v1914, v1916); - float32x4_t v164 = vsubq_f32(v1914, v1916); - float32x4_t v179 = vaddq_f32(v1918, v1920); - float32x4_t v180 = vsubq_f32(v1918, v1920); - float32x4_t v195 = vaddq_f32(v1922, v1924); - float32x4_t v196 = vsubq_f32(v1922, v1924); - float32x4_t v197 = vaddq_f32(v51, v195); - float32x4_t v198 = vaddq_f32(v67, v179); - float32x4_t v199 = vaddq_f32(v83, v163); - float32x4_t v200 = vaddq_f32(v99, v147); - float32x4_t v201 = vaddq_f32(v115, v131); - float32x4_t v202 = vsubq_f32(v51, v195); - float32x4_t v203 = vsubq_f32(v67, v179); - float32x4_t v204 = vsubq_f32(v83, v163); - float32x4_t v205 = vsubq_f32(v99, v147); - float32x4_t v206 = vsubq_f32(v115, v131); - float32x4_t v416 = vaddq_f32(v52, v196); - float32x4_t v417 = vaddq_f32(v68, v180); - float32x4_t v418 = vaddq_f32(v84, v164); - float32x4_t v419 = vaddq_f32(v100, v148); - float32x4_t v420 = vaddq_f32(v116, 
v132); - float32x4_t v421 = vsubq_f32(v52, v196); - float32x4_t v422 = vsubq_f32(v68, v180); - float32x4_t v423 = vsubq_f32(v84, v164); - float32x4_t v424 = vsubq_f32(v100, v148); - float32x4_t v425 = vsubq_f32(v116, v132); - float32x4_t v207 = vaddq_f32(v197, v198); - float32x4_t v208 = vaddq_f32(v199, v201); - float32x4_t v210 = vsubq_f32(v203, v204); - float32x4_t v211 = vaddq_f32(v202, v206); - float32x4_t v216 = vsubq_f32(v198, v200); - float32x4_t v217 = vsubq_f32(v197, v200); - float32x4_t v218 = vsubq_f32(v198, v197); - float32x4_t v219 = vsubq_f32(v201, v200); - float32x4_t v220 = vsubq_f32(v199, v200); - float32x4_t v221 = vsubq_f32(v201, v199); - float32x4_t v222 = vsubq_f32(v198, v201); - float32x4_t v223 = vsubq_f32(v197, v199); - float32x4_t v225 = vaddq_f32(v203, v205); - float32x4_t v226 = vsubq_f32(v202, v205); - float32x4_t v227 = vaddq_f32(v202, v203); - float32x4_t v228 = vsubq_f32(v205, v206); - float32x4_t v229 = vsubq_f32(v204, v205); - float32x4_t v230 = vsubq_f32(v204, v206); - float32x4_t v231 = vaddq_f32(v203, v206); - float32x4_t v232 = vsubq_f32(v202, v204); - float32x4_t v426 = vaddq_f32(v416, v417); - float32x4_t v427 = vaddq_f32(v418, v420); - float32x4_t v429 = vsubq_f32(v422, v423); - float32x4_t v430 = vaddq_f32(v421, v425); - float32x4_t v435 = vsubq_f32(v417, v419); - float32x4_t v436 = vsubq_f32(v416, v419); - float32x4_t v437 = vsubq_f32(v417, v416); - float32x4_t v438 = vsubq_f32(v420, v419); - float32x4_t v439 = vsubq_f32(v418, v419); - float32x4_t v440 = vsubq_f32(v420, v418); - float32x4_t v441 = vsubq_f32(v417, v420); - float32x4_t v442 = vsubq_f32(v416, v418); - float32x4_t v444 = vaddq_f32(v422, v424); - float32x4_t v445 = vsubq_f32(v421, v424); - float32x4_t v446 = vaddq_f32(v421, v422); - float32x4_t v447 = vsubq_f32(v424, v425); - float32x4_t v448 = vsubq_f32(v423, v424); - float32x4_t v449 = vsubq_f32(v423, v425); - float32x4_t v450 = vaddq_f32(v422, v425); - float32x4_t v451 = vsubq_f32(v421, v423); - float32x4_t 
v209 = vaddq_f32(v200, v207); - float32x4_t v214 = vsubq_f32(v210, v211); - float32x4_t v224 = vsubq_f32(v208, v207); - float32x4_t v233 = vaddq_f32(v210, v211); - float32x4_t v256 = vmulq_f32(v216, v474); - float32x4_t v261 = vmulq_f32(v217, v479); - float32x4_t v266 = vmulq_f32(v218, v484); - float32x4_t v271 = vmulq_f32(v219, v489); - float32x4_t v276 = vmulq_f32(v220, v494); - float32x4_t v281 = vmulq_f32(v221, v499); - float32x4_t v286 = vmulq_f32(v222, v504); - float32x4_t v291 = vmulq_f32(v223, v509); - float32x4_t v302 = vrev64q_f32(v225); - float32x4_t v310 = vrev64q_f32(v226); - float32x4_t v318 = vrev64q_f32(v227); - float32x4_t v326 = vrev64q_f32(v228); - float32x4_t v334 = vrev64q_f32(v229); - float32x4_t v342 = vrev64q_f32(v230); - float32x4_t v350 = vrev64q_f32(v231); - float32x4_t v358 = vrev64q_f32(v232); - float32x4_t v428 = vaddq_f32(v419, v426); - float32x4_t v433 = vsubq_f32(v429, v430); - float32x4_t v443 = vsubq_f32(v427, v426); - float32x4_t v452 = vaddq_f32(v429, v430); - float32x4_t v475 = vmulq_f32(v435, v474); - float32x4_t v480 = vmulq_f32(v436, v479); - float32x4_t v485 = vmulq_f32(v437, v484); - float32x4_t v490 = vmulq_f32(v438, v489); - float32x4_t v495 = vmulq_f32(v439, v494); - float32x4_t v500 = vmulq_f32(v440, v499); - float32x4_t v505 = vmulq_f32(v441, v504); - float32x4_t v510 = vmulq_f32(v442, v509); - float32x4_t v521 = vrev64q_f32(v444); - float32x4_t v529 = vrev64q_f32(v445); - float32x4_t v537 = vrev64q_f32(v446); - float32x4_t v545 = vrev64q_f32(v447); - float32x4_t v553 = vrev64q_f32(v448); - float32x4_t v561 = vrev64q_f32(v449); - float32x4_t v569 = vrev64q_f32(v450); - float32x4_t v577 = vrev64q_f32(v451); - float32x4_t v212 = vaddq_f32(v209, v208); - float32x4_t v215 = vsubq_f32(v214, v205); - float32x4_t v296 = vmulq_f32(v224, v514); - float32x4_t v304 = vmulq_f32(v302, v522); - float32x4_t v312 = vmulq_f32(v310, v530); - float32x4_t v320 = vmulq_f32(v318, v538); - float32x4_t v328 = vmulq_f32(v326, v546); - 
float32x4_t v336 = vmulq_f32(v334, v554); - float32x4_t v344 = vmulq_f32(v342, v562); - float32x4_t v352 = vmulq_f32(v350, v570); - float32x4_t v360 = vmulq_f32(v358, v578); - float32x4_t v366 = vrev64q_f32(v233); - float32x4_t v370 = vaddq_f32(v256, v261); - float32x4_t v371 = vaddq_f32(v261, v266); - float32x4_t v372 = vsubq_f32(v256, v266); - float32x4_t v373 = vaddq_f32(v271, v276); - float32x4_t v374 = vaddq_f32(v276, v281); - float32x4_t v375 = vsubq_f32(v271, v281); - float32x4_t v431 = vaddq_f32(v428, v427); - float32x4_t v434 = vsubq_f32(v433, v424); - float32x4_t v515 = vmulq_f32(v443, v514); - float32x4_t v523 = vmulq_f32(v521, v522); - float32x4_t v531 = vmulq_f32(v529, v530); - float32x4_t v539 = vmulq_f32(v537, v538); - float32x4_t v547 = vmulq_f32(v545, v546); - float32x4_t v555 = vmulq_f32(v553, v554); - float32x4_t v563 = vmulq_f32(v561, v562); - float32x4_t v571 = vmulq_f32(v569, v570); - float32x4_t v579 = vmulq_f32(v577, v578); - float32x4_t v585 = vrev64q_f32(v452); - float32x4_t v589 = vaddq_f32(v475, v480); - float32x4_t v590 = vaddq_f32(v480, v485); - float32x4_t v591 = vsubq_f32(v475, v485); - float32x4_t v592 = vaddq_f32(v490, v495); - float32x4_t v593 = vaddq_f32(v495, v500); - float32x4_t v594 = vsubq_f32(v490, v500); - float32x4_t v213 = vaddq_f32(v35, v212); - float32x4_t v243 = vmulq_f32(v212, v461); - float32x4_t v249 = vrev64q_f32(v215); - float32x4_t v368 = vmulq_f32(v366, v586); - float32x4_t v376 = vaddq_f32(v291, v296); - float32x4_t v377 = vaddq_f32(v286, v296); - float32x4_t v378 = vaddq_f32(v312, v320); - float32x4_t v379 = vsubq_f32(v304, v320); - float32x4_t v380 = vaddq_f32(v336, v344); - float32x4_t v381 = vsubq_f32(v328, v344); - float32x4_t v432 = vaddq_f32(v36, v431); - float32x4_t v462 = vmulq_f32(v431, v461); - float32x4_t v468 = vrev64q_f32(v434); - float32x4_t v587 = vmulq_f32(v585, v586); - float32x4_t v595 = vaddq_f32(v510, v515); - float32x4_t v596 = vaddq_f32(v505, v515); - float32x4_t v597 = vaddq_f32(v531, 
v539); - float32x4_t v598 = vsubq_f32(v523, v539); - float32x4_t v599 = vaddq_f32(v555, v563); - float32x4_t v600 = vsubq_f32(v547, v563); - float32x4_t v251 = vmulq_f32(v249, v469); - float32x4_t v369 = vsubq_f32(v213, v243); - float32x4_t v382 = vaddq_f32(v360, v368); - float32x4_t v383 = vsubq_f32(v352, v368); - float32x4_t v384 = vaddq_f32(v374, v376); - float32x4_t v402 = vaddq_f32(v378, v379); - float32x4_t v470 = vmulq_f32(v468, v469); - float32x4_t v588 = vsubq_f32(v432, v462); - float32x4_t v601 = vaddq_f32(v579, v587); - float32x4_t v602 = vsubq_f32(v571, v587); - float32x4_t v603 = vaddq_f32(v593, v595); - float32x4_t v621 = vaddq_f32(v597, v598); - int16x4_t v637 = vqmovn_s32(vcvtq_n_s32_f32(v213, 15)); - int16x4_t v645 = vqmovn_s32(vcvtq_n_s32_f32(v432, 15)); - float32x4_t v385 = vaddq_f32(v384, v369); - float32x4_t v386 = vsubq_f32(v369, v371); - float32x4_t v388 = vaddq_f32(v369, v375); - float32x4_t v390 = vsubq_f32(v369, v372); - float32x4_t v392 = vaddq_f32(v369, v370); - float32x4_t v394 = vaddq_f32(v251, v380); - float32x4_t v396 = vsubq_f32(v382, v378); - float32x4_t v398 = vaddq_f32(v251, v383); - float32x4_t v400 = vsubq_f32(v383, v379); - float32x4_t v403 = vaddq_f32(v402, v380); - float32x4_t v604 = vaddq_f32(v603, v588); - float32x4_t v605 = vsubq_f32(v588, v590); - float32x4_t v607 = vaddq_f32(v588, v594); - float32x4_t v609 = vsubq_f32(v588, v591); - float32x4_t v611 = vaddq_f32(v588, v589); - float32x4_t v613 = vaddq_f32(v470, v599); - float32x4_t v615 = vsubq_f32(v601, v597); - float32x4_t v617 = vaddq_f32(v470, v602); - float32x4_t v619 = vsubq_f32(v602, v598); - float32x4_t v622 = vaddq_f32(v621, v599); - vst1_s16((int16_t *)v1689, v637); - vst1_s16((int16_t *)v1698, v645); - float32x4_t v387 = vsubq_f32(v386, v376); - float32x4_t v389 = vaddq_f32(v388, v377); - float32x4_t v391 = vsubq_f32(v390, v377); - float32x4_t v393 = vsubq_f32(v392, v373); - float32x4_t v395 = vaddq_f32(v394, v382); - float32x4_t v397 = vsubq_f32(v396, v251); 
- float32x4_t v399 = vaddq_f32(v398, v381); - float32x4_t v401 = vsubq_f32(v400, v251); - float32x4_t v404 = vaddq_f32(v403, v381); - float32x4_t v606 = vsubq_f32(v605, v595); - float32x4_t v608 = vaddq_f32(v607, v596); - float32x4_t v610 = vsubq_f32(v609, v596); - float32x4_t v612 = vsubq_f32(v611, v592); - float32x4_t v614 = vaddq_f32(v613, v601); - float32x4_t v616 = vsubq_f32(v615, v470); - float32x4_t v618 = vaddq_f32(v617, v600); - float32x4_t v620 = vsubq_f32(v619, v470); - float32x4_t v623 = vaddq_f32(v622, v600); - float32x4_t v405 = vsubq_f32(v404, v251); - float32x4_t v407 = vaddq_f32(v385, v395); - float32x4_t v408 = vaddq_f32(v387, v397); - float32x4_t v409 = vsubq_f32(v389, v399); - float32x4_t v410 = vaddq_f32(v391, v401); - float32x4_t v411 = vsubq_f32(v391, v401); - float32x4_t v412 = vaddq_f32(v389, v399); - float32x4_t v413 = vsubq_f32(v387, v397); - float32x4_t v414 = vsubq_f32(v385, v395); - float32x4_t v624 = vsubq_f32(v623, v470); - float32x4_t v626 = vaddq_f32(v604, v614); - float32x4_t v627 = vaddq_f32(v606, v616); - float32x4_t v628 = vsubq_f32(v608, v618); - float32x4_t v629 = vaddq_f32(v610, v620); - float32x4_t v630 = vsubq_f32(v610, v620); - float32x4_t v631 = vaddq_f32(v608, v618); - float32x4_t v632 = vsubq_f32(v606, v616); - float32x4_t v633 = vsubq_f32(v604, v614); - float32x4_t v406 = vaddq_f32(v393, v405); - float32x4_t v415 = vsubq_f32(v393, v405); - float32x4_t v625 = vaddq_f32(v612, v624); - float32x4_t v634 = vsubq_f32(v612, v624); - int16x4_t v669 = vqmovn_s32(vcvtq_n_s32_f32(v414, 15)); - int16x4_t v677 = vqmovn_s32(vcvtq_n_s32_f32(v633, 15)); - int16x4_t v685 = vqmovn_s32(vcvtq_n_s32_f32(v413, 15)); - int16x4_t v693 = vqmovn_s32(vcvtq_n_s32_f32(v632, 15)); - int16x4_t v701 = vqmovn_s32(vcvtq_n_s32_f32(v412, 15)); - int16x4_t v709 = vqmovn_s32(vcvtq_n_s32_f32(v631, 15)); - int16x4_t v717 = vqmovn_s32(vcvtq_n_s32_f32(v411, 15)); - int16x4_t v725 = vqmovn_s32(vcvtq_n_s32_f32(v630, 15)); - int16x4_t v733 = 
vqmovn_s32(vcvtq_n_s32_f32(v410, 15)); - int16x4_t v741 = vqmovn_s32(vcvtq_n_s32_f32(v629, 15)); - int16x4_t v749 = vqmovn_s32(vcvtq_n_s32_f32(v409, 15)); - int16x4_t v757 = vqmovn_s32(vcvtq_n_s32_f32(v628, 15)); - int16x4_t v765 = vqmovn_s32(vcvtq_n_s32_f32(v408, 15)); - int16x4_t v773 = vqmovn_s32(vcvtq_n_s32_f32(v627, 15)); - int16x4_t v781 = vqmovn_s32(vcvtq_n_s32_f32(v407, 15)); - int16x4_t v789 = vqmovn_s32(vcvtq_n_s32_f32(v626, 15)); - int16x4_t v653 = vqmovn_s32(vcvtq_n_s32_f32(v415, 15)); - int16x4_t v661 = vqmovn_s32(vcvtq_n_s32_f32(v634, 15)); - int16x4_t v797 = vqmovn_s32(vcvtq_n_s32_f32(v406, 15)); - int16x4_t v805 = vqmovn_s32(vcvtq_n_s32_f32(v625, 15)); - vst1_s16((int16_t *)v1725, v669); - vst1_s16((int16_t *)v1734, v677); - vst1_s16((int16_t *)v1743, v685); - vst1_s16((int16_t *)v1752, v693); - vst1_s16((int16_t *)v1761, v701); - vst1_s16((int16_t *)v1770, v709); - vst1_s16((int16_t *)v1779, v717); - vst1_s16((int16_t *)v1788, v725); - vst1_s16((int16_t *)v1797, v733); - vst1_s16((int16_t *)v1806, v741); - vst1_s16((int16_t *)v1815, v749); - vst1_s16((int16_t *)v1824, v757); - vst1_s16((int16_t *)v1833, v765); - vst1_s16((int16_t *)v1842, v773); - vst1_s16((int16_t *)v1851, v781); - vst1_s16((int16_t *)v1860, v789); - vst1_s16((int16_t *)v1707, v653); - vst1_s16((int16_t *)v1716, v661); - vst1_s16((int16_t *)v1869, v797); - vst1_s16((int16_t *)v1878, v805); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v811 * 2; j < howmany; j += 1) { - float32x2_t v900 = v5[istride]; - float v1191 = 1.1000000000000001e+00F; - float v1194 = 3.3166247903554003e-01F; - float v1195 = -3.3166247903554003e-01F; - float v1202 = 5.1541501300188641e-01F; - float v1206 = 9.4125353283118118e-01F; - float v1210 = 1.4143537075597825e+00F; - float v1214 = 8.5949297361449750e-01F; - float v1218 = 4.2314838273285138e-02F; - float v1222 = 3.8639279888589606e-01F; - float v1226 = 5.1254589567200015e-01F; - float v1230 = 1.0702757469471715e+00F; - float v1234 = 
5.5486073394528512e-01F; - float v1237 = 1.2412944743900585e+00F; - float v1238 = -1.2412944743900585e+00F; - float v1244 = 2.0897833842005756e-01F; - float v1245 = -2.0897833842005756e-01F; - float v1251 = 3.7415717312460811e-01F; - float v1252 = -3.7415717312460811e-01F; - float v1258 = 4.9929922194110327e-02F; - float v1259 = -4.9929922194110327e-02F; - float v1265 = 6.5815896284539266e-01F; - float v1266 = -6.5815896284539266e-01F; - float v1272 = 6.3306543373877577e-01F; - float v1273 = -6.3306543373877577e-01F; - float v1279 = 1.0822460581641109e+00F; - float v1280 = -1.0822460581641109e+00F; - float v1286 = 8.1720737907134022e-01F; - float v1287 = -8.1720737907134022e-01F; - float v1293 = 4.2408709531871824e-01F; - float v1294 = -4.2408709531871824e-01F; - float32x2_t v1296 = (float32x2_t){v4, v4}; - float32x2_t v823 = v5[0]; - float32x2_t v1192 = (float32x2_t){v1191, v1191}; - float32x2_t v1196 = (float32x2_t){v1194, v1195}; - float32x2_t v1203 = (float32x2_t){v1202, v1202}; - float32x2_t v1207 = (float32x2_t){v1206, v1206}; - float32x2_t v1211 = (float32x2_t){v1210, v1210}; - float32x2_t v1215 = (float32x2_t){v1214, v1214}; - float32x2_t v1219 = (float32x2_t){v1218, v1218}; - float32x2_t v1223 = (float32x2_t){v1222, v1222}; - float32x2_t v1227 = (float32x2_t){v1226, v1226}; - float32x2_t v1231 = (float32x2_t){v1230, v1230}; - float32x2_t v1235 = (float32x2_t){v1234, v1234}; - float32x2_t v1239 = (float32x2_t){v1237, v1238}; - float32x2_t v1246 = (float32x2_t){v1244, v1245}; - float32x2_t v1253 = (float32x2_t){v1251, v1252}; - float32x2_t v1260 = (float32x2_t){v1258, v1259}; - float32x2_t v1267 = (float32x2_t){v1265, v1266}; - float32x2_t v1274 = (float32x2_t){v1272, v1273}; - float32x2_t v1281 = (float32x2_t){v1279, v1280}; - float32x2_t v1288 = (float32x2_t){v1286, v1287}; - float32x2_t v1295 = (float32x2_t){v1293, v1294}; - float32x2_t v828 = v5[istride * 11]; - float32x2_t v835 = v5[istride * 2]; - float32x2_t v840 = v5[istride * 13]; - float32x2_t v847 
= v5[istride * 4]; - float32x2_t v852 = v5[istride * 15]; - float32x2_t v859 = v5[istride * 6]; - float32x2_t v864 = v5[istride * 17]; - float32x2_t v871 = v5[istride * 8]; - float32x2_t v876 = v5[istride * 19]; - float32x2_t v883 = v5[istride * 10]; - float32x2_t v888 = v5[istride * 21]; - float32x2_t v895 = v5[istride * 12]; - float32x2_t v907 = v5[istride * 14]; - float32x2_t v912 = v5[istride * 3]; - float32x2_t v919 = v5[istride * 16]; - float32x2_t v924 = v5[istride * 5]; - float32x2_t v931 = v5[istride * 18]; - float32x2_t v936 = v5[istride * 7]; - float32x2_t v943 = v5[istride * 20]; - float32x2_t v948 = v5[istride * 9]; - float32x2_t v1198 = vmul_f32(v1296, v1196); - float32x2_t v1241 = vmul_f32(v1296, v1239); - float32x2_t v1248 = vmul_f32(v1296, v1246); - float32x2_t v1255 = vmul_f32(v1296, v1253); - float32x2_t v1262 = vmul_f32(v1296, v1260); - float32x2_t v1269 = vmul_f32(v1296, v1267); - float32x2_t v1276 = vmul_f32(v1296, v1274); - float32x2_t v1283 = vmul_f32(v1296, v1281); - float32x2_t v1290 = vmul_f32(v1296, v1288); - float32x2_t v1297 = vmul_f32(v1296, v1295); - float32x2_t v829 = vadd_f32(v823, v828); - float32x2_t v830 = vsub_f32(v823, v828); - float32x2_t v841 = vadd_f32(v835, v840); - float32x2_t v842 = vsub_f32(v835, v840); - float32x2_t v853 = vadd_f32(v847, v852); - float32x2_t v854 = vsub_f32(v847, v852); - float32x2_t v865 = vadd_f32(v859, v864); - float32x2_t v866 = vsub_f32(v859, v864); - float32x2_t v877 = vadd_f32(v871, v876); - float32x2_t v878 = vsub_f32(v871, v876); - float32x2_t v889 = vadd_f32(v883, v888); - float32x2_t v890 = vsub_f32(v883, v888); - float32x2_t v901 = vadd_f32(v895, v900); - float32x2_t v902 = vsub_f32(v895, v900); - float32x2_t v913 = vadd_f32(v907, v912); - float32x2_t v914 = vsub_f32(v907, v912); - float32x2_t v925 = vadd_f32(v919, v924); - float32x2_t v926 = vsub_f32(v919, v924); - float32x2_t v937 = vadd_f32(v931, v936); - float32x2_t v938 = vsub_f32(v931, v936); - float32x2_t v949 = vadd_f32(v943, v948); 
- float32x2_t v950 = vsub_f32(v943, v948); - float32x2_t v951 = vadd_f32(v841, v949); - float32x2_t v952 = vadd_f32(v853, v937); - float32x2_t v953 = vadd_f32(v865, v925); - float32x2_t v954 = vadd_f32(v877, v913); - float32x2_t v955 = vadd_f32(v889, v901); - float32x2_t v956 = vsub_f32(v841, v949); - float32x2_t v957 = vsub_f32(v853, v937); - float32x2_t v958 = vsub_f32(v865, v925); - float32x2_t v959 = vsub_f32(v877, v913); - float32x2_t v960 = vsub_f32(v889, v901); - float32x2_t v1149 = vadd_f32(v842, v950); - float32x2_t v1150 = vadd_f32(v854, v938); - float32x2_t v1151 = vadd_f32(v866, v926); - float32x2_t v1152 = vadd_f32(v878, v914); - float32x2_t v1153 = vadd_f32(v890, v902); - float32x2_t v1154 = vsub_f32(v842, v950); - float32x2_t v1155 = vsub_f32(v854, v938); - float32x2_t v1156 = vsub_f32(v866, v926); - float32x2_t v1157 = vsub_f32(v878, v914); - float32x2_t v1158 = vsub_f32(v890, v902); - float32x2_t v961 = vadd_f32(v951, v952); - float32x2_t v962 = vadd_f32(v953, v955); - float32x2_t v964 = vsub_f32(v957, v958); - float32x2_t v965 = vadd_f32(v956, v960); - float32x2_t v970 = vsub_f32(v952, v954); - float32x2_t v971 = vsub_f32(v951, v954); - float32x2_t v972 = vsub_f32(v952, v951); - float32x2_t v973 = vsub_f32(v955, v954); - float32x2_t v974 = vsub_f32(v953, v954); - float32x2_t v975 = vsub_f32(v955, v953); - float32x2_t v976 = vsub_f32(v952, v955); - float32x2_t v977 = vsub_f32(v951, v953); - float32x2_t v979 = vadd_f32(v957, v959); - float32x2_t v980 = vsub_f32(v956, v959); - float32x2_t v981 = vadd_f32(v956, v957); - float32x2_t v982 = vsub_f32(v959, v960); - float32x2_t v983 = vsub_f32(v958, v959); - float32x2_t v984 = vsub_f32(v958, v960); - float32x2_t v985 = vadd_f32(v957, v960); - float32x2_t v986 = vsub_f32(v956, v958); - float32x2_t v1159 = vadd_f32(v1149, v1150); - float32x2_t v1160 = vadd_f32(v1151, v1153); - float32x2_t v1162 = vsub_f32(v1155, v1156); - float32x2_t v1163 = vadd_f32(v1154, v1158); - float32x2_t v1168 = vsub_f32(v1150, 
v1152); - float32x2_t v1169 = vsub_f32(v1149, v1152); - float32x2_t v1170 = vsub_f32(v1150, v1149); - float32x2_t v1171 = vsub_f32(v1153, v1152); - float32x2_t v1172 = vsub_f32(v1151, v1152); - float32x2_t v1173 = vsub_f32(v1153, v1151); - float32x2_t v1174 = vsub_f32(v1150, v1153); - float32x2_t v1175 = vsub_f32(v1149, v1151); - float32x2_t v1177 = vadd_f32(v1155, v1157); - float32x2_t v1178 = vsub_f32(v1154, v1157); - float32x2_t v1179 = vadd_f32(v1154, v1155); - float32x2_t v1180 = vsub_f32(v1157, v1158); - float32x2_t v1181 = vsub_f32(v1156, v1157); - float32x2_t v1182 = vsub_f32(v1156, v1158); - float32x2_t v1183 = vadd_f32(v1155, v1158); - float32x2_t v1184 = vsub_f32(v1154, v1156); - float32x2_t v963 = vadd_f32(v954, v961); - float32x2_t v968 = vsub_f32(v964, v965); - float32x2_t v978 = vsub_f32(v962, v961); - float32x2_t v987 = vadd_f32(v964, v965); - float32x2_t v1006 = vmul_f32(v970, v1203); - float32x2_t v1010 = vmul_f32(v971, v1207); - float32x2_t v1014 = vmul_f32(v972, v1211); - float32x2_t v1018 = vmul_f32(v973, v1215); - float32x2_t v1022 = vmul_f32(v974, v1219); - float32x2_t v1026 = vmul_f32(v975, v1223); - float32x2_t v1030 = vmul_f32(v976, v1227); - float32x2_t v1034 = vmul_f32(v977, v1231); - float32x2_t v1044 = vrev64_f32(v979); - float32x2_t v1051 = vrev64_f32(v980); - float32x2_t v1058 = vrev64_f32(v981); - float32x2_t v1065 = vrev64_f32(v982); - float32x2_t v1072 = vrev64_f32(v983); - float32x2_t v1079 = vrev64_f32(v984); - float32x2_t v1086 = vrev64_f32(v985); - float32x2_t v1093 = vrev64_f32(v986); - float32x2_t v1161 = vadd_f32(v1152, v1159); - float32x2_t v1166 = vsub_f32(v1162, v1163); - float32x2_t v1176 = vsub_f32(v1160, v1159); - float32x2_t v1185 = vadd_f32(v1162, v1163); - float32x2_t v1204 = vmul_f32(v1168, v1203); - float32x2_t v1208 = vmul_f32(v1169, v1207); - float32x2_t v1212 = vmul_f32(v1170, v1211); - float32x2_t v1216 = vmul_f32(v1171, v1215); - float32x2_t v1220 = vmul_f32(v1172, v1219); - float32x2_t v1224 = 
vmul_f32(v1173, v1223); - float32x2_t v1228 = vmul_f32(v1174, v1227); - float32x2_t v1232 = vmul_f32(v1175, v1231); - float32x2_t v1242 = vrev64_f32(v1177); - float32x2_t v1249 = vrev64_f32(v1178); - float32x2_t v1256 = vrev64_f32(v1179); - float32x2_t v1263 = vrev64_f32(v1180); - float32x2_t v1270 = vrev64_f32(v1181); - float32x2_t v1277 = vrev64_f32(v1182); - float32x2_t v1284 = vrev64_f32(v1183); - float32x2_t v1291 = vrev64_f32(v1184); - float32x2_t v966 = vadd_f32(v963, v962); - float32x2_t v969 = vsub_f32(v968, v959); - float32x2_t v1038 = vmul_f32(v978, v1235); - float32x2_t v1045 = vmul_f32(v1044, v1241); - float32x2_t v1052 = vmul_f32(v1051, v1248); - float32x2_t v1059 = vmul_f32(v1058, v1255); - float32x2_t v1066 = vmul_f32(v1065, v1262); - float32x2_t v1073 = vmul_f32(v1072, v1269); - float32x2_t v1080 = vmul_f32(v1079, v1276); - float32x2_t v1087 = vmul_f32(v1086, v1283); - float32x2_t v1094 = vmul_f32(v1093, v1290); - float32x2_t v1100 = vrev64_f32(v987); - float32x2_t v1103 = vadd_f32(v1006, v1010); - float32x2_t v1104 = vadd_f32(v1010, v1014); - float32x2_t v1105 = vsub_f32(v1006, v1014); - float32x2_t v1106 = vadd_f32(v1018, v1022); - float32x2_t v1107 = vadd_f32(v1022, v1026); - float32x2_t v1108 = vsub_f32(v1018, v1026); - float32x2_t v1164 = vadd_f32(v1161, v1160); - float32x2_t v1167 = vsub_f32(v1166, v1157); - float32x2_t v1236 = vmul_f32(v1176, v1235); - float32x2_t v1243 = vmul_f32(v1242, v1241); - float32x2_t v1250 = vmul_f32(v1249, v1248); - float32x2_t v1257 = vmul_f32(v1256, v1255); - float32x2_t v1264 = vmul_f32(v1263, v1262); - float32x2_t v1271 = vmul_f32(v1270, v1269); - float32x2_t v1278 = vmul_f32(v1277, v1276); - float32x2_t v1285 = vmul_f32(v1284, v1283); - float32x2_t v1292 = vmul_f32(v1291, v1290); - float32x2_t v1298 = vrev64_f32(v1185); - float32x2_t v1301 = vadd_f32(v1204, v1208); - float32x2_t v1302 = vadd_f32(v1208, v1212); - float32x2_t v1303 = vsub_f32(v1204, v1212); - float32x2_t v1304 = vadd_f32(v1216, v1220); - 
float32x2_t v1305 = vadd_f32(v1220, v1224); - float32x2_t v1306 = vsub_f32(v1216, v1224); - float32x2_t v967 = vadd_f32(v829, v966); - float32x2_t v995 = vmul_f32(v966, v1192); - float32x2_t v1001 = vrev64_f32(v969); - float32x2_t v1101 = vmul_f32(v1100, v1297); - float32x2_t v1109 = vadd_f32(v1034, v1038); - float32x2_t v1110 = vadd_f32(v1030, v1038); - float32x2_t v1111 = vadd_f32(v1052, v1059); - float32x2_t v1112 = vsub_f32(v1045, v1059); - float32x2_t v1113 = vadd_f32(v1073, v1080); - float32x2_t v1114 = vsub_f32(v1066, v1080); - float32x2_t v1165 = vadd_f32(v830, v1164); - float32x2_t v1193 = vmul_f32(v1164, v1192); - float32x2_t v1199 = vrev64_f32(v1167); - float32x2_t v1299 = vmul_f32(v1298, v1297); - float32x2_t v1307 = vadd_f32(v1232, v1236); - float32x2_t v1308 = vadd_f32(v1228, v1236); - float32x2_t v1309 = vadd_f32(v1250, v1257); - float32x2_t v1310 = vsub_f32(v1243, v1257); - float32x2_t v1311 = vadd_f32(v1271, v1278); - float32x2_t v1312 = vsub_f32(v1264, v1278); - float32x2_t v1002 = vmul_f32(v1001, v1198); - float32x2_t v1102 = vsub_f32(v967, v995); - float32x2_t v1115 = vadd_f32(v1094, v1101); - float32x2_t v1116 = vsub_f32(v1087, v1101); - float32x2_t v1117 = vadd_f32(v1107, v1109); - float32x2_t v1135 = vadd_f32(v1111, v1112); - float32x2_t v1200 = vmul_f32(v1199, v1198); - float32x2_t v1300 = vsub_f32(v1165, v1193); - float32x2_t v1313 = vadd_f32(v1292, v1299); - float32x2_t v1314 = vsub_f32(v1285, v1299); - float32x2_t v1315 = vadd_f32(v1305, v1307); - float32x2_t v1333 = vadd_f32(v1309, v1310); - int16x4_t v1349 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v967, 15), (int32x2_t){0, 0})); - int16x4_t v1355 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1165, 15), (int32x2_t){0, 0})); - float32x2_t v1118 = vadd_f32(v1117, v1102); - float32x2_t v1119 = vsub_f32(v1102, v1104); - float32x2_t v1121 = vadd_f32(v1102, v1108); - float32x2_t v1123 = vsub_f32(v1102, v1105); - float32x2_t v1125 = vadd_f32(v1102, v1103); - float32x2_t v1127 = vadd_f32(v1002, 
v1113); - float32x2_t v1129 = vsub_f32(v1115, v1111); - float32x2_t v1131 = vadd_f32(v1002, v1116); - float32x2_t v1133 = vsub_f32(v1116, v1112); - float32x2_t v1136 = vadd_f32(v1135, v1113); - float32x2_t v1316 = vadd_f32(v1315, v1300); - float32x2_t v1317 = vsub_f32(v1300, v1302); - float32x2_t v1319 = vadd_f32(v1300, v1306); - float32x2_t v1321 = vsub_f32(v1300, v1303); - float32x2_t v1323 = vadd_f32(v1300, v1301); - float32x2_t v1325 = vadd_f32(v1200, v1311); - float32x2_t v1327 = vsub_f32(v1313, v1309); - float32x2_t v1329 = vadd_f32(v1200, v1314); - float32x2_t v1331 = vsub_f32(v1314, v1310); - float32x2_t v1334 = vadd_f32(v1333, v1311); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1349), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1355), 0); - float32x2_t v1120 = vsub_f32(v1119, v1109); - float32x2_t v1122 = vadd_f32(v1121, v1110); - float32x2_t v1124 = vsub_f32(v1123, v1110); - float32x2_t v1126 = vsub_f32(v1125, v1106); - float32x2_t v1128 = vadd_f32(v1127, v1115); - float32x2_t v1130 = vsub_f32(v1129, v1002); - float32x2_t v1132 = vadd_f32(v1131, v1114); - float32x2_t v1134 = vsub_f32(v1133, v1002); - float32x2_t v1137 = vadd_f32(v1136, v1114); - float32x2_t v1318 = vsub_f32(v1317, v1307); - float32x2_t v1320 = vadd_f32(v1319, v1308); - float32x2_t v1322 = vsub_f32(v1321, v1308); - float32x2_t v1324 = vsub_f32(v1323, v1304); - float32x2_t v1326 = vadd_f32(v1325, v1313); - float32x2_t v1328 = vsub_f32(v1327, v1200); - float32x2_t v1330 = vadd_f32(v1329, v1312); - float32x2_t v1332 = vsub_f32(v1331, v1200); - float32x2_t v1335 = vadd_f32(v1334, v1312); - float32x2_t v1138 = vsub_f32(v1137, v1002); - float32x2_t v1140 = vadd_f32(v1118, v1128); - float32x2_t v1141 = vadd_f32(v1120, v1130); - float32x2_t v1142 = vsub_f32(v1122, v1132); - float32x2_t v1143 = vadd_f32(v1124, v1134); - float32x2_t v1144 = vsub_f32(v1124, v1134); - float32x2_t v1145 = vadd_f32(v1122, v1132); - float32x2_t v1146 = vsub_f32(v1120, v1130); - float32x2_t v1147 = 
vsub_f32(v1118, v1128); - float32x2_t v1336 = vsub_f32(v1335, v1200); - float32x2_t v1338 = vadd_f32(v1316, v1326); - float32x2_t v1339 = vadd_f32(v1318, v1328); - float32x2_t v1340 = vsub_f32(v1320, v1330); - float32x2_t v1341 = vadd_f32(v1322, v1332); - float32x2_t v1342 = vsub_f32(v1322, v1332); - float32x2_t v1343 = vadd_f32(v1320, v1330); - float32x2_t v1344 = vsub_f32(v1318, v1328); - float32x2_t v1345 = vsub_f32(v1316, v1326); - float32x2_t v1139 = vadd_f32(v1126, v1138); - float32x2_t v1148 = vsub_f32(v1126, v1138); - float32x2_t v1337 = vadd_f32(v1324, v1336); - float32x2_t v1346 = vsub_f32(v1324, v1336); - int16x4_t v1373 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1147, 15), (int32x2_t){0, 0})); - int16x4_t v1379 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1345, 15), (int32x2_t){0, 0})); - int16x4_t v1385 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1146, 15), (int32x2_t){0, 0})); - int16x4_t v1391 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1344, 15), (int32x2_t){0, 0})); - int16x4_t v1397 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1145, 15), (int32x2_t){0, 0})); - int16x4_t v1403 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1343, 15), (int32x2_t){0, 0})); - int16x4_t v1409 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1144, 15), (int32x2_t){0, 0})); - int16x4_t v1415 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1342, 15), (int32x2_t){0, 0})); - int16x4_t v1421 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1143, 15), (int32x2_t){0, 0})); - int16x4_t v1427 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1341, 15), (int32x2_t){0, 0})); - int16x4_t v1433 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1142, 15), (int32x2_t){0, 0})); - int16x4_t v1439 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1340, 15), (int32x2_t){0, 0})); - int16x4_t v1445 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1141, 15), (int32x2_t){0, 0})); - int16x4_t v1451 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1339, 15), (int32x2_t){0, 0})); - int16x4_t v1457 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1140, 15), (int32x2_t){0, 0})); - int16x4_t v1463 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1338, 15), (int32x2_t){0, 0})); - int16x4_t v1361 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1148, 15), (int32x2_t){0, 0})); - int16x4_t v1367 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1346, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1373), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1379), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1385), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1391), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1397), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1403), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1409), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1415), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1421), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1427), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1433), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1439), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1445), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1451), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1457), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1463), 0); - int16x4_t v1469 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1139, 15), (int32x2_t){0, 0})); - int16x4_t v1475 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1337, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1361), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1367), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1469), 0); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1475), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v97 = v5[istride]; + float v388 = 
1.1000000000000001e+00F; + float v391 = 3.3166247903554003e-01F; + float v392 = -3.3166247903554003e-01F; + float v399 = 5.1541501300188641e-01F; + float v403 = 9.4125353283118118e-01F; + float v407 = 1.4143537075597825e+00F; + float v411 = 8.5949297361449750e-01F; + float v415 = 4.2314838273285138e-02F; + float v419 = 3.8639279888589606e-01F; + float v423 = 5.1254589567200015e-01F; + float v427 = 1.0702757469471715e+00F; + float v431 = 5.5486073394528512e-01F; + float v434 = 1.2412944743900585e+00F; + float v435 = -1.2412944743900585e+00F; + float v441 = 2.0897833842005756e-01F; + float v442 = -2.0897833842005756e-01F; + float v448 = 3.7415717312460811e-01F; + float v449 = -3.7415717312460811e-01F; + float v455 = 4.9929922194110327e-02F; + float v456 = -4.9929922194110327e-02F; + float v462 = 6.5815896284539266e-01F; + float v463 = -6.5815896284539266e-01F; + float v469 = 6.3306543373877577e-01F; + float v470 = -6.3306543373877577e-01F; + float v476 = 1.0822460581641109e+00F; + float v477 = -1.0822460581641109e+00F; + float v483 = 8.1720737907134022e-01F; + float v484 = -8.1720737907134022e-01F; + float v490 = 4.2408709531871824e-01F; + float v491 = -4.2408709531871824e-01F; + float32x2_t v493 = (float32x2_t){v4, v4}; + float32x2_t v20 = v5[0]; + float32x2_t v389 = (float32x2_t){v388, v388}; + float32x2_t v393 = (float32x2_t){v391, v392}; + float32x2_t v400 = (float32x2_t){v399, v399}; + float32x2_t v404 = (float32x2_t){v403, v403}; + float32x2_t v408 = (float32x2_t){v407, v407}; + float32x2_t v412 = (float32x2_t){v411, v411}; + float32x2_t v416 = (float32x2_t){v415, v415}; + float32x2_t v420 = (float32x2_t){v419, v419}; + float32x2_t v424 = (float32x2_t){v423, v423}; + float32x2_t v428 = (float32x2_t){v427, v427}; + float32x2_t v432 = (float32x2_t){v431, v431}; + float32x2_t v436 = (float32x2_t){v434, v435}; + float32x2_t v443 = (float32x2_t){v441, v442}; + float32x2_t v450 = (float32x2_t){v448, v449}; + float32x2_t v457 = (float32x2_t){v455, v456}; + float32x2_t 
v464 = (float32x2_t){v462, v463}; + float32x2_t v471 = (float32x2_t){v469, v470}; + float32x2_t v478 = (float32x2_t){v476, v477}; + float32x2_t v485 = (float32x2_t){v483, v484}; + float32x2_t v492 = (float32x2_t){v490, v491}; + float32x2_t v25 = v5[istride * 11]; + float32x2_t v32 = v5[istride * 2]; + float32x2_t v37 = v5[istride * 13]; + float32x2_t v44 = v5[istride * 4]; + float32x2_t v49 = v5[istride * 15]; + float32x2_t v56 = v5[istride * 6]; + float32x2_t v61 = v5[istride * 17]; + float32x2_t v68 = v5[istride * 8]; + float32x2_t v73 = v5[istride * 19]; + float32x2_t v80 = v5[istride * 10]; + float32x2_t v85 = v5[istride * 21]; + float32x2_t v92 = v5[istride * 12]; + float32x2_t v104 = v5[istride * 14]; + float32x2_t v109 = v5[istride * 3]; + float32x2_t v116 = v5[istride * 16]; + float32x2_t v121 = v5[istride * 5]; + float32x2_t v128 = v5[istride * 18]; + float32x2_t v133 = v5[istride * 7]; + float32x2_t v140 = v5[istride * 20]; + float32x2_t v145 = v5[istride * 9]; + float32x2_t v395 = vmul_f32(v493, v393); + float32x2_t v438 = vmul_f32(v493, v436); + float32x2_t v445 = vmul_f32(v493, v443); + float32x2_t v452 = vmul_f32(v493, v450); + float32x2_t v459 = vmul_f32(v493, v457); + float32x2_t v466 = vmul_f32(v493, v464); + float32x2_t v473 = vmul_f32(v493, v471); + float32x2_t v480 = vmul_f32(v493, v478); + float32x2_t v487 = vmul_f32(v493, v485); + float32x2_t v494 = vmul_f32(v493, v492); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v50 = vadd_f32(v44, v49); + float32x2_t v51 = vsub_f32(v44, v49); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v74 = vadd_f32(v68, v73); + float32x2_t v75 = vsub_f32(v68, v73); + float32x2_t v86 = vadd_f32(v80, v85); + float32x2_t v87 = vsub_f32(v80, v85); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v110 = 
vadd_f32(v104, v109); + float32x2_t v111 = vsub_f32(v104, v109); + float32x2_t v122 = vadd_f32(v116, v121); + float32x2_t v123 = vsub_f32(v116, v121); + float32x2_t v134 = vadd_f32(v128, v133); + float32x2_t v135 = vsub_f32(v128, v133); + float32x2_t v146 = vadd_f32(v140, v145); + float32x2_t v147 = vsub_f32(v140, v145); + float32x2_t v148 = vadd_f32(v38, v146); + float32x2_t v149 = vadd_f32(v50, v134); + float32x2_t v150 = vadd_f32(v62, v122); + float32x2_t v151 = vadd_f32(v74, v110); + float32x2_t v152 = vadd_f32(v86, v98); + float32x2_t v153 = vsub_f32(v38, v146); + float32x2_t v154 = vsub_f32(v50, v134); + float32x2_t v155 = vsub_f32(v62, v122); + float32x2_t v156 = vsub_f32(v74, v110); + float32x2_t v157 = vsub_f32(v86, v98); + float32x2_t v346 = vadd_f32(v39, v147); + float32x2_t v347 = vadd_f32(v51, v135); + float32x2_t v348 = vadd_f32(v63, v123); + float32x2_t v349 = vadd_f32(v75, v111); + float32x2_t v350 = vadd_f32(v87, v99); + float32x2_t v351 = vsub_f32(v39, v147); + float32x2_t v352 = vsub_f32(v51, v135); + float32x2_t v353 = vsub_f32(v63, v123); + float32x2_t v354 = vsub_f32(v75, v111); + float32x2_t v355 = vsub_f32(v87, v99); + float32x2_t v158 = vadd_f32(v148, v149); + float32x2_t v159 = vadd_f32(v150, v152); + float32x2_t v161 = vsub_f32(v154, v155); + float32x2_t v162 = vadd_f32(v153, v157); + float32x2_t v167 = vsub_f32(v149, v151); + float32x2_t v168 = vsub_f32(v148, v151); + float32x2_t v169 = vsub_f32(v149, v148); + float32x2_t v170 = vsub_f32(v152, v151); + float32x2_t v171 = vsub_f32(v150, v151); + float32x2_t v172 = vsub_f32(v152, v150); + float32x2_t v173 = vsub_f32(v149, v152); + float32x2_t v174 = vsub_f32(v148, v150); + float32x2_t v176 = vadd_f32(v154, v156); + float32x2_t v177 = vsub_f32(v153, v156); + float32x2_t v178 = vadd_f32(v153, v154); + float32x2_t v179 = vsub_f32(v156, v157); + float32x2_t v180 = vsub_f32(v155, v156); + float32x2_t v181 = vsub_f32(v155, v157); + float32x2_t v182 = vadd_f32(v154, v157); + float32x2_t v183 = 
vsub_f32(v153, v155); + float32x2_t v356 = vadd_f32(v346, v347); + float32x2_t v357 = vadd_f32(v348, v350); + float32x2_t v359 = vsub_f32(v352, v353); + float32x2_t v360 = vadd_f32(v351, v355); + float32x2_t v365 = vsub_f32(v347, v349); + float32x2_t v366 = vsub_f32(v346, v349); + float32x2_t v367 = vsub_f32(v347, v346); + float32x2_t v368 = vsub_f32(v350, v349); + float32x2_t v369 = vsub_f32(v348, v349); + float32x2_t v370 = vsub_f32(v350, v348); + float32x2_t v371 = vsub_f32(v347, v350); + float32x2_t v372 = vsub_f32(v346, v348); + float32x2_t v374 = vadd_f32(v352, v354); + float32x2_t v375 = vsub_f32(v351, v354); + float32x2_t v376 = vadd_f32(v351, v352); + float32x2_t v377 = vsub_f32(v354, v355); + float32x2_t v378 = vsub_f32(v353, v354); + float32x2_t v379 = vsub_f32(v353, v355); + float32x2_t v380 = vadd_f32(v352, v355); + float32x2_t v381 = vsub_f32(v351, v353); + float32x2_t v160 = vadd_f32(v151, v158); + float32x2_t v165 = vsub_f32(v161, v162); + float32x2_t v175 = vsub_f32(v159, v158); + float32x2_t v184 = vadd_f32(v161, v162); + float32x2_t v203 = vmul_f32(v167, v400); + float32x2_t v207 = vmul_f32(v168, v404); + float32x2_t v211 = vmul_f32(v169, v408); + float32x2_t v215 = vmul_f32(v170, v412); + float32x2_t v219 = vmul_f32(v171, v416); + float32x2_t v223 = vmul_f32(v172, v420); + float32x2_t v227 = vmul_f32(v173, v424); + float32x2_t v231 = vmul_f32(v174, v428); + float32x2_t v241 = vrev64_f32(v176); + float32x2_t v248 = vrev64_f32(v177); + float32x2_t v255 = vrev64_f32(v178); + float32x2_t v262 = vrev64_f32(v179); + float32x2_t v269 = vrev64_f32(v180); + float32x2_t v276 = vrev64_f32(v181); + float32x2_t v283 = vrev64_f32(v182); + float32x2_t v290 = vrev64_f32(v183); + float32x2_t v358 = vadd_f32(v349, v356); + float32x2_t v363 = vsub_f32(v359, v360); + float32x2_t v373 = vsub_f32(v357, v356); + float32x2_t v382 = vadd_f32(v359, v360); + float32x2_t v401 = vmul_f32(v365, v400); + float32x2_t v405 = vmul_f32(v366, v404); + float32x2_t v409 = 
vmul_f32(v367, v408); + float32x2_t v413 = vmul_f32(v368, v412); + float32x2_t v417 = vmul_f32(v369, v416); + float32x2_t v421 = vmul_f32(v370, v420); + float32x2_t v425 = vmul_f32(v371, v424); + float32x2_t v429 = vmul_f32(v372, v428); + float32x2_t v439 = vrev64_f32(v374); + float32x2_t v446 = vrev64_f32(v375); + float32x2_t v453 = vrev64_f32(v376); + float32x2_t v460 = vrev64_f32(v377); + float32x2_t v467 = vrev64_f32(v378); + float32x2_t v474 = vrev64_f32(v379); + float32x2_t v481 = vrev64_f32(v380); + float32x2_t v488 = vrev64_f32(v381); + float32x2_t v163 = vadd_f32(v160, v159); + float32x2_t v166 = vsub_f32(v165, v156); + float32x2_t v235 = vmul_f32(v175, v432); + float32x2_t v242 = vmul_f32(v241, v438); + float32x2_t v249 = vmul_f32(v248, v445); + float32x2_t v256 = vmul_f32(v255, v452); + float32x2_t v263 = vmul_f32(v262, v459); + float32x2_t v270 = vmul_f32(v269, v466); + float32x2_t v277 = vmul_f32(v276, v473); + float32x2_t v284 = vmul_f32(v283, v480); + float32x2_t v291 = vmul_f32(v290, v487); + float32x2_t v297 = vrev64_f32(v184); + float32x2_t v300 = vadd_f32(v203, v207); + float32x2_t v301 = vadd_f32(v207, v211); + float32x2_t v302 = vsub_f32(v203, v211); + float32x2_t v303 = vadd_f32(v215, v219); + float32x2_t v304 = vadd_f32(v219, v223); + float32x2_t v305 = vsub_f32(v215, v223); + float32x2_t v361 = vadd_f32(v358, v357); + float32x2_t v364 = vsub_f32(v363, v354); + float32x2_t v433 = vmul_f32(v373, v432); + float32x2_t v440 = vmul_f32(v439, v438); + float32x2_t v447 = vmul_f32(v446, v445); + float32x2_t v454 = vmul_f32(v453, v452); + float32x2_t v461 = vmul_f32(v460, v459); + float32x2_t v468 = vmul_f32(v467, v466); + float32x2_t v475 = vmul_f32(v474, v473); + float32x2_t v482 = vmul_f32(v481, v480); + float32x2_t v489 = vmul_f32(v488, v487); + float32x2_t v495 = vrev64_f32(v382); + float32x2_t v498 = vadd_f32(v401, v405); + float32x2_t v499 = vadd_f32(v405, v409); + float32x2_t v500 = vsub_f32(v401, v409); + float32x2_t v501 = vadd_f32(v413, 
v417); + float32x2_t v502 = vadd_f32(v417, v421); + float32x2_t v503 = vsub_f32(v413, v421); + float32x2_t v164 = vadd_f32(v26, v163); + float32x2_t v192 = vmul_f32(v163, v389); + float32x2_t v198 = vrev64_f32(v166); + float32x2_t v298 = vmul_f32(v297, v494); + float32x2_t v306 = vadd_f32(v231, v235); + float32x2_t v307 = vadd_f32(v227, v235); + float32x2_t v308 = vadd_f32(v249, v256); + float32x2_t v309 = vsub_f32(v242, v256); + float32x2_t v310 = vadd_f32(v270, v277); + float32x2_t v311 = vsub_f32(v263, v277); + float32x2_t v362 = vadd_f32(v27, v361); + float32x2_t v390 = vmul_f32(v361, v389); + float32x2_t v396 = vrev64_f32(v364); + float32x2_t v496 = vmul_f32(v495, v494); + float32x2_t v504 = vadd_f32(v429, v433); + float32x2_t v505 = vadd_f32(v425, v433); + float32x2_t v506 = vadd_f32(v447, v454); + float32x2_t v507 = vsub_f32(v440, v454); + float32x2_t v508 = vadd_f32(v468, v475); + float32x2_t v509 = vsub_f32(v461, v475); + float32x2_t v199 = vmul_f32(v198, v395); + float32x2_t v299 = vsub_f32(v164, v192); + float32x2_t v312 = vadd_f32(v291, v298); + float32x2_t v313 = vsub_f32(v284, v298); + float32x2_t v314 = vadd_f32(v304, v306); + float32x2_t v332 = vadd_f32(v308, v309); + float32x2_t v397 = vmul_f32(v396, v395); + float32x2_t v497 = vsub_f32(v362, v390); + float32x2_t v510 = vadd_f32(v489, v496); + float32x2_t v511 = vsub_f32(v482, v496); + float32x2_t v512 = vadd_f32(v502, v504); + float32x2_t v530 = vadd_f32(v506, v507); + int16x4_t v546 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v164, 15), (int32x2_t){0, 0})); + int16x4_t v552 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v362, 15), (int32x2_t){0, 0})); + float32x2_t v315 = vadd_f32(v314, v299); + float32x2_t v316 = vsub_f32(v299, v301); + float32x2_t v318 = vadd_f32(v299, v305); + float32x2_t v320 = vsub_f32(v299, v302); + float32x2_t v322 = vadd_f32(v299, v300); + float32x2_t v324 = vadd_f32(v199, v310); + float32x2_t v326 = vsub_f32(v312, v308); + float32x2_t v328 = vadd_f32(v199, v313); + 
float32x2_t v330 = vsub_f32(v313, v309); + float32x2_t v333 = vadd_f32(v332, v310); + float32x2_t v513 = vadd_f32(v512, v497); + float32x2_t v514 = vsub_f32(v497, v499); + float32x2_t v516 = vadd_f32(v497, v503); + float32x2_t v518 = vsub_f32(v497, v500); + float32x2_t v520 = vadd_f32(v497, v498); + float32x2_t v522 = vadd_f32(v397, v508); + float32x2_t v524 = vsub_f32(v510, v506); + float32x2_t v526 = vadd_f32(v397, v511); + float32x2_t v528 = vsub_f32(v511, v507); + float32x2_t v531 = vadd_f32(v530, v508); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v546), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v552), 0); + float32x2_t v317 = vsub_f32(v316, v306); + float32x2_t v319 = vadd_f32(v318, v307); + float32x2_t v321 = vsub_f32(v320, v307); + float32x2_t v323 = vsub_f32(v322, v303); + float32x2_t v325 = vadd_f32(v324, v312); + float32x2_t v327 = vsub_f32(v326, v199); + float32x2_t v329 = vadd_f32(v328, v311); + float32x2_t v331 = vsub_f32(v330, v199); + float32x2_t v334 = vadd_f32(v333, v311); + float32x2_t v515 = vsub_f32(v514, v504); + float32x2_t v517 = vadd_f32(v516, v505); + float32x2_t v519 = vsub_f32(v518, v505); + float32x2_t v521 = vsub_f32(v520, v501); + float32x2_t v523 = vadd_f32(v522, v510); + float32x2_t v525 = vsub_f32(v524, v397); + float32x2_t v527 = vadd_f32(v526, v509); + float32x2_t v529 = vsub_f32(v528, v397); + float32x2_t v532 = vadd_f32(v531, v509); + float32x2_t v335 = vsub_f32(v334, v199); + float32x2_t v337 = vadd_f32(v315, v325); + float32x2_t v338 = vadd_f32(v317, v327); + float32x2_t v339 = vsub_f32(v319, v329); + float32x2_t v340 = vadd_f32(v321, v331); + float32x2_t v341 = vsub_f32(v321, v331); + float32x2_t v342 = vadd_f32(v319, v329); + float32x2_t v343 = vsub_f32(v317, v327); + float32x2_t v344 = vsub_f32(v315, v325); + float32x2_t v533 = vsub_f32(v532, v397); + float32x2_t v535 = vadd_f32(v513, v523); + float32x2_t v536 = vadd_f32(v515, v525); + float32x2_t v537 = vsub_f32(v517, v527); + float32x2_t v538 = 
vadd_f32(v519, v529); + float32x2_t v539 = vsub_f32(v519, v529); + float32x2_t v540 = vadd_f32(v517, v527); + float32x2_t v541 = vsub_f32(v515, v525); + float32x2_t v542 = vsub_f32(v513, v523); + float32x2_t v336 = vadd_f32(v323, v335); + float32x2_t v345 = vsub_f32(v323, v335); + float32x2_t v534 = vadd_f32(v521, v533); + float32x2_t v543 = vsub_f32(v521, v533); + int16x4_t v570 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v344, 15), (int32x2_t){0, 0})); + int16x4_t v576 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v542, 15), (int32x2_t){0, 0})); + int16x4_t v582 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v343, 15), (int32x2_t){0, 0})); + int16x4_t v588 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v541, 15), (int32x2_t){0, 0})); + int16x4_t v594 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v342, 15), (int32x2_t){0, 0})); + int16x4_t v600 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v540, 15), (int32x2_t){0, 0})); + int16x4_t v606 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v341, 15), (int32x2_t){0, 0})); + int16x4_t v612 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v539, 15), (int32x2_t){0, 0})); + int16x4_t v618 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v340, 15), (int32x2_t){0, 0})); + int16x4_t v624 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v538, 15), (int32x2_t){0, 0})); + int16x4_t v630 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v339, 15), (int32x2_t){0, 0})); + int16x4_t v636 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v537, 15), (int32x2_t){0, 0})); + int16x4_t v642 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v338, 15), (int32x2_t){0, 0})); + int16x4_t v648 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v536, 15), (int32x2_t){0, 0})); + int16x4_t v654 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v337, 15), (int32x2_t){0, 0})); + int16x4_t v660 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v535, 15), (int32x2_t){0, 0})); + int16x4_t v558 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v345, 15), (int32x2_t){0, 0})); + int16x4_t v564 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v543, 
15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v570), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v576), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v582), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v588), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v594), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v600), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v606), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v612), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v618), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v624), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v630), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v636), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v642), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v648), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v654), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v660), 0); + int16x4_t v666 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v336, 15), (int32x2_t){0, 0})); + int16x4_t v672 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v534, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v558), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v564), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v666), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v672), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -15542,240 +9726,128 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v975)[0])); svfloat32_t v1271 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v984)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1229), "w"(v1231)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1229), 
"w"(v1231)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1233), "w"(v1235)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1233), "w"(v1235)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v1237), "w"(v1239)); - svfloat32_t v65; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v1237), "w"(v1239)); - svfloat32_t v80; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1241), "w"(v1243)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1241), "w"(v1243)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v1245), "w"(v1247)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v1245), "w"(v1247)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v1249), "w"(v1251)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v1249), "w"(v1251)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1253), "w"(v1255)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1253), "w"(v1255)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v1257), "w"(v1259)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v1257), "w"(v1259)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v1261), "w"(v1263)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v1261), "w"(v1263)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1265), "w"(v1267)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v1265), "w"(v1267)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v1269), "w"(v1271)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v1269), "w"(v1271)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v48), "w"(v192)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v64), "w"(v176)); - svfloat32_t v196; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v80), "w"(v160)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v96), "w"(v144)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v112), "w"(v128)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v48), "w"(v192)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v64), "w"(v176)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v80), "w"(v160)); - svfloat32_t v202; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v96), "w"(v144)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v112), "w"(v128)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v49), "w"(v193)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v65), "w"(v177)); - svfloat32_t v405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v81), "w"(v161)); - svfloat32_t v406; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v97), "w"(v145)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v113), "w"(v129)); - svfloat32_t v408; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v49), "w"(v193)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v65), "w"(v177)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v81), "w"(v161)); - svfloat32_t v411; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v97), "w"(v145)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v113), "w"(v129)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v194), "w"(v195)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v196), "w"(v198)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v200), "w"(v201)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v199), "w"(v203)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v195), "w"(v197)); - 
svfloat32_t v214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v194), "w"(v197)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v195), "w"(v194)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v198), "w"(v197)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v196), "w"(v197)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v198), "w"(v196)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v195), "w"(v198)); - svfloat32_t v220; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v194), "w"(v196)); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v200), "w"(v202)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v199), "w"(v202)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v199), "w"(v200)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v202), "w"(v203)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v201), "w"(v202)); - svfloat32_t v227; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v201), "w"(v203)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v200), "w"(v203)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v199), "w"(v201)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v403), "w"(v404)); - svfloat32_t v414; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v405), "w"(v407)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v409), "w"(v410)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v408), "w"(v412)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v404), "w"(v406)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v403), "w"(v406)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v404), "w"(v403)); - svfloat32_t v425; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v425) : 
"w"(v407), "w"(v406)); - svfloat32_t v426; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v405), "w"(v406)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v407), "w"(v405)); - svfloat32_t v428; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v404), "w"(v407)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v403), "w"(v405)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v409), "w"(v411)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v408), "w"(v411)); - svfloat32_t v433; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v408), "w"(v409)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v411), "w"(v412)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v410), "w"(v411)); - svfloat32_t v436; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v410), "w"(v412)); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v409), "w"(v412)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v408), "w"(v410)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v197), "w"(v204)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v207), "w"(v208)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v205), "w"(v204)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v207), "w"(v208)); - svfloat32_t v257; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v214), "w"(v1012)); - svfloat32_t v262; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v215), "w"(v1013)); - svfloat32_t v272; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v217), "w"(v1015)); - svfloat32_t v277; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v218), "w"(v1016)); - svfloat32_t zero299; - asm volatile("mov %0.s, #0" : "=w"(zero299)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1229, v1231); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1229, v1231); + svfloat32_t v48 
= svadd_f32_x(svptrue_b32(), v1233, v1235); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1233, v1235); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v1237, v1239); + svfloat32_t v65 = svsub_f32_x(svptrue_b32(), v1237, v1239); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1241, v1243); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1241, v1243); + svfloat32_t v96 = svadd_f32_x(svptrue_b32(), v1245, v1247); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v1245, v1247); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v1249, v1251); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v1249, v1251); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1253, v1255); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1253, v1255); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v1257, v1259); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v1257, v1259); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v1261, v1263); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v1261, v1263); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v1265, v1267); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v1265, v1267); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v1269, v1271); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v1269, v1271); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v48, v192); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v64, v176); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v80, v160); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v112, v128); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v48, v192); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v64, v176); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v80, v160); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v96, v144); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v112, v128); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v49, v193); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v65, v177); + svfloat32_t v405 = 
svadd_f32_x(svptrue_b32(), v81, v161); + svfloat32_t v406 = svadd_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v49, v193); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v65, v177); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v81, v161); + svfloat32_t v411 = svsub_f32_x(svptrue_b32(), v97, v145); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v113, v129); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v194, v195); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v196, v198); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v200, v201); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v199, v203); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v194, v197); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v195, v194); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v198, v197); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v196, v197); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v198, v196); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v195, v198); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v194, v196); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v200, v202); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v199, v202); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v202, v203); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v201, v202); + svfloat32_t v227 = svsub_f32_x(svptrue_b32(), v201, v203); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v200, v203); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v403, v404); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v405, v407); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v409, v410); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v408, v412); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v404, v406); + svfloat32_t 
v423 = svsub_f32_x(svptrue_b32(), v403, v406); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v404, v403); + svfloat32_t v425 = svsub_f32_x(svptrue_b32(), v407, v406); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v405, v406); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v407, v405); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v404, v407); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v403, v405); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v408, v411); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v408, v409); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v411, v412); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v410, v411); + svfloat32_t v436 = svsub_f32_x(svptrue_b32(), v410, v412); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v409, v412); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v408, v410); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v197, v204); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v207, v208); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v205, v204); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v207, v208); + svfloat32_t v257 = svmul_f32_x(svptrue_b32(), v214, v1012); + svfloat32_t v262 = svmul_f32_x(svptrue_b32(), v215, v1013); + svfloat32_t v272 = svmul_f32_x(svptrue_b32(), v217, v1015); + svfloat32_t v277 = svmul_f32_x(svptrue_b32(), v218, v1016); + svfloat32_t zero299 = svdup_n_f32(0); svfloat32_t v299 = svcmla_f32_x(pred_full, zero299, v1020, v222, 90); - svfloat32_t zero313; - asm volatile("mov %0.s, #0" : "=w"(zero313)); + svfloat32_t zero313 = svdup_n_f32(0); svfloat32_t v313 = svcmla_f32_x(pred_full, zero313, v1022, v224, 90); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v1023, v225, 90); - svfloat32_t zero334; - asm volatile("mov %0.s, #0" : "=w"(zero334)); + svfloat32_t zero334 = svdup_n_f32(0); svfloat32_t v334 = 
svcmla_f32_x(pred_full, zero334, v1025, v227, 90); - svfloat32_t zero341; - asm volatile("mov %0.s, #0" : "=w"(zero341)); + svfloat32_t zero341 = svdup_n_f32(0); svfloat32_t v341 = svcmla_f32_x(pred_full, zero341, v1026, v228, 90); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v406), "w"(v413)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v416), "w"(v417)); - svfloat32_t v430; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v414), "w"(v413)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v416), "w"(v417)); - svfloat32_t v466; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v423), "w"(v1012)); - svfloat32_t v471; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v424), "w"(v1013)); - svfloat32_t v481; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v426), "w"(v1015)); - svfloat32_t v486; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v427), "w"(v1016)); - svfloat32_t zero508; - asm volatile("mov %0.s, #0" : "=w"(zero508)); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v406, v413); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v416, v417); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v414, v413); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v416, v417); + svfloat32_t v466 = svmul_f32_x(svptrue_b32(), v423, v1012); + svfloat32_t v471 = svmul_f32_x(svptrue_b32(), v424, v1013); + svfloat32_t v481 = svmul_f32_x(svptrue_b32(), v426, v1015); + svfloat32_t v486 = svmul_f32_x(svptrue_b32(), v427, v1016); + svfloat32_t zero508 = svdup_n_f32(0); svfloat32_t v508 = svcmla_f32_x(pred_full, zero508, v1020, v431, 90); - svfloat32_t zero522; - asm volatile("mov %0.s, #0" : "=w"(zero522)); + svfloat32_t zero522 = svdup_n_f32(0); svfloat32_t v522 = svcmla_f32_x(pred_full, zero522, v1022, v433, 90); - svfloat32_t zero529; - asm volatile("mov %0.s, #0" : "=w"(zero529)); + svfloat32_t zero529 = svdup_n_f32(0); svfloat32_t v529 = svcmla_f32_x(pred_full, zero529, v1023, v434, 90); - svfloat32_t zero543; - 
asm volatile("mov %0.s, #0" : "=w"(zero543)); + svfloat32_t zero543 = svdup_n_f32(0); svfloat32_t v543 = svcmla_f32_x(pred_full, zero543, v1025, v436, 90); - svfloat32_t zero550; - asm volatile("mov %0.s, #0" : "=w"(zero550)); + svfloat32_t zero550 = svdup_n_f32(0); svfloat32_t v550 = svcmla_f32_x(pred_full, zero550, v1026, v437, 90); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v206), "w"(v205)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v211), "w"(v202)); - svfloat32_t v292; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v221), "w"(v1019)); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v206, v205); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v211, v202); + svfloat32_t v292 = svmul_f32_x(svptrue_b32(), v221, v1019); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v1028, v230, 90); svfloat32_t v357 = svmla_f32_x(pred_full, v257, v213, v1011); svfloat32_t v358 = svmla_f32_x(pred_full, v262, v214, v1012); @@ -15784,19 +9856,13 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v361 = svmla_f32_x(pred_full, v277, v217, v1015); svfloat32_t v362 = svnmls_f32_x(pred_full, v277, v216, v1014); svfloat32_t v365 = svcmla_f32_x(pred_full, v313, v1021, v223, 90); - svfloat32_t v366; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v299), "w"(v313)); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v299, v313); svfloat32_t v367 = svcmla_f32_x(pred_full, v334, v1024, v226, 90); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v320), "w"(v334)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v415), "w"(v414)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v420), "w"(v411)); - svfloat32_t v501; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v430), "w"(v1019)); - svfloat32_t zero564; - asm volatile("mov 
%0.s, #0" : "=w"(zero564)); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v320, v334); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v415, v414); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v420, v411); + svfloat32_t v501 = svmul_f32_x(svptrue_b32(), v430, v1019); + svfloat32_t zero564 = svdup_n_f32(0); svfloat32_t v564 = svcmla_f32_x(pred_full, zero564, v1028, v439, 90); svfloat32_t v566 = svmla_f32_x(pred_full, v466, v422, v1011); svfloat32_t v567 = svmla_f32_x(pred_full, v471, v423, v1012); @@ -15805,61 +9871,39 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, svfloat32_t v570 = svmla_f32_x(pred_full, v486, v426, v1015); svfloat32_t v571 = svnmls_f32_x(pred_full, v486, v425, v1014); svfloat32_t v574 = svcmla_f32_x(pred_full, v522, v1021, v432, 90); - svfloat32_t v575; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v508), "w"(v522)); + svfloat32_t v575 = svsub_f32_x(svptrue_b32(), v508, v522); svfloat32_t v576 = svcmla_f32_x(pred_full, v543, v1024, v435, 90); - svfloat32_t v577; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v577) : "w"(v529), "w"(v543)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v32), "w"(v209)); - svfloat32_t zero247; - asm volatile("mov %0.s, #0" : "=w"(zero247)); + svfloat32_t v577 = svsub_f32_x(svptrue_b32(), v529, v543); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v32, v209); + svfloat32_t zero247 = svdup_n_f32(0); svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v1010, v212, 90); svfloat32_t v363 = svmla_f32_x(pred_full, v292, v220, v1018); svfloat32_t v364 = svmla_f32_x(pred_full, v292, v219, v1017); svfloat32_t v369 = svcmla_f32_x(pred_full, v355, v1027, v229, 90); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v341), "w"(v355)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v365), "w"(v366)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v33), "w"(v418)); - svfloat32_t zero456; - asm volatile("mov %0.s, 
#0" : "=w"(zero456)); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v341, v355); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v365, v366); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v33, v418); + svfloat32_t zero456 = svdup_n_f32(0); svfloat32_t v456 = svcmla_f32_x(pred_full, zero456, v1010, v421, 90); svfloat32_t v572 = svmla_f32_x(pred_full, v501, v429, v1018); svfloat32_t v573 = svmla_f32_x(pred_full, v501, v428, v1017); svfloat32_t v578 = svcmla_f32_x(pred_full, v564, v1027, v438, 90); - svfloat32_t v579; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v550), "w"(v564)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v574), "w"(v575)); + svfloat32_t v579 = svsub_f32_x(svptrue_b32(), v550, v564); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v574, v575); svfloat32_t v356 = svmls_f32_x(pred_full, v210, v209, v1009); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v361), "w"(v363)); - svfloat32_t v381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v247), "w"(v367)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v369), "w"(v365)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v247), "w"(v370)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v370), "w"(v366)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v389), "w"(v367)); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v361, v363); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v247, v367); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v369, v365); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v247, v370); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v370, v366); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v367); svfloat32_t v565 = svmls_f32_x(pred_full, v419, v418, v1009); - svfloat32_t v580; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v570), "w"(v572)); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v456), "w"(v576)); - 
svfloat32_t v592; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v578), "w"(v574)); - svfloat32_t v594; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v456), "w"(v579)); - svfloat32_t v596; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v579), "w"(v575)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v598), "w"(v576)); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v570, v572); + svfloat32_t v590 = svadd_f32_x(svptrue_b32(), v456, v576); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v578, v574); + svfloat32_t v594 = svadd_f32_x(svptrue_b32(), v456, v579); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v579, v575); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v576); svint16_t v614 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v210, (float)(1ULL << 31ULL)))), @@ -15870,108 +9914,58 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu22(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v419, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v371), "w"(v356)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v356), "w"(v358)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v356), "w"(v362)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v356), "w"(v359)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v356), "w"(v357)); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v381), "w"(v369)); - svfloat32_t v384; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v383), "w"(v247)); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v385), "w"(v368)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v387), "w"(v247)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v390), "w"(v368)); - 
svfloat32_t v581; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v581) : "w"(v580), "w"(v565)); - svfloat32_t v582; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v582) : "w"(v565), "w"(v567)); - svfloat32_t v584; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v584) : "w"(v565), "w"(v571)); - svfloat32_t v586; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v565), "w"(v568)); - svfloat32_t v588; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v565), "w"(v566)); - svfloat32_t v591; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v590), "w"(v578)); - svfloat32_t v593; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v592), "w"(v456)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v594), "w"(v577)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v596), "w"(v456)); - svfloat32_t v600; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v599), "w"(v577)); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v371, v356); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v356, v362); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v356, v359); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v356, v357); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v381, v369); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v383, v247); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v385, v368); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v387, v247); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v390, v368); + svfloat32_t v581 = svadd_f32_x(svptrue_b32(), v580, v565); + svfloat32_t v582 = svsub_f32_x(svptrue_b32(), v565, v567); + svfloat32_t v584 = svadd_f32_x(svptrue_b32(), v565, v571); + svfloat32_t v586 = svsub_f32_x(svptrue_b32(), v565, v568); + svfloat32_t v588 = svadd_f32_x(svptrue_b32(), v565, v566); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v590, v578); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v592, v456); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v594, v577); + svfloat32_t v597 = 
svsub_f32_x(svptrue_b32(), v596, v456); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v599, v577); svst1w_u64(pred_full, (unsigned *)(v1036), svreinterpret_u64_s16(v614)); svst1w_u64(pred_full, (unsigned *)(v1045), svreinterpret_u64_s16(v622)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v373), "w"(v363)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v375), "w"(v364)); - svfloat32_t v378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v377), "w"(v364)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v379), "w"(v360)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v247)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v372), "w"(v382)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v372), "w"(v382)); - svfloat32_t v583; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v583) : "w"(v582), "w"(v572)); - svfloat32_t v585; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v584), "w"(v573)); - svfloat32_t v587; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v587) : "w"(v586), "w"(v573)); - svfloat32_t v589; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v589) : "w"(v588), "w"(v569)); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v600), "w"(v456)); - svfloat32_t v603; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v581), "w"(v591)); - svfloat32_t v610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v581), "w"(v591)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v380), "w"(v392)); - svfloat32_t v395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v374), "w"(v384)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v376), "w"(v386)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v378), "w"(v388)); - svfloat32_t v398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v378), "w"(v388)); - svfloat32_t v399; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v376), 
"w"(v386)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v374), "w"(v384)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v380), "w"(v392)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v589), "w"(v601)); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v583), "w"(v593)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v585), "w"(v595)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v587), "w"(v597)); - svfloat32_t v607; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v587), "w"(v597)); - svfloat32_t v608; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v585), "w"(v595)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v583), "w"(v593)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v589), "w"(v601)); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v373, v363); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v375, v364); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v377, v364); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v379, v360); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, v247); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v372, v382); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v372, v382); + svfloat32_t v583 = svsub_f32_x(svptrue_b32(), v582, v572); + svfloat32_t v585 = svadd_f32_x(svptrue_b32(), v584, v573); + svfloat32_t v587 = svsub_f32_x(svptrue_b32(), v586, v573); + svfloat32_t v589 = svsub_f32_x(svptrue_b32(), v588, v569); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v600, v456); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v581, v591); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v380, v392); + svfloat32_t v395 = svadd_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v376, v386); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v378, v388); + 
svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v378, v388); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v376, v386); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v374, v384); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v380, v392); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v589, v601); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v585, v595); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v587, v597); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v587, v597); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v585, v595); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v583, v593); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v589, v601); svint16_t v646 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v401, (float)(1ULL << 31ULL)))), @@ -16106,614 +10100,290 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v660 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v257 = 1.0000000000000000e+00F; - float v258 = -1.0000000000000000e+00F; - float v266 = -7.0710678118654746e-01F; - float v274 = 7.0710678118654757e-01F; - float v332 = -1.4999999999999998e+00F; - float v333 = 1.4999999999999998e+00F; - float v341 = 1.0606601717798210e+00F; - float v349 = -1.0606601717798212e+00F; - float v408 = 8.6602540378443871e-01F; - float v417 = -8.6602540378443871e-01F; - float v426 = 6.1237243569579458e-01F; - float v427 = -6.1237243569579458e-01F; - float32x2_t v429 = (float32x2_t){v4, v4}; - const float32x2_t *v1288 = &v5[istride]; - int32_t *v1451 = &v6[ostride]; - float32x2_t v259 = (float32x2_t){v257, v258}; - float32x2_t v267 = (float32x2_t){v274, v266}; - float32x2_t v275 = (float32x2_t){v274, v274}; - float32x2_t v329 = (float32x2_t){v332, v332}; - float32x2_t v334 = (float32x2_t){v332, 
v333}; - float32x2_t v342 = (float32x2_t){v349, v341}; - float32x2_t v350 = (float32x2_t){v349, v349}; - float32x2_t v410 = (float32x2_t){v408, v417}; - float32x2_t v418 = (float32x2_t){v417, v417}; - float32x2_t v423 = (float32x2_t){v427, v427}; - float32x2_t v428 = (float32x2_t){v426, v427}; - const float32x2_t *v1216 = &v5[0]; - int32_t *v1415 = &v6[0]; - float32x4_t v1646 = vld1q_f32((const float32_t *)v1288); - float32x2_t v261 = vmul_f32(v429, v259); - float32x2_t v269 = vmul_f32(v429, v267); - float32x4_t v276 = vcombine_f32(v275, v275); - float32x4_t v330 = vcombine_f32(v329, v329); - float32x2_t v336 = vmul_f32(v429, v334); - float32x2_t v344 = vmul_f32(v429, v342); - float32x4_t v351 = vcombine_f32(v350, v350); - float32x2_t v412 = vmul_f32(v429, v410); - float32x4_t v419 = vcombine_f32(v418, v418); - float32x4_t v424 = vcombine_f32(v423, v423); - float32x2_t v430 = vmul_f32(v429, v428); - const float32x2_t *v1197 = &v5[istride * 8]; - const float32x2_t *v1206 = &v5[istride * 16]; - const float32x2_t *v1225 = &v5[istride * 11]; - const float32x2_t *v1234 = &v5[istride * 19]; - const float32x2_t *v1243 = &v5[istride * 3]; - const float32x2_t *v1252 = &v5[istride * 14]; - const float32x2_t *v1261 = &v5[istride * 22]; - const float32x2_t *v1270 = &v5[istride * 6]; - const float32x2_t *v1279 = &v5[istride * 17]; - const float32x2_t *v1297 = &v5[istride * 9]; - const float32x2_t *v1306 = &v5[istride * 20]; - const float32x2_t *v1315 = &v5[istride * 4]; - const float32x2_t *v1324 = &v5[istride * 12]; - const float32x2_t *v1333 = &v5[istride * 23]; - const float32x2_t *v1342 = &v5[istride * 7]; - const float32x2_t *v1351 = &v5[istride * 15]; - const float32x2_t *v1360 = &v5[istride * 2]; - const float32x2_t *v1369 = &v5[istride * 10]; - const float32x2_t *v1378 = &v5[istride * 18]; - const float32x2_t *v1387 = &v5[istride * 5]; - const float32x2_t *v1396 = &v5[istride * 13]; - const float32x2_t *v1405 = &v5[istride * 21]; - int32_t *v1424 = &v6[ostride * 16]; - 
int32_t *v1433 = &v6[ostride * 8]; - int32_t *v1442 = &v6[ostride * 9]; - int32_t *v1460 = &v6[ostride * 17]; - int32_t *v1469 = &v6[ostride * 18]; - int32_t *v1478 = &v6[ostride * 10]; - int32_t *v1487 = &v6[ostride * 2]; - int32_t *v1496 = &v6[ostride * 3]; - int32_t *v1505 = &v6[ostride * 19]; - int32_t *v1514 = &v6[ostride * 11]; - int32_t *v1523 = &v6[ostride * 12]; - int32_t *v1532 = &v6[ostride * 4]; - int32_t *v1541 = &v6[ostride * 20]; - int32_t *v1550 = &v6[ostride * 21]; - int32_t *v1559 = &v6[ostride * 13]; - int32_t *v1568 = &v6[ostride * 5]; - int32_t *v1577 = &v6[ostride * 6]; - int32_t *v1586 = &v6[ostride * 22]; - int32_t *v1595 = &v6[ostride * 14]; - int32_t *v1604 = &v6[ostride * 15]; - int32_t *v1613 = &v6[ostride * 7]; - int32_t *v1622 = &v6[ostride * 23]; - float32x4_t v1630 = vld1q_f32((const float32_t *)v1216); - float32x4_t v263 = vcombine_f32(v261, v261); - float32x4_t v271 = vcombine_f32(v269, v269); - float32x4_t v338 = vcombine_f32(v336, v336); - float32x4_t v346 = vcombine_f32(v344, v344); - float32x4_t v414 = vcombine_f32(v412, v412); - float32x4_t v432 = vcombine_f32(v430, v430); - float32x4_t v1626 = vld1q_f32((const float32_t *)v1197); - float32x4_t v1628 = vld1q_f32((const float32_t *)v1206); - float32x4_t v1632 = vld1q_f32((const float32_t *)v1225); - float32x4_t v1634 = vld1q_f32((const float32_t *)v1234); - float32x4_t v1636 = vld1q_f32((const float32_t *)v1243); - float32x4_t v1638 = vld1q_f32((const float32_t *)v1252); - float32x4_t v1640 = vld1q_f32((const float32_t *)v1261); - float32x4_t v1642 = vld1q_f32((const float32_t *)v1270); - float32x4_t v1644 = vld1q_f32((const float32_t *)v1279); - float32x4_t v1648 = vld1q_f32((const float32_t *)v1297); - float32x4_t v1650 = vld1q_f32((const float32_t *)v1306); - float32x4_t v1652 = vld1q_f32((const float32_t *)v1315); - float32x4_t v1654 = vld1q_f32((const float32_t *)v1324); - float32x4_t v1656 = vld1q_f32((const float32_t *)v1333); - float32x4_t v1658 = vld1q_f32((const 
float32_t *)v1342); - float32x4_t v1660 = vld1q_f32((const float32_t *)v1351); - float32x4_t v1662 = vld1q_f32((const float32_t *)v1360); - float32x4_t v1664 = vld1q_f32((const float32_t *)v1369); - float32x4_t v1666 = vld1q_f32((const float32_t *)v1378); - float32x4_t v1668 = vld1q_f32((const float32_t *)v1387); - float32x4_t v1670 = vld1q_f32((const float32_t *)v1396); - float32x4_t v1672 = vld1q_f32((const float32_t *)v1405); - float32x4_t v35 = vaddq_f32(v1626, v1628); - float32x4_t v36 = vsubq_f32(v1626, v1628); - float32x4_t v59 = vaddq_f32(v1632, v1634); - float32x4_t v60 = vsubq_f32(v1632, v1634); - float32x4_t v83 = vaddq_f32(v1638, v1640); - float32x4_t v84 = vsubq_f32(v1638, v1640); - float32x4_t v107 = vaddq_f32(v1644, v1646); - float32x4_t v108 = vsubq_f32(v1644, v1646); - float32x4_t v131 = vaddq_f32(v1650, v1652); - float32x4_t v132 = vsubq_f32(v1650, v1652); - float32x4_t v155 = vaddq_f32(v1656, v1658); - float32x4_t v156 = vsubq_f32(v1656, v1658); - float32x4_t v179 = vaddq_f32(v1662, v1664); - float32x4_t v180 = vsubq_f32(v1662, v1664); - float32x4_t v203 = vaddq_f32(v1668, v1670); - float32x4_t v204 = vsubq_f32(v1668, v1670); - float32x4_t v44 = vaddq_f32(v35, v1630); - float32x4_t v68 = vaddq_f32(v59, v1636); - float32x4_t v92 = vaddq_f32(v83, v1642); - float32x4_t v116 = vaddq_f32(v107, v1648); - float32x4_t v140 = vaddq_f32(v131, v1654); - float32x4_t v164 = vaddq_f32(v155, v1660); - float32x4_t v188 = vaddq_f32(v179, v1666); - float32x4_t v212 = vaddq_f32(v203, v1672); - float32x4_t v288 = vaddq_f32(v35, v131); - float32x4_t v289 = vsubq_f32(v35, v131); - float32x4_t v290 = vaddq_f32(v83, v179); - float32x4_t v291 = vsubq_f32(v83, v179); - float32x4_t v292 = vaddq_f32(v59, v155); - float32x4_t v293 = vsubq_f32(v59, v155); - float32x4_t v294 = vaddq_f32(v107, v203); - float32x4_t v295 = vsubq_f32(v107, v203); - float32x4_t v363 = vaddq_f32(v36, v132); - float32x4_t v364 = vsubq_f32(v36, v132); - float32x4_t v365 = vaddq_f32(v84, v180); - 
float32x4_t v366 = vsubq_f32(v84, v180); - float32x4_t v367 = vaddq_f32(v60, v156); - float32x4_t v368 = vsubq_f32(v60, v156); - float32x4_t v369 = vaddq_f32(v108, v204); - float32x4_t v370 = vsubq_f32(v108, v204); - float32x4_t v213 = vaddq_f32(v44, v140); - float32x4_t v214 = vsubq_f32(v44, v140); - float32x4_t v215 = vaddq_f32(v92, v188); - float32x4_t v216 = vsubq_f32(v92, v188); - float32x4_t v217 = vaddq_f32(v68, v164); - float32x4_t v218 = vsubq_f32(v68, v164); - float32x4_t v219 = vaddq_f32(v116, v212); - float32x4_t v220 = vsubq_f32(v116, v212); - float32x4_t v296 = vaddq_f32(v288, v290); - float32x4_t v297 = vsubq_f32(v288, v290); - float32x4_t v298 = vaddq_f32(v292, v294); - float32x4_t v299 = vsubq_f32(v292, v294); - float32x4_t v302 = vaddq_f32(v293, v295); - float32x4_t v303 = vsubq_f32(v293, v295); - float32x4_t v331 = vmulq_f32(v289, v330); - float32x4_t v337 = vrev64q_f32(v291); - float32x4_t v371 = vaddq_f32(v363, v365); - float32x4_t v372 = vsubq_f32(v363, v365); - float32x4_t v373 = vaddq_f32(v367, v369); - float32x4_t v374 = vsubq_f32(v367, v369); - float32x4_t v377 = vaddq_f32(v368, v370); - float32x4_t v378 = vsubq_f32(v368, v370); - float32x4_t v413 = vrev64q_f32(v364); - float32x4_t v420 = vmulq_f32(v366, v419); - float32x4_t v221 = vaddq_f32(v213, v215); - float32x4_t v222 = vsubq_f32(v213, v215); - float32x4_t v223 = vaddq_f32(v217, v219); - float32x4_t v224 = vsubq_f32(v217, v219); - float32x4_t v227 = vaddq_f32(v218, v220); - float32x4_t v228 = vsubq_f32(v218, v220); - float32x4_t v262 = vrev64q_f32(v216); - float32x4_t v300 = vaddq_f32(v296, v298); - float32x4_t v301 = vsubq_f32(v296, v298); - float32x4_t v318 = vmulq_f32(v297, v330); - float32x4_t v324 = vrev64q_f32(v299); - float32x4_t v339 = vmulq_f32(v337, v338); - float32x4_t v345 = vrev64q_f32(v302); - float32x4_t v352 = vmulq_f32(v303, v351); - float32x4_t v375 = vaddq_f32(v371, v373); - float32x4_t v376 = vsubq_f32(v371, v373); - float32x4_t v400 = vrev64q_f32(v372); - 
float32x4_t v407 = vmulq_f32(v374, v419); - float32x4_t v415 = vmulq_f32(v413, v414); - float32x4_t v425 = vmulq_f32(v377, v424); - float32x4_t v431 = vrev64q_f32(v378); - float32x4_t v225 = vaddq_f32(v221, v223); - float32x4_t v226 = vsubq_f32(v221, v223); - float32x4_t v249 = vrev64q_f32(v224); - float32x4_t v264 = vmulq_f32(v262, v263); - float32x4_t v270 = vrev64q_f32(v227); - float32x4_t v277 = vmulq_f32(v228, v276); - float32x4_t v308 = vmulq_f32(v300, v330); - float32x4_t v313 = vmulq_f32(v301, v330); - float32x4_t v326 = vmulq_f32(v324, v338); - float32x4_t v347 = vmulq_f32(v345, v346); - float32x4_t v355 = vaddq_f32(v331, v352); - float32x4_t v356 = vsubq_f32(v331, v352); - float32x4_t v384 = vrev64q_f32(v375); - float32x4_t v392 = vrev64q_f32(v376); - float32x4_t v402 = vmulq_f32(v400, v414); - float32x4_t v433 = vmulq_f32(v431, v432); - float32x4_t v438 = vaddq_f32(v420, v425); - float32x4_t v439 = vsubq_f32(v420, v425); - float32x4_t v251 = vmulq_f32(v249, v263); - float32x4_t v272 = vmulq_f32(v270, v271); - float32x4_t v280 = vaddq_f32(v214, v277); - float32x4_t v281 = vsubq_f32(v214, v277); - float32x4_t v353 = vaddq_f32(v318, v326); - float32x4_t v354 = vsubq_f32(v318, v326); - float32x4_t v357 = vaddq_f32(v339, v347); - float32x4_t v358 = vsubq_f32(v339, v347); - float32x4_t v386 = vmulq_f32(v384, v414); - float32x4_t v394 = vmulq_f32(v392, v414); - float32x4_t v434 = vaddq_f32(v402, v407); - float32x4_t v435 = vsubq_f32(v402, v407); - float32x4_t v436 = vaddq_f32(v415, v433); - float32x4_t v437 = vsubq_f32(v415, v433); - float32x4_t v444 = vaddq_f32(v225, v308); - int16x4_t v449 = vqmovn_s32(vcvtq_n_s32_f32(v225, 15)); - float32x4_t v552 = vaddq_f32(v226, v313); - int16x4_t v557 = vqmovn_s32(vcvtq_n_s32_f32(v226, 15)); - float32x4_t v278 = vaddq_f32(v222, v251); - float32x4_t v279 = vsubq_f32(v222, v251); - float32x4_t v282 = vaddq_f32(v264, v272); - float32x4_t v283 = vsubq_f32(v264, v272); - float32x4_t v359 = vaddq_f32(v355, v357); - float32x4_t 
v360 = vsubq_f32(v355, v357); - float32x4_t v361 = vaddq_f32(v356, v358); - float32x4_t v362 = vsubq_f32(v356, v358); - float32x4_t v440 = vaddq_f32(v436, v438); - float32x4_t v441 = vsubq_f32(v436, v438); - float32x4_t v442 = vaddq_f32(v437, v439); - float32x4_t v443 = vsubq_f32(v437, v439); - float32x4_t v445 = vaddq_f32(v444, v386); - float32x4_t v446 = vsubq_f32(v444, v386); - float32x4_t v553 = vaddq_f32(v552, v394); - float32x4_t v554 = vsubq_f32(v552, v394); - vst1_s16((int16_t *)v1415, v449); - vst1_s16((int16_t *)v1523, v557); - float32x4_t v284 = vaddq_f32(v280, v282); - float32x4_t v285 = vsubq_f32(v280, v282); - float32x4_t v286 = vaddq_f32(v281, v283); - float32x4_t v287 = vsubq_f32(v281, v283); - int16x4_t v457 = vqmovn_s32(vcvtq_n_s32_f32(v446, 15)); - int16x4_t v465 = vqmovn_s32(vcvtq_n_s32_f32(v445, 15)); - float32x4_t v498 = vaddq_f32(v279, v354); - int16x4_t v503 = vqmovn_s32(vcvtq_n_s32_f32(v279, 15)); - int16x4_t v565 = vqmovn_s32(vcvtq_n_s32_f32(v554, 15)); - int16x4_t v573 = vqmovn_s32(vcvtq_n_s32_f32(v553, 15)); - float32x4_t v606 = vaddq_f32(v278, v353); - int16x4_t v611 = vqmovn_s32(vcvtq_n_s32_f32(v278, 15)); - float32x4_t v471 = vaddq_f32(v285, v360); - int16x4_t v476 = vqmovn_s32(vcvtq_n_s32_f32(v285, 15)); - float32x4_t v499 = vaddq_f32(v498, v435); - float32x4_t v500 = vsubq_f32(v498, v435); - float32x4_t v525 = vaddq_f32(v286, v361); - int16x4_t v530 = vqmovn_s32(vcvtq_n_s32_f32(v286, 15)); - float32x4_t v579 = vaddq_f32(v287, v362); - int16x4_t v584 = vqmovn_s32(vcvtq_n_s32_f32(v287, 15)); - float32x4_t v607 = vaddq_f32(v606, v434); - float32x4_t v608 = vsubq_f32(v606, v434); - float32x4_t v633 = vaddq_f32(v284, v359); - int16x4_t v638 = vqmovn_s32(vcvtq_n_s32_f32(v284, 15)); - vst1_s16((int16_t *)v1424, v457); - vst1_s16((int16_t *)v1433, v465); - vst1_s16((int16_t *)v1469, v503); - vst1_s16((int16_t *)v1532, v565); - vst1_s16((int16_t *)v1541, v573); - vst1_s16((int16_t *)v1577, v611); - float32x4_t v472 = vaddq_f32(v471, v441); - 
float32x4_t v473 = vsubq_f32(v471, v441); - int16x4_t v511 = vqmovn_s32(vcvtq_n_s32_f32(v500, 15)); - int16x4_t v519 = vqmovn_s32(vcvtq_n_s32_f32(v499, 15)); - float32x4_t v526 = vaddq_f32(v525, v442); - float32x4_t v527 = vsubq_f32(v525, v442); - float32x4_t v580 = vaddq_f32(v579, v443); - float32x4_t v581 = vsubq_f32(v579, v443); - int16x4_t v619 = vqmovn_s32(vcvtq_n_s32_f32(v608, 15)); - int16x4_t v627 = vqmovn_s32(vcvtq_n_s32_f32(v607, 15)); - float32x4_t v634 = vaddq_f32(v633, v440); - float32x4_t v635 = vsubq_f32(v633, v440); - vst1_s16((int16_t *)v1442, v476); - vst1_s16((int16_t *)v1496, v530); - vst1_s16((int16_t *)v1550, v584); - vst1_s16((int16_t *)v1604, v638); - int16x4_t v484 = vqmovn_s32(vcvtq_n_s32_f32(v473, 15)); - int16x4_t v492 = vqmovn_s32(vcvtq_n_s32_f32(v472, 15)); - int16x4_t v538 = vqmovn_s32(vcvtq_n_s32_f32(v527, 15)); - int16x4_t v546 = vqmovn_s32(vcvtq_n_s32_f32(v526, 15)); - int16x4_t v592 = vqmovn_s32(vcvtq_n_s32_f32(v581, 15)); - int16x4_t v600 = vqmovn_s32(vcvtq_n_s32_f32(v580, 15)); - int16x4_t v646 = vqmovn_s32(vcvtq_n_s32_f32(v635, 15)); - int16x4_t v654 = vqmovn_s32(vcvtq_n_s32_f32(v634, 15)); - vst1_s16((int16_t *)v1478, v511); - vst1_s16((int16_t *)v1487, v519); - vst1_s16((int16_t *)v1586, v619); - vst1_s16((int16_t *)v1595, v627); - vst1_s16((int16_t *)v1451, v484); - vst1_s16((int16_t *)v1460, v492); - vst1_s16((int16_t *)v1505, v538); - vst1_s16((int16_t *)v1514, v546); - vst1_s16((int16_t *)v1559, v592); - vst1_s16((int16_t *)v1568, v600); - vst1_s16((int16_t *)v1613, v646); - vst1_s16((int16_t *)v1622, v654); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v660 * 2; j < howmany; j += 1) { - float32x2_t v731 = v5[istride]; - float v851 = 1.0000000000000000e+00F; - float v852 = -1.0000000000000000e+00F; - float v859 = -7.0710678118654746e-01F; - float v866 = 7.0710678118654757e-01F; - float v918 = -1.4999999999999998e+00F; - float v919 = 1.4999999999999998e+00F; - float v926 = 1.0606601717798210e+00F; - float v933 = 
-1.0606601717798212e+00F; - float v987 = 8.6602540378443871e-01F; - float v995 = -8.6602540378443871e-01F; - float v1002 = 6.1237243569579458e-01F; - float v1003 = -6.1237243569579458e-01F; - float32x2_t v1005 = (float32x2_t){v4, v4}; - float32x2_t v684 = v5[0]; - float32x2_t v853 = (float32x2_t){v851, v852}; - float32x2_t v860 = (float32x2_t){v866, v859}; - float32x2_t v867 = (float32x2_t){v866, v866}; - float32x2_t v916 = (float32x2_t){v918, v918}; - float32x2_t v920 = (float32x2_t){v918, v919}; - float32x2_t v927 = (float32x2_t){v933, v926}; - float32x2_t v934 = (float32x2_t){v933, v933}; - float32x2_t v989 = (float32x2_t){v987, v995}; - float32x2_t v996 = (float32x2_t){v995, v995}; - float32x2_t v1000 = (float32x2_t){v1003, v1003}; - float32x2_t v1004 = (float32x2_t){v1002, v1003}; - float32x2_t v672 = v5[istride * 8]; - float32x2_t v677 = v5[istride * 16]; - float32x2_t v690 = v5[istride * 11]; - float32x2_t v695 = v5[istride * 19]; - float32x2_t v702 = v5[istride * 3]; - float32x2_t v708 = v5[istride * 14]; - float32x2_t v713 = v5[istride * 22]; - float32x2_t v720 = v5[istride * 6]; - float32x2_t v726 = v5[istride * 17]; - float32x2_t v738 = v5[istride * 9]; - float32x2_t v744 = v5[istride * 20]; - float32x2_t v749 = v5[istride * 4]; - float32x2_t v756 = v5[istride * 12]; - float32x2_t v762 = v5[istride * 23]; - float32x2_t v767 = v5[istride * 7]; - float32x2_t v774 = v5[istride * 15]; - float32x2_t v780 = v5[istride * 2]; - float32x2_t v785 = v5[istride * 10]; - float32x2_t v792 = v5[istride * 18]; - float32x2_t v798 = v5[istride * 5]; - float32x2_t v803 = v5[istride * 13]; - float32x2_t v810 = v5[istride * 21]; - float32x2_t v855 = vmul_f32(v1005, v853); - float32x2_t v862 = vmul_f32(v1005, v860); - float32x2_t v922 = vmul_f32(v1005, v920); - float32x2_t v929 = vmul_f32(v1005, v927); - float32x2_t v991 = vmul_f32(v1005, v989); - float32x2_t v1006 = vmul_f32(v1005, v1004); - float32x2_t v678 = vadd_f32(v672, v677); - float32x2_t v679 = vsub_f32(v672, v677); 
- float32x2_t v696 = vadd_f32(v690, v695); - float32x2_t v697 = vsub_f32(v690, v695); - float32x2_t v714 = vadd_f32(v708, v713); - float32x2_t v715 = vsub_f32(v708, v713); - float32x2_t v732 = vadd_f32(v726, v731); - float32x2_t v733 = vsub_f32(v726, v731); - float32x2_t v750 = vadd_f32(v744, v749); - float32x2_t v751 = vsub_f32(v744, v749); - float32x2_t v768 = vadd_f32(v762, v767); - float32x2_t v769 = vsub_f32(v762, v767); - float32x2_t v786 = vadd_f32(v780, v785); - float32x2_t v787 = vsub_f32(v780, v785); - float32x2_t v804 = vadd_f32(v798, v803); - float32x2_t v805 = vsub_f32(v798, v803); - float32x2_t v685 = vadd_f32(v678, v684); - float32x2_t v703 = vadd_f32(v696, v702); - float32x2_t v721 = vadd_f32(v714, v720); - float32x2_t v739 = vadd_f32(v732, v738); - float32x2_t v757 = vadd_f32(v750, v756); - float32x2_t v775 = vadd_f32(v768, v774); - float32x2_t v793 = vadd_f32(v786, v792); - float32x2_t v811 = vadd_f32(v804, v810); - float32x2_t v879 = vadd_f32(v678, v750); - float32x2_t v880 = vsub_f32(v678, v750); - float32x2_t v881 = vadd_f32(v714, v786); - float32x2_t v882 = vsub_f32(v714, v786); - float32x2_t v883 = vadd_f32(v696, v768); - float32x2_t v884 = vsub_f32(v696, v768); - float32x2_t v885 = vadd_f32(v732, v804); - float32x2_t v886 = vsub_f32(v732, v804); - float32x2_t v946 = vadd_f32(v679, v751); - float32x2_t v947 = vsub_f32(v679, v751); - float32x2_t v948 = vadd_f32(v715, v787); - float32x2_t v949 = vsub_f32(v715, v787); - float32x2_t v950 = vadd_f32(v697, v769); - float32x2_t v951 = vsub_f32(v697, v769); - float32x2_t v952 = vadd_f32(v733, v805); - float32x2_t v953 = vsub_f32(v733, v805); - float32x2_t v812 = vadd_f32(v685, v757); - float32x2_t v813 = vsub_f32(v685, v757); - float32x2_t v814 = vadd_f32(v721, v793); - float32x2_t v815 = vsub_f32(v721, v793); - float32x2_t v816 = vadd_f32(v703, v775); - float32x2_t v817 = vsub_f32(v703, v775); - float32x2_t v818 = vadd_f32(v739, v811); - float32x2_t v819 = vsub_f32(v739, v811); - float32x2_t v887 = 
vadd_f32(v879, v881); - float32x2_t v888 = vsub_f32(v879, v881); - float32x2_t v889 = vadd_f32(v883, v885); - float32x2_t v890 = vsub_f32(v883, v885); - float32x2_t v893 = vadd_f32(v884, v886); - float32x2_t v894 = vsub_f32(v884, v886); - float32x2_t v917 = vmul_f32(v880, v916); - float32x2_t v923 = vrev64_f32(v882); - float32x2_t v954 = vadd_f32(v946, v948); - float32x2_t v955 = vsub_f32(v946, v948); - float32x2_t v956 = vadd_f32(v950, v952); - float32x2_t v957 = vsub_f32(v950, v952); - float32x2_t v960 = vadd_f32(v951, v953); - float32x2_t v961 = vsub_f32(v951, v953); - float32x2_t v992 = vrev64_f32(v947); - float32x2_t v997 = vmul_f32(v949, v996); - float32x2_t v820 = vadd_f32(v812, v814); - float32x2_t v821 = vsub_f32(v812, v814); - float32x2_t v822 = vadd_f32(v816, v818); - float32x2_t v823 = vsub_f32(v816, v818); - float32x2_t v826 = vadd_f32(v817, v819); - float32x2_t v827 = vsub_f32(v817, v819); - float32x2_t v856 = vrev64_f32(v815); - float32x2_t v891 = vadd_f32(v887, v889); - float32x2_t v892 = vsub_f32(v887, v889); - float32x2_t v906 = vmul_f32(v888, v916); - float32x2_t v912 = vrev64_f32(v890); - float32x2_t v924 = vmul_f32(v923, v922); - float32x2_t v930 = vrev64_f32(v893); - float32x2_t v935 = vmul_f32(v894, v934); - float32x2_t v958 = vadd_f32(v954, v956); - float32x2_t v959 = vsub_f32(v954, v956); - float32x2_t v981 = vrev64_f32(v955); - float32x2_t v986 = vmul_f32(v957, v996); - float32x2_t v993 = vmul_f32(v992, v991); - float32x2_t v1001 = vmul_f32(v960, v1000); - float32x2_t v1007 = vrev64_f32(v961); - float32x2_t v824 = vadd_f32(v820, v822); - float32x2_t v825 = vsub_f32(v820, v822); - float32x2_t v845 = vrev64_f32(v823); - float32x2_t v857 = vmul_f32(v856, v855); - float32x2_t v863 = vrev64_f32(v826); - float32x2_t v868 = vmul_f32(v827, v867); - float32x2_t v898 = vmul_f32(v891, v916); - float32x2_t v902 = vmul_f32(v892, v916); - float32x2_t v913 = vmul_f32(v912, v922); - float32x2_t v931 = vmul_f32(v930, v929); - float32x2_t v938 = 
vadd_f32(v917, v935); - float32x2_t v939 = vsub_f32(v917, v935); - float32x2_t v967 = vrev64_f32(v958); - float32x2_t v974 = vrev64_f32(v959); - float32x2_t v982 = vmul_f32(v981, v991); - float32x2_t v1008 = vmul_f32(v1007, v1006); - float32x2_t v1013 = vadd_f32(v997, v1001); - float32x2_t v1014 = vsub_f32(v997, v1001); - float32x2_t v846 = vmul_f32(v845, v855); - float32x2_t v864 = vmul_f32(v863, v862); - float32x2_t v871 = vadd_f32(v813, v868); - float32x2_t v872 = vsub_f32(v813, v868); - float32x2_t v936 = vadd_f32(v906, v913); - float32x2_t v937 = vsub_f32(v906, v913); - float32x2_t v940 = vadd_f32(v924, v931); - float32x2_t v941 = vsub_f32(v924, v931); - float32x2_t v968 = vmul_f32(v967, v991); - float32x2_t v975 = vmul_f32(v974, v991); - float32x2_t v1009 = vadd_f32(v982, v986); - float32x2_t v1010 = vsub_f32(v982, v986); - float32x2_t v1011 = vadd_f32(v993, v1008); - float32x2_t v1012 = vsub_f32(v993, v1008); - float32x2_t v1019 = vadd_f32(v824, v898); - int16x4_t v1024 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v824, 15), (int32x2_t){0, 0})); - float32x2_t v1103 = vadd_f32(v825, v902); - int16x4_t v1108 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v825, 15), (int32x2_t){0, 0})); - float32x2_t v869 = vadd_f32(v821, v846); - float32x2_t v870 = vsub_f32(v821, v846); - float32x2_t v873 = vadd_f32(v857, v864); - float32x2_t v874 = vsub_f32(v857, v864); - float32x2_t v942 = vadd_f32(v938, v940); - float32x2_t v943 = vsub_f32(v938, v940); - float32x2_t v944 = vadd_f32(v939, v941); - float32x2_t v945 = vsub_f32(v939, v941); - float32x2_t v1015 = vadd_f32(v1011, v1013); - float32x2_t v1016 = vsub_f32(v1011, v1013); - float32x2_t v1017 = vadd_f32(v1012, v1014); - float32x2_t v1018 = vsub_f32(v1012, v1014); - float32x2_t v1020 = vadd_f32(v1019, v968); - float32x2_t v1021 = vsub_f32(v1019, v968); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1024), 0); - float32x2_t v1104 = vadd_f32(v1103, v975); - float32x2_t v1105 = vsub_f32(v1103, v975); - v6[ostride * 12] = 
vget_lane_s32(vreinterpret_s32_s16(v1108), 0); - float32x2_t v875 = vadd_f32(v871, v873); - float32x2_t v876 = vsub_f32(v871, v873); - float32x2_t v877 = vadd_f32(v872, v874); - float32x2_t v878 = vsub_f32(v872, v874); - int16x4_t v1030 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1021, 15), (int32x2_t){0, 0})); - int16x4_t v1036 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1020, 15), (int32x2_t){0, 0})); - float32x2_t v1061 = vadd_f32(v870, v937); - int16x4_t v1066 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v870, 15), (int32x2_t){0, 0})); - int16x4_t v1114 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1105, 15), (int32x2_t){0, 0})); - int16x4_t v1120 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1104, 15), (int32x2_t){0, 0})); - float32x2_t v1145 = vadd_f32(v869, v936); - int16x4_t v1150 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v869, 15), (int32x2_t){0, 0})); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1030), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1036), 0); - float32x2_t v1040 = vadd_f32(v876, v943); - int16x4_t v1045 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v876, 15), (int32x2_t){0, 0})); - float32x2_t v1062 = vadd_f32(v1061, v1010); - float32x2_t v1063 = vsub_f32(v1061, v1010); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1066), 0); - float32x2_t v1082 = vadd_f32(v877, v944); - int16x4_t v1087 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v877, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1114), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1120), 0); - float32x2_t v1124 = vadd_f32(v878, v945); - int16x4_t v1129 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v878, 15), (int32x2_t){0, 0})); - float32x2_t v1146 = vadd_f32(v1145, v1009); - float32x2_t v1147 = vsub_f32(v1145, v1009); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1150), 0); - float32x2_t v1166 = vadd_f32(v875, v942); - int16x4_t v1171 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v875, 15), (int32x2_t){0, 
0})); - float32x2_t v1041 = vadd_f32(v1040, v1016); - float32x2_t v1042 = vsub_f32(v1040, v1016); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1045), 0); - int16x4_t v1072 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1063, 15), (int32x2_t){0, 0})); - int16x4_t v1078 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1062, 15), (int32x2_t){0, 0})); - float32x2_t v1083 = vadd_f32(v1082, v1017); - float32x2_t v1084 = vsub_f32(v1082, v1017); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1087), 0); - float32x2_t v1125 = vadd_f32(v1124, v1018); - float32x2_t v1126 = vsub_f32(v1124, v1018); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1129), 0); - int16x4_t v1156 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1147, 15), (int32x2_t){0, 0})); - int16x4_t v1162 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1146, 15), (int32x2_t){0, 0})); - float32x2_t v1167 = vadd_f32(v1166, v1015); - float32x2_t v1168 = vsub_f32(v1166, v1015); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1171), 0); - int16x4_t v1051 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1042, 15), (int32x2_t){0, 0})); - int16x4_t v1057 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1041, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1072), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1078), 0); - int16x4_t v1093 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1084, 15), (int32x2_t){0, 0})); - int16x4_t v1099 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1083, 15), (int32x2_t){0, 0})); - int16x4_t v1135 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1126, 15), (int32x2_t){0, 0})); - int16x4_t v1141 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1125, 15), (int32x2_t){0, 0})); - v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1156), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1162), 0); - int16x4_t v1177 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1168, 15), (int32x2_t){0, 0})); - int16x4_t v1183 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1167, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1051), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1057), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1093), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1099), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1135), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1141), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1177), 0); - v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1183), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v79 = v5[istride]; + float v199 = 1.0000000000000000e+00F; + float v200 = -1.0000000000000000e+00F; + float v207 = -7.0710678118654746e-01F; + float v214 = 7.0710678118654757e-01F; + float v266 = -1.4999999999999998e+00F; + float v267 = 1.4999999999999998e+00F; + float v274 = 1.0606601717798210e+00F; + float v281 = -1.0606601717798212e+00F; + float v335 = 8.6602540378443871e-01F; + float v343 = -8.6602540378443871e-01F; + float v350 = 6.1237243569579458e-01F; + float v351 = -6.1237243569579458e-01F; + float32x2_t v353 = (float32x2_t){v4, v4}; + float32x2_t v32 = v5[0]; + float32x2_t v201 = (float32x2_t){v199, v200}; + float32x2_t v208 = (float32x2_t){v214, v207}; + float32x2_t v215 = (float32x2_t){v214, v214}; + float32x2_t v264 = (float32x2_t){v266, v266}; + float32x2_t v268 = (float32x2_t){v266, v267}; + float32x2_t v275 = (float32x2_t){v281, v274}; + float32x2_t v282 = (float32x2_t){v281, v281}; + float32x2_t v337 = (float32x2_t){v335, v343}; + float32x2_t v344 = (float32x2_t){v343, v343}; + float32x2_t v348 = (float32x2_t){v351, v351}; + float32x2_t v352 = (float32x2_t){v350, v351}; + float32x2_t v20 = v5[istride * 8]; + float32x2_t v25 = v5[istride * 16]; + float32x2_t v38 = v5[istride * 11]; + float32x2_t v43 = v5[istride * 19]; + float32x2_t v50 = v5[istride * 3]; + float32x2_t v56 = v5[istride * 14]; + 
float32x2_t v61 = v5[istride * 22]; + float32x2_t v68 = v5[istride * 6]; + float32x2_t v74 = v5[istride * 17]; + float32x2_t v86 = v5[istride * 9]; + float32x2_t v92 = v5[istride * 20]; + float32x2_t v97 = v5[istride * 4]; + float32x2_t v104 = v5[istride * 12]; + float32x2_t v110 = v5[istride * 23]; + float32x2_t v115 = v5[istride * 7]; + float32x2_t v122 = v5[istride * 15]; + float32x2_t v128 = v5[istride * 2]; + float32x2_t v133 = v5[istride * 10]; + float32x2_t v140 = v5[istride * 18]; + float32x2_t v146 = v5[istride * 5]; + float32x2_t v151 = v5[istride * 13]; + float32x2_t v158 = v5[istride * 21]; + float32x2_t v203 = vmul_f32(v353, v201); + float32x2_t v210 = vmul_f32(v353, v208); + float32x2_t v270 = vmul_f32(v353, v268); + float32x2_t v277 = vmul_f32(v353, v275); + float32x2_t v339 = vmul_f32(v353, v337); + float32x2_t v354 = vmul_f32(v353, v352); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v44 = vadd_f32(v38, v43); + float32x2_t v45 = vsub_f32(v38, v43); + float32x2_t v62 = vadd_f32(v56, v61); + float32x2_t v63 = vsub_f32(v56, v61); + float32x2_t v80 = vadd_f32(v74, v79); + float32x2_t v81 = vsub_f32(v74, v79); + float32x2_t v98 = vadd_f32(v92, v97); + float32x2_t v99 = vsub_f32(v92, v97); + float32x2_t v116 = vadd_f32(v110, v115); + float32x2_t v117 = vsub_f32(v110, v115); + float32x2_t v134 = vadd_f32(v128, v133); + float32x2_t v135 = vsub_f32(v128, v133); + float32x2_t v152 = vadd_f32(v146, v151); + float32x2_t v153 = vsub_f32(v146, v151); + float32x2_t v33 = vadd_f32(v26, v32); + float32x2_t v51 = vadd_f32(v44, v50); + float32x2_t v69 = vadd_f32(v62, v68); + float32x2_t v87 = vadd_f32(v80, v86); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v123 = vadd_f32(v116, v122); + float32x2_t v141 = vadd_f32(v134, v140); + float32x2_t v159 = vadd_f32(v152, v158); + float32x2_t v227 = vadd_f32(v26, v98); + float32x2_t v228 = vsub_f32(v26, v98); + float32x2_t v229 = vadd_f32(v62, v134); + float32x2_t v230 
= vsub_f32(v62, v134); + float32x2_t v231 = vadd_f32(v44, v116); + float32x2_t v232 = vsub_f32(v44, v116); + float32x2_t v233 = vadd_f32(v80, v152); + float32x2_t v234 = vsub_f32(v80, v152); + float32x2_t v294 = vadd_f32(v27, v99); + float32x2_t v295 = vsub_f32(v27, v99); + float32x2_t v296 = vadd_f32(v63, v135); + float32x2_t v297 = vsub_f32(v63, v135); + float32x2_t v298 = vadd_f32(v45, v117); + float32x2_t v299 = vsub_f32(v45, v117); + float32x2_t v300 = vadd_f32(v81, v153); + float32x2_t v301 = vsub_f32(v81, v153); + float32x2_t v160 = vadd_f32(v33, v105); + float32x2_t v161 = vsub_f32(v33, v105); + float32x2_t v162 = vadd_f32(v69, v141); + float32x2_t v163 = vsub_f32(v69, v141); + float32x2_t v164 = vadd_f32(v51, v123); + float32x2_t v165 = vsub_f32(v51, v123); + float32x2_t v166 = vadd_f32(v87, v159); + float32x2_t v167 = vsub_f32(v87, v159); + float32x2_t v235 = vadd_f32(v227, v229); + float32x2_t v236 = vsub_f32(v227, v229); + float32x2_t v237 = vadd_f32(v231, v233); + float32x2_t v238 = vsub_f32(v231, v233); + float32x2_t v241 = vadd_f32(v232, v234); + float32x2_t v242 = vsub_f32(v232, v234); + float32x2_t v265 = vmul_f32(v228, v264); + float32x2_t v271 = vrev64_f32(v230); + float32x2_t v302 = vadd_f32(v294, v296); + float32x2_t v303 = vsub_f32(v294, v296); + float32x2_t v304 = vadd_f32(v298, v300); + float32x2_t v305 = vsub_f32(v298, v300); + float32x2_t v308 = vadd_f32(v299, v301); + float32x2_t v309 = vsub_f32(v299, v301); + float32x2_t v340 = vrev64_f32(v295); + float32x2_t v345 = vmul_f32(v297, v344); + float32x2_t v168 = vadd_f32(v160, v162); + float32x2_t v169 = vsub_f32(v160, v162); + float32x2_t v170 = vadd_f32(v164, v166); + float32x2_t v171 = vsub_f32(v164, v166); + float32x2_t v174 = vadd_f32(v165, v167); + float32x2_t v175 = vsub_f32(v165, v167); + float32x2_t v204 = vrev64_f32(v163); + float32x2_t v239 = vadd_f32(v235, v237); + float32x2_t v240 = vsub_f32(v235, v237); + float32x2_t v254 = vmul_f32(v236, v264); + float32x2_t v260 = 
vrev64_f32(v238); + float32x2_t v272 = vmul_f32(v271, v270); + float32x2_t v278 = vrev64_f32(v241); + float32x2_t v283 = vmul_f32(v242, v282); + float32x2_t v306 = vadd_f32(v302, v304); + float32x2_t v307 = vsub_f32(v302, v304); + float32x2_t v329 = vrev64_f32(v303); + float32x2_t v334 = vmul_f32(v305, v344); + float32x2_t v341 = vmul_f32(v340, v339); + float32x2_t v349 = vmul_f32(v308, v348); + float32x2_t v355 = vrev64_f32(v309); + float32x2_t v172 = vadd_f32(v168, v170); + float32x2_t v173 = vsub_f32(v168, v170); + float32x2_t v193 = vrev64_f32(v171); + float32x2_t v205 = vmul_f32(v204, v203); + float32x2_t v211 = vrev64_f32(v174); + float32x2_t v216 = vmul_f32(v175, v215); + float32x2_t v246 = vmul_f32(v239, v264); + float32x2_t v250 = vmul_f32(v240, v264); + float32x2_t v261 = vmul_f32(v260, v270); + float32x2_t v279 = vmul_f32(v278, v277); + float32x2_t v286 = vadd_f32(v265, v283); + float32x2_t v287 = vsub_f32(v265, v283); + float32x2_t v315 = vrev64_f32(v306); + float32x2_t v322 = vrev64_f32(v307); + float32x2_t v330 = vmul_f32(v329, v339); + float32x2_t v356 = vmul_f32(v355, v354); + float32x2_t v361 = vadd_f32(v345, v349); + float32x2_t v362 = vsub_f32(v345, v349); + float32x2_t v194 = vmul_f32(v193, v203); + float32x2_t v212 = vmul_f32(v211, v210); + float32x2_t v219 = vadd_f32(v161, v216); + float32x2_t v220 = vsub_f32(v161, v216); + float32x2_t v284 = vadd_f32(v254, v261); + float32x2_t v285 = vsub_f32(v254, v261); + float32x2_t v288 = vadd_f32(v272, v279); + float32x2_t v289 = vsub_f32(v272, v279); + float32x2_t v316 = vmul_f32(v315, v339); + float32x2_t v323 = vmul_f32(v322, v339); + float32x2_t v357 = vadd_f32(v330, v334); + float32x2_t v358 = vsub_f32(v330, v334); + float32x2_t v359 = vadd_f32(v341, v356); + float32x2_t v360 = vsub_f32(v341, v356); + float32x2_t v367 = vadd_f32(v172, v246); + int16x4_t v372 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v172, 15), (int32x2_t){0, 0})); + float32x2_t v451 = vadd_f32(v173, v250); + int16x4_t v456 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v173, 15), (int32x2_t){0, 0})); + float32x2_t v217 = vadd_f32(v169, v194); + float32x2_t v218 = vsub_f32(v169, v194); + float32x2_t v221 = vadd_f32(v205, v212); + float32x2_t v222 = vsub_f32(v205, v212); + float32x2_t v290 = vadd_f32(v286, v288); + float32x2_t v291 = vsub_f32(v286, v288); + float32x2_t v292 = vadd_f32(v287, v289); + float32x2_t v293 = vsub_f32(v287, v289); + float32x2_t v363 = vadd_f32(v359, v361); + float32x2_t v364 = vsub_f32(v359, v361); + float32x2_t v365 = vadd_f32(v360, v362); + float32x2_t v366 = vsub_f32(v360, v362); + float32x2_t v368 = vadd_f32(v367, v316); + float32x2_t v369 = vsub_f32(v367, v316); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v372), 0); + float32x2_t v452 = vadd_f32(v451, v323); + float32x2_t v453 = vsub_f32(v451, v323); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v456), 0); + float32x2_t v223 = vadd_f32(v219, v221); + float32x2_t v224 = vsub_f32(v219, v221); + float32x2_t v225 = vadd_f32(v220, v222); + float32x2_t v226 = vsub_f32(v220, v222); + int16x4_t v378 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v369, 15), (int32x2_t){0, 0})); + int16x4_t v384 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v368, 15), (int32x2_t){0, 0})); + float32x2_t v409 = vadd_f32(v218, v285); + int16x4_t v414 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v218, 15), (int32x2_t){0, 0})); + int16x4_t v462 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v453, 15), (int32x2_t){0, 0})); + int16x4_t v468 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v452, 15), (int32x2_t){0, 0})); + float32x2_t v493 = vadd_f32(v217, v284); + int16x4_t v498 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v217, 15), (int32x2_t){0, 0})); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v378), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v384), 0); + float32x2_t v388 = vadd_f32(v224, v291); + int16x4_t v393 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v224, 15), (int32x2_t){0, 0})); + float32x2_t v410 = vadd_f32(v409, 
v358); + float32x2_t v411 = vsub_f32(v409, v358); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v414), 0); + float32x2_t v430 = vadd_f32(v225, v292); + int16x4_t v435 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v225, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v462), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v468), 0); + float32x2_t v472 = vadd_f32(v226, v293); + int16x4_t v477 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v226, 15), (int32x2_t){0, 0})); + float32x2_t v494 = vadd_f32(v493, v357); + float32x2_t v495 = vsub_f32(v493, v357); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v498), 0); + float32x2_t v514 = vadd_f32(v223, v290); + int16x4_t v519 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v223, 15), (int32x2_t){0, 0})); + float32x2_t v389 = vadd_f32(v388, v364); + float32x2_t v390 = vsub_f32(v388, v364); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v393), 0); + int16x4_t v420 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v411, 15), (int32x2_t){0, 0})); + int16x4_t v426 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v410, 15), (int32x2_t){0, 0})); + float32x2_t v431 = vadd_f32(v430, v365); + float32x2_t v432 = vsub_f32(v430, v365); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v435), 0); + float32x2_t v473 = vadd_f32(v472, v366); + float32x2_t v474 = vsub_f32(v472, v366); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v477), 0); + int16x4_t v504 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v495, 15), (int32x2_t){0, 0})); + int16x4_t v510 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v494, 15), (int32x2_t){0, 0})); + float32x2_t v515 = vadd_f32(v514, v363); + float32x2_t v516 = vsub_f32(v514, v363); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v519), 0); + int16x4_t v399 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v390, 15), (int32x2_t){0, 0})); + int16x4_t v405 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v389, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = 
vget_lane_s32(vreinterpret_s32_s16(v420), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v426), 0); + int16x4_t v441 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v432, 15), (int32x2_t){0, 0})); + int16x4_t v447 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v431, 15), (int32x2_t){0, 0})); + int16x4_t v483 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v474, 15), (int32x2_t){0, 0})); + int16x4_t v489 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v473, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v504), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v510), 0); + int16x4_t v525 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v516, 15), (int32x2_t){0, 0})); + int16x4_t v531 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v515, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v399), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v405), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v441), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v447), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v483), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v489), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v525), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v531), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -16902,229 +10572,128 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v851)[0])); svfloat32_t v1151 = svreinterpret_f32_f64(svld1_f64(pred_full, &((const double *)v860)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1105), "w"(v1107)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1105), "w"(v1107)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v1111), "w"(v1113)); - svfloat32_t v57; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v1111), "w"(v1113)); - svfloat32_t v80; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v80) : "w"(v1117), "w"(v1119)); - svfloat32_t v81; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v1117), "w"(v1119)); - svfloat32_t v104; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v1123), "w"(v1125)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v1123), "w"(v1125)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v1129), "w"(v1131)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v1129), "w"(v1131)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v1135), "w"(v1137)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v1135), "w"(v1137)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1141), "w"(v1143)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v1141), "w"(v1143)); - svfloat32_t v200; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v1147), "w"(v1149)); - svfloat32_t v201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v1147), "w"(v1149)); - svfloat32_t v41; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v41) : "w"(v32), "w"(v1109)); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v56), "w"(v1115)); - svfloat32_t v89; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v80), "w"(v1121)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v104), "w"(v1127)); - svfloat32_t v137; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v128), "w"(v1133)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v152), "w"(v1139)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v176), "w"(v1145)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v200), "w"(v1151)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v32), "w"(v128)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v32), "w"(v128)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : 
"w"(v80), "w"(v176)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v80), "w"(v176)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v56), "w"(v152)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v56), "w"(v152)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v104), "w"(v200)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v104), "w"(v200)); - svfloat32_t v354; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v33), "w"(v129)); - svfloat32_t v355; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v355) : "w"(v33), "w"(v129)); - svfloat32_t v356; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v356) : "w"(v81), "w"(v177)); - svfloat32_t v357; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v81), "w"(v177)); - svfloat32_t v358; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v57), "w"(v153)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v57), "w"(v153)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v105), "w"(v201)); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v105), "w"(v201)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v41), "w"(v137)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v41), "w"(v137)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v89), "w"(v185)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v89), "w"(v185)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v65), "w"(v161)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v65), "w"(v161)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v113), "w"(v209)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v113), "w"(v209)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v282), "w"(v284)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v291) : "w"(v282), "w"(v284)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v286), "w"(v288)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v286), "w"(v288)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v287), "w"(v289)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v287), "w"(v289)); - svfloat32_t zero331; - asm volatile("mov %0.s, #0" : "=w"(zero331)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1105, v1107); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1105, v1107); + svfloat32_t v56 = svadd_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v57 = svsub_f32_x(svptrue_b32(), v1111, v1113); + svfloat32_t v80 = svadd_f32_x(svptrue_b32(), v1117, v1119); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v1117, v1119); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v1123, v1125); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v1123, v1125); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v1129, v1131); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v1129, v1131); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v1135, v1137); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v1135, v1137); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v1141, v1143); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v1141, v1143); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v1147, v1149); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v1147, v1149); + svfloat32_t v41 = svadd_f32_x(svptrue_b32(), v32, v1109); + svfloat32_t v65 = svadd_f32_x(svptrue_b32(), v56, v1115); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v80, v1121); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v104, v1127); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v128, v1133); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v152, v1139); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v176, v1145); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v200, v1151); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), 
v32, v128); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v32, v128); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v80, v176); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v80, v176); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v56, v152); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v56, v152); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v104, v200); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v104, v200); + svfloat32_t v354 = svadd_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v355 = svsub_f32_x(svptrue_b32(), v33, v129); + svfloat32_t v356 = svadd_f32_x(svptrue_b32(), v81, v177); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v81, v177); + svfloat32_t v358 = svadd_f32_x(svptrue_b32(), v57, v153); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v57, v153); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v105, v201); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v105, v201); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v41, v137); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v41, v137); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v89, v185); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v89, v185); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v65, v161); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v113, v209); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v113, v209); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v282, v284); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t zero331 = svdup_n_f32(0); svfloat32_t v331 = svcmla_f32_x(pred_full, zero331, v876, v285, 90); - svfloat32_t v362; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v354), "w"(v356)); - svfloat32_t v363; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v363) : "w"(v354), "w"(v356)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v358), "w"(v360)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v358), "w"(v360)); - svfloat32_t v368; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v359), "w"(v361)); - svfloat32_t v369; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v359), "w"(v361)); - svfloat32_t zero402; - asm volatile("mov %0.s, #0" : "=w"(zero402)); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v363 = svsub_f32_x(svptrue_b32(), v354, v356); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v358, v360); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t zero402 = svdup_n_f32(0); svfloat32_t v402 = svcmla_f32_x(pred_full, zero402, v883, v355, 90); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v210), "w"(v212)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v210), "w"(v212)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v214), "w"(v216)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v214), "w"(v216)); - svfloat32_t v224; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v215), "w"(v217)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v215), "w"(v217)); - svfloat32_t zero259; - asm volatile("mov %0.s, #0" : "=w"(zero259)); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v215, v217); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v215, v217); + svfloat32_t zero259 = svdup_n_f32(0); svfloat32_t v259 = svcmla_f32_x(pred_full, 
zero259, v868, v213, 90); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v290), "w"(v292)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v290), "w"(v292)); - svfloat32_t zero319; - asm volatile("mov %0.s, #0" : "=w"(zero319)); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v290, v292); + svfloat32_t zero319 = svdup_n_f32(0); svfloat32_t v319 = svcmla_f32_x(pred_full, zero319, v876, v293, 90); - svfloat32_t zero338; - asm volatile("mov %0.s, #0" : "=w"(zero338)); + svfloat32_t zero338 = svdup_n_f32(0); svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v877, v296, 90); - svfloat32_t v343; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v297), "w"(v878)); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v362), "w"(v364)); - svfloat32_t v367; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v362), "w"(v364)); - svfloat32_t zero390; - asm volatile("mov %0.s, #0" : "=w"(zero390)); + svfloat32_t v343 = svmul_f32_x(svptrue_b32(), v297, v878); + svfloat32_t v366 = svadd_f32_x(svptrue_b32(), v362, v364); + svfloat32_t v367 = svsub_f32_x(svptrue_b32(), v362, v364); + svfloat32_t zero390 = svdup_n_f32(0); svfloat32_t v390 = svcmla_f32_x(pred_full, zero390, v883, v363, 90); - svfloat32_t v412; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v368), "w"(v885)); - svfloat32_t zero419; - asm volatile("mov %0.s, #0" : "=w"(zero419)); + svfloat32_t v412 = svmul_f32_x(svptrue_b32(), v368, v885); + svfloat32_t zero419 = svdup_n_f32(0); svfloat32_t v419 = svcmla_f32_x(pred_full, zero419, v886, v369, 90); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v218), "w"(v220)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v218), "w"(v220)); - svfloat32_t zero247; - asm volatile("mov %0.s, #0" : "=w"(zero247)); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), 
v218, v220); + svfloat32_t zero247 = svdup_n_f32(0); svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v868, v221, 90); - svfloat32_t zero266; - asm volatile("mov %0.s, #0" : "=w"(zero266)); + svfloat32_t zero266 = svdup_n_f32(0); svfloat32_t v266 = svcmla_f32_x(pred_full, zero266, v869, v224, 90); svfloat32_t v344 = svmla_f32_x(pred_full, v319, v291, v875); svfloat32_t v345 = svnmls_f32_x(pred_full, v319, v291, v875); svfloat32_t v346 = svmla_f32_x(pred_full, v343, v283, v875); svfloat32_t v347 = svnmls_f32_x(pred_full, v343, v283, v875); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v331), "w"(v338)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v331), "w"(v338)); - svfloat32_t zero376; - asm volatile("mov %0.s, #0" : "=w"(zero376)); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v331, v338); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v331, v338); + svfloat32_t zero376 = svdup_n_f32(0); svfloat32_t v376 = svcmla_f32_x(pred_full, zero376, v883, v366, 90); - svfloat32_t zero383; - asm volatile("mov %0.s, #0" : "=w"(zero383)); + svfloat32_t zero383 = svdup_n_f32(0); svfloat32_t v383 = svcmla_f32_x(pred_full, zero383, v883, v367, 90); svfloat32_t v420 = svmla_f32_x(pred_full, v390, v365, v884); svfloat32_t v421 = svmls_f32_x(pred_full, v390, v365, v884); - svfloat32_t v422; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v402), "w"(v419)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v402), "w"(v419)); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v402, v419); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), v402, v419); svfloat32_t v424 = svmla_f32_x(pred_full, v412, v357, v884); svfloat32_t v425 = svnmls_f32_x(pred_full, v412, v357, v884); - svfloat32_t v272; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v219), "w"(v247)); - svfloat32_t v273; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v273) : "w"(v219), "w"(v247)); + svfloat32_t v272 = svadd_f32_x(svptrue_b32(), v219, v247); + svfloat32_t 
v273 = svsub_f32_x(svptrue_b32(), v219, v247); svfloat32_t v274 = svmla_f32_x(pred_full, v211, v225, v870); svfloat32_t v275 = svmls_f32_x(pred_full, v211, v225, v870); - svfloat32_t v276; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v259), "w"(v266)); - svfloat32_t v277; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v259), "w"(v266)); - svfloat32_t v350; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v350) : "w"(v346), "w"(v348)); - svfloat32_t v351; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v351) : "w"(v346), "w"(v348)); - svfloat32_t v352; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v347), "w"(v349)); - svfloat32_t v353; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v353) : "w"(v347), "w"(v349)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v422), "w"(v424)); - svfloat32_t v427; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v422), "w"(v424)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v423), "w"(v425)); - svfloat32_t v429; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v423), "w"(v425)); + svfloat32_t v276 = svadd_f32_x(svptrue_b32(), v259, v266); + svfloat32_t v277 = svsub_f32_x(svptrue_b32(), v259, v266); + svfloat32_t v350 = svadd_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v351 = svsub_f32_x(svptrue_b32(), v346, v348); + svfloat32_t v352 = svadd_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v353 = svsub_f32_x(svptrue_b32(), v347, v349); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v422, v424); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v422, v424); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v423, v425); svfloat32_t v430 = svmla_f32_x(pred_full, v222, v294, v875); svint16_t v435 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -17137,31 +10706,21 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v223, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v274), "w"(v276)); - svfloat32_t v279; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v274), "w"(v276)); - svfloat32_t v280; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v275), "w"(v277)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v275), "w"(v277)); - svfloat32_t v431; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v430), "w"(v376)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v430), "w"(v376)); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v273), "w"(v345)); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v274, v276); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v275, v277); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v430, v376); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v430, v376); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v273, v345); svint16_t v489 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v273, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v539; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v538), "w"(v383)); - svfloat32_t v540; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v538), "w"(v383)); - svfloat32_t v592; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v272), "w"(v344)); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v538, v383); + svfloat32_t v540 = svsub_f32_x(svptrue_b32(), v538, v383); + svfloat32_t v592 = svadd_f32_x(svptrue_b32(), v272, v344); svint16_t v597 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v272, (float)(1ULL << 31ULL)))), @@ -17179,19 +10738,15 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t 
*restrict x, pred_full, svmul_n_f32_x(pred_full, v431, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v279), "w"(v351)); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v279, v351); svint16_t v462 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v279, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v484), "w"(v421)); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v484), "w"(v421)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v280), "w"(v352)); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v484, v421); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v484, v421); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v280, v352); svint16_t v516 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v280, (float)(1ULL << 31ULL)))), @@ -17207,19 +10762,15 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v539, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v565; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v281), "w"(v353)); + svfloat32_t v565 = svadd_f32_x(svptrue_b32(), v281, v353); svint16_t v570 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v281, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v593; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v592), "w"(v420)); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v592), "w"(v420)); - svfloat32_t v619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v278), 
"w"(v350)); + svfloat32_t v593 = svadd_f32_x(svptrue_b32(), v592, v420); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v592, v420); + svfloat32_t v619 = svadd_f32_x(svptrue_b32(), v278, v350); svint16_t v624 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v278, (float)(1ULL << 31ULL)))), @@ -17227,10 +10778,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svst1w_u64(pred_full, (unsigned *)(v948), svreinterpret_u64_s16(v489)); svst1w_u64(pred_full, (unsigned *)(v1056), svreinterpret_u64_s16(v597)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v457), "w"(v427)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v457), "w"(v427)); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v457, v427); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v457, v427); svint16_t v497 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v486, (float)(1ULL << 31ULL)))), @@ -17241,14 +10790,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v485, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v512; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v511), "w"(v428)); - svfloat32_t v513; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v511), "w"(v428)); - svfloat32_t v566; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v565), "w"(v429)); - svfloat32_t v567; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v565), "w"(v429)); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v511, v428); + svfloat32_t v513 = svsub_f32_x(svptrue_b32(), v511, v428); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v565, v429); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v565, v429); svint16_t v605 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( 
pred_full, svmul_n_f32_x(pred_full, v594, (float)(1ULL << 31ULL)))), @@ -17259,10 +10804,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu24(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v593, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v619), "w"(v426)); - svfloat32_t v621; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v619), "w"(v426)); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v619, v426); + svfloat32_t v621 = svsub_f32_x(svptrue_b32(), v619, v426); svst1w_u64(pred_full, (unsigned *)(v903), svreinterpret_u64_s16(v443)); svst1w_u64(pred_full, (unsigned *)(v912), svreinterpret_u64_s16(v451)); svst1w_u64(pred_full, (unsigned *)(v921), svreinterpret_u64_s16(v462)); @@ -17337,1213 +10880,508 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v1726 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v916 = 0.0000000000000000e+00F; - float v1035 = 9.6858316112863108e-01F; - float v1039 = -2.4868988716485479e-01F; - float v1040 = 2.4868988716485479e-01F; - float v1208 = 8.7630668004386358e-01F; - float v1212 = -4.8175367410171532e-01F; - float v1213 = 4.8175367410171532e-01F; - float v1381 = 7.2896862742141155e-01F; - float v1385 = -6.8454710592868862e-01F; - float v1386 = 6.8454710592868862e-01F; - float v1395 = 6.2790519529313527e-02F; - float v1399 = -9.9802672842827156e-01F; - float v1400 = 9.9802672842827156e-01F; - float v1554 = 5.3582679497899655e-01F; - float v1558 = -8.4432792550201508e-01F; - float v1559 = 8.4432792550201508e-01F; - float v1568 = -4.2577929156507272e-01F; - float v1572 = -9.0482705246601947e-01F; - float v1573 = 9.0482705246601947e-01F; - float v1582 = -6.3742398974868952e-01F; - float v1586 = 
7.7051324277578936e-01F; - float v1587 = -7.7051324277578936e-01F; - float v1603 = -9.9211470131447776e-01F; - float v1607 = -1.2533323356430454e-01F; - float v1608 = 1.2533323356430454e-01F; - float v1626 = 2.5000000000000000e-01F; - float v1638 = 5.5901699437494745e-01F; - float v1650 = 6.1803398874989490e-01F; - float v1681 = 9.5105651629515353e-01F; - float v1682 = -9.5105651629515353e-01F; - float32x2_t v1684 = (float32x2_t){v4, v4}; - float v1713 = 2.0000000000000000e+00F; - const float32x2_t *v3205 = &v5[istride]; - int32_t *v3431 = &v6[ostride]; - float v919 = dir * v916; - float32x2_t v1036 = (float32x2_t){v1035, v1035}; - float32x2_t v1041 = (float32x2_t){v1039, v1040}; - float32x2_t v1209 = (float32x2_t){v1208, v1208}; - float32x2_t v1214 = (float32x2_t){v1212, v1213}; - float32x2_t v1382 = (float32x2_t){v1381, v1381}; - float32x2_t v1387 = (float32x2_t){v1385, v1386}; - float32x2_t v1396 = (float32x2_t){v1395, v1395}; - float32x2_t v1401 = (float32x2_t){v1399, v1400}; - float32x2_t v1436 = (float32x2_t){v1587, v1586}; - float32x2_t v1555 = (float32x2_t){v1554, v1554}; - float32x2_t v1560 = (float32x2_t){v1558, v1559}; - float32x2_t v1569 = (float32x2_t){v1568, v1568}; - float32x2_t v1574 = (float32x2_t){v1572, v1573}; - float32x2_t v1583 = (float32x2_t){v1582, v1582}; - float32x2_t v1588 = (float32x2_t){v1586, v1587}; - float32x2_t v1604 = (float32x2_t){v1603, v1603}; - float32x2_t v1609 = (float32x2_t){v1607, v1608}; - float32x2_t v1627 = (float32x2_t){v1626, v1626}; - float32x2_t v1639 = (float32x2_t){v1638, v1638}; - float32x2_t v1651 = (float32x2_t){v1650, v1650}; - float32x2_t v1683 = (float32x2_t){v1681, v1682}; - float32x2_t v1714 = (float32x2_t){v1713, v1713}; - const float32x2_t *v3160 = &v5[0]; - int32_t *v3386 = &v6[0]; - float32x4_t v3616 = vld1q_f32((const float32_t *)v3205); - float32x2_t v917 = (float32x2_t){v916, v919}; - float32x4_t v1037 = vcombine_f32(v1036, v1036); - float32x2_t v1043 = vmul_f32(v1684, v1041); - float32x4_t v1210 = 
vcombine_f32(v1209, v1209); - float32x2_t v1216 = vmul_f32(v1684, v1214); - float32x4_t v1383 = vcombine_f32(v1382, v1382); - float32x2_t v1389 = vmul_f32(v1684, v1387); - float32x4_t v1397 = vcombine_f32(v1396, v1396); - float32x2_t v1403 = vmul_f32(v1684, v1401); - float32x2_t v1438 = vmul_f32(v1684, v1436); - float32x4_t v1556 = vcombine_f32(v1555, v1555); - float32x2_t v1562 = vmul_f32(v1684, v1560); - float32x4_t v1570 = vcombine_f32(v1569, v1569); - float32x2_t v1576 = vmul_f32(v1684, v1574); - float32x4_t v1584 = vcombine_f32(v1583, v1583); - float32x2_t v1590 = vmul_f32(v1684, v1588); - float32x4_t v1605 = vcombine_f32(v1604, v1604); - float32x2_t v1611 = vmul_f32(v1684, v1609); - float32x4_t v1628 = vcombine_f32(v1627, v1627); - float32x4_t v1640 = vcombine_f32(v1639, v1639); - float32x4_t v1652 = vcombine_f32(v1651, v1651); - float32x2_t v1685 = vmul_f32(v1684, v1683); - float32x4_t v1715 = vcombine_f32(v1714, v1714); - const float32x2_t *v3169 = &v5[istride * 5]; - const float32x2_t *v3178 = &v5[istride * 10]; - const float32x2_t *v3187 = &v5[istride * 15]; - const float32x2_t *v3196 = &v5[istride * 20]; - const float32x2_t *v3214 = &v5[istride * 6]; - const float32x2_t *v3223 = &v5[istride * 11]; - const float32x2_t *v3232 = &v5[istride * 16]; - const float32x2_t *v3241 = &v5[istride * 21]; - const float32x2_t *v3250 = &v5[istride * 2]; - const float32x2_t *v3259 = &v5[istride * 7]; - const float32x2_t *v3268 = &v5[istride * 12]; - const float32x2_t *v3277 = &v5[istride * 17]; - const float32x2_t *v3286 = &v5[istride * 22]; - const float32x2_t *v3295 = &v5[istride * 3]; - const float32x2_t *v3304 = &v5[istride * 8]; - const float32x2_t *v3313 = &v5[istride * 13]; - const float32x2_t *v3322 = &v5[istride * 18]; - const float32x2_t *v3331 = &v5[istride * 23]; - const float32x2_t *v3340 = &v5[istride * 4]; - const float32x2_t *v3349 = &v5[istride * 9]; - const float32x2_t *v3358 = &v5[istride * 14]; - const float32x2_t *v3367 = &v5[istride * 19]; - const 
float32x2_t *v3376 = &v5[istride * 24]; - int32_t *v3395 = &v6[ostride * 5]; - int32_t *v3404 = &v6[ostride * 10]; - int32_t *v3413 = &v6[ostride * 15]; - int32_t *v3422 = &v6[ostride * 20]; - int32_t *v3440 = &v6[ostride * 6]; - int32_t *v3449 = &v6[ostride * 11]; - int32_t *v3458 = &v6[ostride * 16]; - int32_t *v3467 = &v6[ostride * 21]; - int32_t *v3476 = &v6[ostride * 2]; - int32_t *v3485 = &v6[ostride * 7]; - int32_t *v3494 = &v6[ostride * 12]; - int32_t *v3503 = &v6[ostride * 17]; - int32_t *v3512 = &v6[ostride * 22]; - int32_t *v3521 = &v6[ostride * 3]; - int32_t *v3530 = &v6[ostride * 8]; - int32_t *v3539 = &v6[ostride * 13]; - int32_t *v3548 = &v6[ostride * 18]; - int32_t *v3557 = &v6[ostride * 23]; - int32_t *v3566 = &v6[ostride * 4]; - int32_t *v3575 = &v6[ostride * 9]; - int32_t *v3584 = &v6[ostride * 14]; - int32_t *v3593 = &v6[ostride * 19]; - int32_t *v3602 = &v6[ostride * 24]; - float32x4_t v3606 = vld1q_f32((const float32_t *)v3160); - float32x4_t v921 = vcombine_f32(v917, v917); - float32x4_t v1045 = vcombine_f32(v1043, v1043); - float32x4_t v1218 = vcombine_f32(v1216, v1216); - float32x4_t v1391 = vcombine_f32(v1389, v1389); - float32x4_t v1405 = vcombine_f32(v1403, v1403); - float32x4_t v1440 = vcombine_f32(v1438, v1438); - float32x4_t v1564 = vcombine_f32(v1562, v1562); - float32x4_t v1578 = vcombine_f32(v1576, v1576); - float32x4_t v1592 = vcombine_f32(v1590, v1590); - float32x4_t v1613 = vcombine_f32(v1611, v1611); - float32x4_t v1687 = vcombine_f32(v1685, v1685); - float32x4_t v3608 = vld1q_f32((const float32_t *)v3169); - float32x4_t v3610 = vld1q_f32((const float32_t *)v3178); - float32x4_t v3612 = vld1q_f32((const float32_t *)v3187); - float32x4_t v3614 = vld1q_f32((const float32_t *)v3196); - float32x4_t v3618 = vld1q_f32((const float32_t *)v3214); - float32x4_t v3620 = vld1q_f32((const float32_t *)v3223); - float32x4_t v3622 = vld1q_f32((const float32_t *)v3232); - float32x4_t v3624 = vld1q_f32((const float32_t *)v3241); - float32x4_t 
v3626 = vld1q_f32((const float32_t *)v3250); - float32x4_t v3628 = vld1q_f32((const float32_t *)v3259); - float32x4_t v3630 = vld1q_f32((const float32_t *)v3268); - float32x4_t v3632 = vld1q_f32((const float32_t *)v3277); - float32x4_t v3634 = vld1q_f32((const float32_t *)v3286); - float32x4_t v3636 = vld1q_f32((const float32_t *)v3295); - float32x4_t v3638 = vld1q_f32((const float32_t *)v3304); - float32x4_t v3640 = vld1q_f32((const float32_t *)v3313); - float32x4_t v3642 = vld1q_f32((const float32_t *)v3322); - float32x4_t v3644 = vld1q_f32((const float32_t *)v3331); - float32x4_t v3646 = vld1q_f32((const float32_t *)v3340); - float32x4_t v3648 = vld1q_f32((const float32_t *)v3349); - float32x4_t v3650 = vld1q_f32((const float32_t *)v3358); - float32x4_t v3652 = vld1q_f32((const float32_t *)v3367); - float32x4_t v3654 = vld1q_f32((const float32_t *)v3376); - float32x4_t v66 = vrev64q_f32(v3608); - float32x4_t v80 = vrev64q_f32(v3610); - float32x4_t v94 = vrev64q_f32(v3614); - float32x4_t v115 = vrev64q_f32(v3612); - float32x4_t v234 = vrev64q_f32(v3618); - float32x4_t v248 = vrev64q_f32(v3620); - float32x4_t v262 = vrev64q_f32(v3624); - float32x4_t v283 = vrev64q_f32(v3622); - float32x4_t v402 = vrev64q_f32(v3628); - float32x4_t v416 = vrev64q_f32(v3630); - float32x4_t v430 = vrev64q_f32(v3634); - float32x4_t v451 = vrev64q_f32(v3632); - float32x4_t v570 = vrev64q_f32(v3638); - float32x4_t v584 = vrev64q_f32(v3640); - float32x4_t v598 = vrev64q_f32(v3644); - float32x4_t v619 = vrev64q_f32(v3642); - float32x4_t v738 = vrev64q_f32(v3648); - float32x4_t v752 = vrev64q_f32(v3650); - float32x4_t v766 = vrev64q_f32(v3654); - float32x4_t v787 = vrev64q_f32(v3652); - float32x4_t v68 = vmulq_f32(v66, v921); - float32x4_t v82 = vmulq_f32(v80, v921); - float32x4_t v96 = vmulq_f32(v94, v921); - float32x4_t v117 = vmulq_f32(v115, v921); - float32x4_t v236 = vmulq_f32(v234, v921); - float32x4_t v250 = vmulq_f32(v248, v921); - float32x4_t v264 = vmulq_f32(v262, v921); - 
float32x4_t v285 = vmulq_f32(v283, v921); - float32x4_t v404 = vmulq_f32(v402, v921); - float32x4_t v418 = vmulq_f32(v416, v921); - float32x4_t v432 = vmulq_f32(v430, v921); - float32x4_t v453 = vmulq_f32(v451, v921); - float32x4_t v572 = vmulq_f32(v570, v921); - float32x4_t v586 = vmulq_f32(v584, v921); - float32x4_t v600 = vmulq_f32(v598, v921); - float32x4_t v621 = vmulq_f32(v619, v921); - float32x4_t v740 = vmulq_f32(v738, v921); - float32x4_t v754 = vmulq_f32(v752, v921); - float32x4_t v768 = vmulq_f32(v766, v921); - float32x4_t v789 = vmulq_f32(v787, v921); - float32x4_t v69 = vaddq_f32(v68, v3608); - float32x4_t v83 = vaddq_f32(v82, v3610); - float32x4_t v97 = vaddq_f32(v96, v3614); - float32x4_t v118 = vaddq_f32(v117, v3612); - float32x4_t v237 = vaddq_f32(v236, v3618); - float32x4_t v251 = vaddq_f32(v250, v3620); - float32x4_t v265 = vaddq_f32(v264, v3624); - float32x4_t v286 = vaddq_f32(v285, v3622); - float32x4_t v405 = vaddq_f32(v404, v3628); - float32x4_t v419 = vaddq_f32(v418, v3630); - float32x4_t v433 = vaddq_f32(v432, v3634); - float32x4_t v454 = vaddq_f32(v453, v3632); - float32x4_t v573 = vaddq_f32(v572, v3638); - float32x4_t v587 = vaddq_f32(v586, v3640); - float32x4_t v601 = vaddq_f32(v600, v3644); - float32x4_t v622 = vaddq_f32(v621, v3642); - float32x4_t v741 = vaddq_f32(v740, v3648); - float32x4_t v755 = vaddq_f32(v754, v3650); - float32x4_t v769 = vaddq_f32(v768, v3654); - float32x4_t v790 = vaddq_f32(v789, v3652); - float32x4_t v98 = vsubq_f32(v69, v97); - float32x4_t v103 = vmulq_f32(v69, v1715); - float32x4_t v119 = vsubq_f32(v83, v118); - float32x4_t v124 = vmulq_f32(v83, v1715); - float32x4_t v266 = vsubq_f32(v237, v265); - float32x4_t v271 = vmulq_f32(v237, v1715); - float32x4_t v287 = vsubq_f32(v251, v286); - float32x4_t v292 = vmulq_f32(v251, v1715); - float32x4_t v434 = vsubq_f32(v405, v433); - float32x4_t v439 = vmulq_f32(v405, v1715); - float32x4_t v455 = vsubq_f32(v419, v454); - float32x4_t v460 = vmulq_f32(v419, v1715); - 
float32x4_t v602 = vsubq_f32(v573, v601); - float32x4_t v607 = vmulq_f32(v573, v1715); - float32x4_t v623 = vsubq_f32(v587, v622); - float32x4_t v628 = vmulq_f32(v587, v1715); - float32x4_t v770 = vsubq_f32(v741, v769); - float32x4_t v775 = vmulq_f32(v741, v1715); - float32x4_t v791 = vsubq_f32(v755, v790); - float32x4_t v796 = vmulq_f32(v755, v1715); - float32x4_t v104 = vsubq_f32(v103, v98); - float32x4_t v125 = vsubq_f32(v124, v119); - float32x4_t v138 = vmulq_f32(v119, v1652); - float32x4_t v156 = vmulq_f32(v98, v1652); - float32x4_t v272 = vsubq_f32(v271, v266); - float32x4_t v293 = vsubq_f32(v292, v287); - float32x4_t v306 = vmulq_f32(v287, v1652); - float32x4_t v324 = vmulq_f32(v266, v1652); - float32x4_t v440 = vsubq_f32(v439, v434); - float32x4_t v461 = vsubq_f32(v460, v455); - float32x4_t v474 = vmulq_f32(v455, v1652); - float32x4_t v492 = vmulq_f32(v434, v1652); - float32x4_t v608 = vsubq_f32(v607, v602); - float32x4_t v629 = vsubq_f32(v628, v623); - float32x4_t v642 = vmulq_f32(v623, v1652); - float32x4_t v660 = vmulq_f32(v602, v1652); - float32x4_t v776 = vsubq_f32(v775, v770); - float32x4_t v797 = vsubq_f32(v796, v791); - float32x4_t v810 = vmulq_f32(v791, v1652); - float32x4_t v828 = vmulq_f32(v770, v1652); - float32x4_t v126 = vaddq_f32(v104, v125); - float32x4_t v127 = vsubq_f32(v104, v125); - float32x4_t v139 = vaddq_f32(v98, v138); - float32x4_t v157 = vsubq_f32(v156, v119); - float32x4_t v294 = vaddq_f32(v272, v293); - float32x4_t v295 = vsubq_f32(v272, v293); - float32x4_t v307 = vaddq_f32(v266, v306); - float32x4_t v325 = vsubq_f32(v324, v287); - float32x4_t v462 = vaddq_f32(v440, v461); - float32x4_t v463 = vsubq_f32(v440, v461); - float32x4_t v475 = vaddq_f32(v434, v474); - float32x4_t v493 = vsubq_f32(v492, v455); - float32x4_t v630 = vaddq_f32(v608, v629); - float32x4_t v631 = vsubq_f32(v608, v629); - float32x4_t v643 = vaddq_f32(v602, v642); - float32x4_t v661 = vsubq_f32(v660, v623); - float32x4_t v798 = vaddq_f32(v776, v797); - 
float32x4_t v799 = vsubq_f32(v776, v797); - float32x4_t v811 = vaddq_f32(v770, v810); - float32x4_t v829 = vsubq_f32(v828, v791); - float32x4_t v132 = vmulq_f32(v126, v1628); - float32x4_t v144 = vmulq_f32(v127, v1640); - float32x4_t v158 = vaddq_f32(v3606, v126); - float32x4_t v164 = vrev64q_f32(v139); - float32x4_t v173 = vrev64q_f32(v157); - float32x4_t v300 = vmulq_f32(v294, v1628); - float32x4_t v312 = vmulq_f32(v295, v1640); - float32x4_t v326 = vaddq_f32(v3616, v294); - float32x4_t v332 = vrev64q_f32(v307); - float32x4_t v341 = vrev64q_f32(v325); - float32x4_t v468 = vmulq_f32(v462, v1628); - float32x4_t v480 = vmulq_f32(v463, v1640); - float32x4_t v494 = vaddq_f32(v3626, v462); - float32x4_t v500 = vrev64q_f32(v475); - float32x4_t v509 = vrev64q_f32(v493); - float32x4_t v636 = vmulq_f32(v630, v1628); - float32x4_t v648 = vmulq_f32(v631, v1640); - float32x4_t v662 = vaddq_f32(v3636, v630); - float32x4_t v668 = vrev64q_f32(v643); - float32x4_t v677 = vrev64q_f32(v661); - float32x4_t v804 = vmulq_f32(v798, v1628); - float32x4_t v816 = vmulq_f32(v799, v1640); - float32x4_t v830 = vaddq_f32(v3646, v798); - float32x4_t v836 = vrev64q_f32(v811); - float32x4_t v845 = vrev64q_f32(v829); - float32x4_t v133 = vsubq_f32(v3606, v132); - float32x4_t v166 = vmulq_f32(v164, v1687); - float32x4_t v175 = vmulq_f32(v173, v1687); - float32x4_t v301 = vsubq_f32(v3616, v300); - float32x4_t v334 = vmulq_f32(v332, v1687); - float32x4_t v343 = vmulq_f32(v341, v1687); - float32x4_t v469 = vsubq_f32(v3626, v468); - float32x4_t v502 = vmulq_f32(v500, v1687); - float32x4_t v511 = vmulq_f32(v509, v1687); - float32x4_t v637 = vsubq_f32(v3636, v636); - float32x4_t v670 = vmulq_f32(v668, v1687); - float32x4_t v679 = vmulq_f32(v677, v1687); - float32x4_t v805 = vsubq_f32(v3646, v804); - float32x4_t v838 = vmulq_f32(v836, v1687); - float32x4_t v847 = vmulq_f32(v845, v1687); - float32x4_t v871 = vrev64q_f32(v326); - float32x4_t v885 = vrev64q_f32(v494); - float32x4_t v899 = vrev64q_f32(v830); 
- float32x4_t v920 = vrev64q_f32(v662); - float32x4_t v145 = vsubq_f32(v133, v144); - float32x4_t v150 = vmulq_f32(v133, v1715); - float32x4_t v313 = vsubq_f32(v301, v312); - float32x4_t v318 = vmulq_f32(v301, v1715); - float32x4_t v481 = vsubq_f32(v469, v480); - float32x4_t v486 = vmulq_f32(v469, v1715); - float32x4_t v649 = vsubq_f32(v637, v648); - float32x4_t v654 = vmulq_f32(v637, v1715); - float32x4_t v817 = vsubq_f32(v805, v816); - float32x4_t v822 = vmulq_f32(v805, v1715); - float32x4_t v873 = vmulq_f32(v871, v921); - float32x4_t v887 = vmulq_f32(v885, v921); - float32x4_t v901 = vmulq_f32(v899, v921); - float32x4_t v922 = vmulq_f32(v920, v921); - float32x4_t v151 = vsubq_f32(v150, v145); - float32x4_t v176 = vsubq_f32(v145, v175); - float32x4_t v181 = vmulq_f32(v145, v1715); - float32x4_t v319 = vsubq_f32(v318, v313); - float32x4_t v344 = vsubq_f32(v313, v343); - float32x4_t v349 = vmulq_f32(v313, v1715); - float32x4_t v487 = vsubq_f32(v486, v481); - float32x4_t v512 = vsubq_f32(v481, v511); - float32x4_t v517 = vmulq_f32(v481, v1715); - float32x4_t v655 = vsubq_f32(v654, v649); - float32x4_t v680 = vsubq_f32(v649, v679); - float32x4_t v685 = vmulq_f32(v649, v1715); - float32x4_t v823 = vsubq_f32(v822, v817); - float32x4_t v848 = vsubq_f32(v817, v847); - float32x4_t v853 = vmulq_f32(v817, v1715); - float32x4_t v874 = vaddq_f32(v873, v326); - float32x4_t v888 = vaddq_f32(v887, v494); - float32x4_t v902 = vaddq_f32(v901, v830); - float32x4_t v923 = vaddq_f32(v922, v662); - float32x4_t v167 = vsubq_f32(v151, v166); - float32x4_t v182 = vsubq_f32(v181, v176); - float32x4_t v187 = vmulq_f32(v151, v1715); - float32x4_t v335 = vsubq_f32(v319, v334); - float32x4_t v350 = vsubq_f32(v349, v344); - float32x4_t v355 = vmulq_f32(v319, v1715); - float32x4_t v503 = vsubq_f32(v487, v502); - float32x4_t v518 = vsubq_f32(v517, v512); - float32x4_t v523 = vmulq_f32(v487, v1715); - float32x4_t v671 = vsubq_f32(v655, v670); - float32x4_t v686 = vsubq_f32(v685, v680); - 
float32x4_t v691 = vmulq_f32(v655, v1715); - float32x4_t v839 = vsubq_f32(v823, v838); - float32x4_t v854 = vsubq_f32(v853, v848); - float32x4_t v859 = vmulq_f32(v823, v1715); - float32x4_t v903 = vsubq_f32(v874, v902); - float32x4_t v908 = vmulq_f32(v874, v1715); - float32x4_t v924 = vsubq_f32(v888, v923); - float32x4_t v929 = vmulq_f32(v888, v1715); - float32x4_t v1217 = vrev64q_f32(v344); - float32x4_t v1231 = vrev64q_f32(v512); - float32x4_t v1245 = vrev64q_f32(v848); - float32x4_t v1266 = vrev64q_f32(v680); - float32x4_t v188 = vsubq_f32(v187, v167); - float32x4_t v356 = vsubq_f32(v355, v335); - float32x4_t v524 = vsubq_f32(v523, v503); - float32x4_t v692 = vsubq_f32(v691, v671); - float32x4_t v860 = vsubq_f32(v859, v839); - float32x4_t v909 = vsubq_f32(v908, v903); - float32x4_t v930 = vsubq_f32(v929, v924); - float32x4_t v943 = vmulq_f32(v924, v1652); - float32x4_t v961 = vmulq_f32(v903, v1652); - float32x4_t v1044 = vrev64q_f32(v335); - float32x4_t v1058 = vrev64q_f32(v503); - float32x4_t v1072 = vrev64q_f32(v839); - float32x4_t v1093 = vrev64q_f32(v671); - float32x4_t v1219 = vmulq_f32(v1217, v1218); - float32x4_t v1233 = vmulq_f32(v1231, v1564); - float32x4_t v1247 = vmulq_f32(v1245, v1578); - float32x4_t v1268 = vmulq_f32(v1266, v1405); - float32x4_t v1390 = vrev64q_f32(v350); - float32x4_t v1404 = vrev64q_f32(v518); - float32x4_t v1418 = vrev64q_f32(v854); - float32x4_t v1439 = vrev64q_f32(v686); - float32x4_t v931 = vaddq_f32(v909, v930); - float32x4_t v932 = vsubq_f32(v909, v930); - float32x4_t v944 = vaddq_f32(v903, v943); - float32x4_t v962 = vsubq_f32(v961, v924); - float32x4_t v1046 = vmulq_f32(v1044, v1045); - float32x4_t v1060 = vmulq_f32(v1058, v1218); - float32x4_t v1074 = vmulq_f32(v1072, v1564); - float32x4_t v1095 = vmulq_f32(v1093, v1391); - float32x4_t v1220 = vfmaq_f32(v1219, v344, v1210); - float32x4_t v1234 = vfmaq_f32(v1233, v512, v1556); - float32x4_t v1248 = vfmaq_f32(v1247, v848, v1570); - float32x4_t v1269 = vfmaq_f32(v1268, v680, 
v1397); - float32x4_t v1392 = vmulq_f32(v1390, v1391); - float32x4_t v1406 = vmulq_f32(v1404, v1405); - float32x4_t v1420 = vmulq_f32(v1418, v1613); - float32x4_t v1441 = vmulq_f32(v1439, v1440); - float32x4_t v1563 = vrev64q_f32(v356); - float32x4_t v1577 = vrev64q_f32(v524); - float32x4_t v1591 = vrev64q_f32(v860); - float32x4_t v1612 = vrev64q_f32(v692); - float32x4_t v937 = vmulq_f32(v931, v1628); - float32x4_t v949 = vmulq_f32(v932, v1640); - float32x4_t v963 = vaddq_f32(v158, v931); - float32x4_t v977 = vrev64q_f32(v944); - float32x4_t v994 = vrev64q_f32(v962); - float32x4_t v1047 = vfmaq_f32(v1046, v335, v1037); - float32x4_t v1061 = vfmaq_f32(v1060, v503, v1210); - float32x4_t v1075 = vfmaq_f32(v1074, v839, v1556); - float32x4_t v1096 = vfmaq_f32(v1095, v671, v1383); - float32x4_t v1249 = vsubq_f32(v1220, v1248); - float32x4_t v1254 = vmulq_f32(v1220, v1715); - float32x4_t v1270 = vsubq_f32(v1234, v1269); - float32x4_t v1275 = vmulq_f32(v1234, v1715); - float32x4_t v1393 = vfmaq_f32(v1392, v350, v1383); - float32x4_t v1407 = vfmaq_f32(v1406, v518, v1397); - float32x4_t v1421 = vfmaq_f32(v1420, v854, v1605); - float32x4_t v1442 = vfmaq_f32(v1441, v686, v1584); - float32x4_t v1565 = vmulq_f32(v1563, v1564); - float32x4_t v1579 = vmulq_f32(v1577, v1578); - float32x4_t v1593 = vmulq_f32(v1591, v1592); - float32x4_t v1614 = vmulq_f32(v1612, v1613); - float32x4_t v938 = vsubq_f32(v158, v937); - int16x4_t v966 = vqmovn_s32(vcvtq_n_s32_f32(v963, 15)); - float32x4_t v979 = vmulq_f32(v977, v1687); - float32x4_t v996 = vmulq_f32(v994, v1687); - float32x4_t v1076 = vsubq_f32(v1047, v1075); - float32x4_t v1081 = vmulq_f32(v1047, v1715); - float32x4_t v1097 = vsubq_f32(v1061, v1096); - float32x4_t v1102 = vmulq_f32(v1061, v1715); - float32x4_t v1255 = vsubq_f32(v1254, v1249); - float32x4_t v1276 = vsubq_f32(v1275, v1270); - float32x4_t v1289 = vmulq_f32(v1270, v1652); - float32x4_t v1307 = vmulq_f32(v1249, v1652); - float32x4_t v1422 = vsubq_f32(v1393, v1421); - 
float32x4_t v1427 = vmulq_f32(v1393, v1715); - float32x4_t v1443 = vsubq_f32(v1407, v1442); - float32x4_t v1448 = vmulq_f32(v1407, v1715); - float32x4_t v1566 = vfmaq_f32(v1565, v356, v1556); - float32x4_t v1580 = vfmaq_f32(v1579, v524, v1570); - float32x4_t v1594 = vfmaq_f32(v1593, v860, v1584); - float32x4_t v1615 = vfmaq_f32(v1614, v692, v1605); - float32x4_t v950 = vsubq_f32(v938, v949); - float32x4_t v955 = vmulq_f32(v938, v1715); - float32x4_t v1082 = vsubq_f32(v1081, v1076); - float32x4_t v1103 = vsubq_f32(v1102, v1097); - float32x4_t v1116 = vmulq_f32(v1097, v1652); - float32x4_t v1134 = vmulq_f32(v1076, v1652); - float32x4_t v1277 = vaddq_f32(v1255, v1276); - float32x4_t v1278 = vsubq_f32(v1255, v1276); - float32x4_t v1290 = vaddq_f32(v1249, v1289); - float32x4_t v1308 = vsubq_f32(v1307, v1270); - float32x4_t v1428 = vsubq_f32(v1427, v1422); - float32x4_t v1449 = vsubq_f32(v1448, v1443); - float32x4_t v1462 = vmulq_f32(v1443, v1652); - float32x4_t v1480 = vmulq_f32(v1422, v1652); - float32x4_t v1595 = vsubq_f32(v1566, v1594); - float32x4_t v1600 = vmulq_f32(v1566, v1715); - float32x4_t v1616 = vsubq_f32(v1580, v1615); - float32x4_t v1621 = vmulq_f32(v1580, v1715); - vst1_s16((int16_t *)v3386, v966); - float32x4_t v956 = vsubq_f32(v955, v950); - float32x4_t v997 = vsubq_f32(v950, v996); - float32x4_t v1010 = vmulq_f32(v950, v1715); - float32x4_t v1104 = vaddq_f32(v1082, v1103); - float32x4_t v1105 = vsubq_f32(v1082, v1103); - float32x4_t v1117 = vaddq_f32(v1076, v1116); - float32x4_t v1135 = vsubq_f32(v1134, v1097); - float32x4_t v1283 = vmulq_f32(v1277, v1628); - float32x4_t v1295 = vmulq_f32(v1278, v1640); - float32x4_t v1309 = vaddq_f32(v176, v1277); - float32x4_t v1323 = vrev64q_f32(v1290); - float32x4_t v1340 = vrev64q_f32(v1308); - float32x4_t v1450 = vaddq_f32(v1428, v1449); - float32x4_t v1451 = vsubq_f32(v1428, v1449); - float32x4_t v1463 = vaddq_f32(v1422, v1462); - float32x4_t v1481 = vsubq_f32(v1480, v1443); - float32x4_t v1601 = 
vsubq_f32(v1600, v1595); - float32x4_t v1622 = vsubq_f32(v1621, v1616); - float32x4_t v1635 = vmulq_f32(v1616, v1652); - float32x4_t v1653 = vmulq_f32(v1595, v1652); - float32x4_t v980 = vsubq_f32(v956, v979); - int16x4_t v1000 = vqmovn_s32(vcvtq_n_s32_f32(v997, 15)); - float32x4_t v1011 = vsubq_f32(v1010, v997); - float32x4_t v1024 = vmulq_f32(v956, v1715); - float32x4_t v1110 = vmulq_f32(v1104, v1628); - float32x4_t v1122 = vmulq_f32(v1105, v1640); - float32x4_t v1136 = vaddq_f32(v167, v1104); - float32x4_t v1150 = vrev64q_f32(v1117); - float32x4_t v1167 = vrev64q_f32(v1135); - float32x4_t v1284 = vsubq_f32(v176, v1283); - int16x4_t v1312 = vqmovn_s32(vcvtq_n_s32_f32(v1309, 15)); - float32x4_t v1325 = vmulq_f32(v1323, v1687); - float32x4_t v1342 = vmulq_f32(v1340, v1687); - float32x4_t v1456 = vmulq_f32(v1450, v1628); - float32x4_t v1468 = vmulq_f32(v1451, v1640); - float32x4_t v1482 = vaddq_f32(v182, v1450); - float32x4_t v1496 = vrev64q_f32(v1463); - float32x4_t v1513 = vrev64q_f32(v1481); - float32x4_t v1623 = vaddq_f32(v1601, v1622); - float32x4_t v1624 = vsubq_f32(v1601, v1622); - float32x4_t v1636 = vaddq_f32(v1595, v1635); - float32x4_t v1654 = vsubq_f32(v1653, v1616); - int16x4_t v983 = vqmovn_s32(vcvtq_n_s32_f32(v980, 15)); - int16x4_t v1014 = vqmovn_s32(vcvtq_n_s32_f32(v1011, 15)); - float32x4_t v1025 = vsubq_f32(v1024, v980); - float32x4_t v1111 = vsubq_f32(v167, v1110); - int16x4_t v1139 = vqmovn_s32(vcvtq_n_s32_f32(v1136, 15)); - float32x4_t v1152 = vmulq_f32(v1150, v1687); - float32x4_t v1169 = vmulq_f32(v1167, v1687); - float32x4_t v1296 = vsubq_f32(v1284, v1295); - float32x4_t v1301 = vmulq_f32(v1284, v1715); - float32x4_t v1457 = vsubq_f32(v182, v1456); - int16x4_t v1485 = vqmovn_s32(vcvtq_n_s32_f32(v1482, 15)); - float32x4_t v1498 = vmulq_f32(v1496, v1687); - float32x4_t v1515 = vmulq_f32(v1513, v1687); - float32x4_t v1629 = vmulq_f32(v1623, v1628); - float32x4_t v1641 = vmulq_f32(v1624, v1640); - float32x4_t v1655 = vaddq_f32(v188, v1623); - 
float32x4_t v1669 = vrev64q_f32(v1636); - float32x4_t v1686 = vrev64q_f32(v1654); - vst1_s16((int16_t *)v3404, v1000); - vst1_s16((int16_t *)v3476, v1312); - int16x4_t v1028 = vqmovn_s32(vcvtq_n_s32_f32(v1025, 15)); - float32x4_t v1123 = vsubq_f32(v1111, v1122); - float32x4_t v1128 = vmulq_f32(v1111, v1715); - float32x4_t v1302 = vsubq_f32(v1301, v1296); - float32x4_t v1343 = vsubq_f32(v1296, v1342); - float32x4_t v1356 = vmulq_f32(v1296, v1715); - float32x4_t v1469 = vsubq_f32(v1457, v1468); - float32x4_t v1474 = vmulq_f32(v1457, v1715); - float32x4_t v1630 = vsubq_f32(v188, v1629); - int16x4_t v1658 = vqmovn_s32(vcvtq_n_s32_f32(v1655, 15)); - float32x4_t v1671 = vmulq_f32(v1669, v1687); - float32x4_t v1688 = vmulq_f32(v1686, v1687); - vst1_s16((int16_t *)v3395, v983); - vst1_s16((int16_t *)v3413, v1014); - vst1_s16((int16_t *)v3431, v1139); - vst1_s16((int16_t *)v3521, v1485); - float32x4_t v1129 = vsubq_f32(v1128, v1123); - float32x4_t v1170 = vsubq_f32(v1123, v1169); - float32x4_t v1183 = vmulq_f32(v1123, v1715); - float32x4_t v1326 = vsubq_f32(v1302, v1325); - int16x4_t v1346 = vqmovn_s32(vcvtq_n_s32_f32(v1343, 15)); - float32x4_t v1357 = vsubq_f32(v1356, v1343); - float32x4_t v1370 = vmulq_f32(v1302, v1715); - float32x4_t v1475 = vsubq_f32(v1474, v1469); - float32x4_t v1516 = vsubq_f32(v1469, v1515); - float32x4_t v1529 = vmulq_f32(v1469, v1715); - float32x4_t v1642 = vsubq_f32(v1630, v1641); - float32x4_t v1647 = vmulq_f32(v1630, v1715); - vst1_s16((int16_t *)v3422, v1028); - vst1_s16((int16_t *)v3566, v1658); - float32x4_t v1153 = vsubq_f32(v1129, v1152); - int16x4_t v1173 = vqmovn_s32(vcvtq_n_s32_f32(v1170, 15)); - float32x4_t v1184 = vsubq_f32(v1183, v1170); - float32x4_t v1197 = vmulq_f32(v1129, v1715); - int16x4_t v1329 = vqmovn_s32(vcvtq_n_s32_f32(v1326, 15)); - int16x4_t v1360 = vqmovn_s32(vcvtq_n_s32_f32(v1357, 15)); - float32x4_t v1371 = vsubq_f32(v1370, v1326); - float32x4_t v1499 = vsubq_f32(v1475, v1498); - int16x4_t v1519 = 
vqmovn_s32(vcvtq_n_s32_f32(v1516, 15)); - float32x4_t v1530 = vsubq_f32(v1529, v1516); - float32x4_t v1543 = vmulq_f32(v1475, v1715); - float32x4_t v1648 = vsubq_f32(v1647, v1642); - float32x4_t v1689 = vsubq_f32(v1642, v1688); - float32x4_t v1702 = vmulq_f32(v1642, v1715); - vst1_s16((int16_t *)v3494, v1346); - int16x4_t v1156 = vqmovn_s32(vcvtq_n_s32_f32(v1153, 15)); - int16x4_t v1187 = vqmovn_s32(vcvtq_n_s32_f32(v1184, 15)); - float32x4_t v1198 = vsubq_f32(v1197, v1153); - int16x4_t v1374 = vqmovn_s32(vcvtq_n_s32_f32(v1371, 15)); - int16x4_t v1502 = vqmovn_s32(vcvtq_n_s32_f32(v1499, 15)); - int16x4_t v1533 = vqmovn_s32(vcvtq_n_s32_f32(v1530, 15)); - float32x4_t v1544 = vsubq_f32(v1543, v1499); - float32x4_t v1672 = vsubq_f32(v1648, v1671); - int16x4_t v1692 = vqmovn_s32(vcvtq_n_s32_f32(v1689, 15)); - float32x4_t v1703 = vsubq_f32(v1702, v1689); - float32x4_t v1716 = vmulq_f32(v1648, v1715); - vst1_s16((int16_t *)v3449, v1173); - vst1_s16((int16_t *)v3485, v1329); - vst1_s16((int16_t *)v3503, v1360); - vst1_s16((int16_t *)v3539, v1519); - int16x4_t v1201 = vqmovn_s32(vcvtq_n_s32_f32(v1198, 15)); - int16x4_t v1547 = vqmovn_s32(vcvtq_n_s32_f32(v1544, 15)); - int16x4_t v1675 = vqmovn_s32(vcvtq_n_s32_f32(v1672, 15)); - int16x4_t v1706 = vqmovn_s32(vcvtq_n_s32_f32(v1703, 15)); - float32x4_t v1717 = vsubq_f32(v1716, v1672); - vst1_s16((int16_t *)v3440, v1156); - vst1_s16((int16_t *)v3458, v1187); - vst1_s16((int16_t *)v3512, v1374); - vst1_s16((int16_t *)v3530, v1502); - vst1_s16((int16_t *)v3548, v1533); - vst1_s16((int16_t *)v3584, v1692); - int16x4_t v1720 = vqmovn_s32(vcvtq_n_s32_f32(v1717, 15)); - vst1_s16((int16_t *)v3467, v1201); - vst1_s16((int16_t *)v3557, v1547); - vst1_s16((int16_t *)v3575, v1675); - vst1_s16((int16_t *)v3593, v1706); - vst1_s16((int16_t *)v3602, v1720); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v1726 * 2; j < howmany; j += 1) { - float32x2_t v1877 = v5[istride]; - float v2476 = 0.0000000000000000e+00F; - float v2574 = 
9.6858316112863108e-01F; - float v2577 = -2.4868988716485479e-01F; - float v2578 = 2.4868988716485479e-01F; - float v2718 = 8.7630668004386358e-01F; - float v2721 = -4.8175367410171532e-01F; - float v2722 = 4.8175367410171532e-01F; - float v2862 = 7.2896862742141155e-01F; - float v2865 = -6.8454710592868862e-01F; - float v2866 = 6.8454710592868862e-01F; - float v2874 = 6.2790519529313527e-02F; - float v2877 = -9.9802672842827156e-01F; - float v2878 = 9.9802672842827156e-01F; - float v3006 = 5.3582679497899655e-01F; - float v3009 = -8.4432792550201508e-01F; - float v3010 = 8.4432792550201508e-01F; - float v3018 = -4.2577929156507272e-01F; - float v3021 = -9.0482705246601947e-01F; - float v3022 = 9.0482705246601947e-01F; - float v3030 = -6.3742398974868952e-01F; - float v3033 = 7.7051324277578936e-01F; - float v3034 = -7.7051324277578936e-01F; - float v3048 = -9.9211470131447776e-01F; - float v3051 = -1.2533323356430454e-01F; - float v3052 = 1.2533323356430454e-01F; - float v3068 = 2.5000000000000000e-01F; - float v3078 = 5.5901699437494745e-01F; - float v3088 = 6.1803398874989490e-01F; - float v3113 = 9.5105651629515353e-01F; - float v3114 = -9.5105651629515353e-01F; - float32x2_t v3116 = (float32x2_t){v4, v4}; - float v3139 = 2.0000000000000000e+00F; - float32x2_t v1738 = v5[0]; - float v2479 = dir * v2476; - float32x2_t v2575 = (float32x2_t){v2574, v2574}; - float32x2_t v2579 = (float32x2_t){v2577, v2578}; - float32x2_t v2719 = (float32x2_t){v2718, v2718}; - float32x2_t v2723 = (float32x2_t){v2721, v2722}; - float32x2_t v2863 = (float32x2_t){v2862, v2862}; - float32x2_t v2867 = (float32x2_t){v2865, v2866}; - float32x2_t v2875 = (float32x2_t){v2874, v2874}; - float32x2_t v2879 = (float32x2_t){v2877, v2878}; - float32x2_t v2909 = (float32x2_t){v3034, v3033}; - float32x2_t v3007 = (float32x2_t){v3006, v3006}; - float32x2_t v3011 = (float32x2_t){v3009, v3010}; - float32x2_t v3019 = (float32x2_t){v3018, v3018}; - float32x2_t v3023 = (float32x2_t){v3021, v3022}; - 
float32x2_t v3031 = (float32x2_t){v3030, v3030}; - float32x2_t v3035 = (float32x2_t){v3033, v3034}; - float32x2_t v3049 = (float32x2_t){v3048, v3048}; - float32x2_t v3053 = (float32x2_t){v3051, v3052}; - float32x2_t v3069 = (float32x2_t){v3068, v3068}; - float32x2_t v3079 = (float32x2_t){v3078, v3078}; - float32x2_t v3089 = (float32x2_t){v3088, v3088}; - float32x2_t v3115 = (float32x2_t){v3113, v3114}; - float32x2_t v3140 = (float32x2_t){v3139, v3139}; - float32x2_t v1743 = v5[istride * 5]; - float32x2_t v1748 = v5[istride * 10]; - float32x2_t v1753 = v5[istride * 15]; - float32x2_t v1758 = v5[istride * 20]; - float32x2_t v1882 = v5[istride * 6]; - float32x2_t v1887 = v5[istride * 11]; - float32x2_t v1892 = v5[istride * 16]; - float32x2_t v1897 = v5[istride * 21]; - float32x2_t v2016 = v5[istride * 2]; - float32x2_t v2021 = v5[istride * 7]; - float32x2_t v2026 = v5[istride * 12]; - float32x2_t v2031 = v5[istride * 17]; - float32x2_t v2036 = v5[istride * 22]; - float32x2_t v2155 = v5[istride * 3]; - float32x2_t v2160 = v5[istride * 8]; - float32x2_t v2165 = v5[istride * 13]; - float32x2_t v2170 = v5[istride * 18]; - float32x2_t v2175 = v5[istride * 23]; - float32x2_t v2294 = v5[istride * 4]; - float32x2_t v2299 = v5[istride * 9]; - float32x2_t v2304 = v5[istride * 14]; - float32x2_t v2309 = v5[istride * 19]; - float32x2_t v2314 = v5[istride * 24]; - float32x2_t v2477 = (float32x2_t){v2476, v2479}; - float32x2_t v2581 = vmul_f32(v3116, v2579); - float32x2_t v2725 = vmul_f32(v3116, v2723); - float32x2_t v2869 = vmul_f32(v3116, v2867); - float32x2_t v2881 = vmul_f32(v3116, v2879); - float32x2_t v2911 = vmul_f32(v3116, v2909); - float32x2_t v3013 = vmul_f32(v3116, v3011); - float32x2_t v3025 = vmul_f32(v3116, v3023); - float32x2_t v3037 = vmul_f32(v3116, v3035); - float32x2_t v3055 = vmul_f32(v3116, v3053); - float32x2_t v3117 = vmul_f32(v3116, v3115); - float32x2_t v1768 = vrev64_f32(v1743); - float32x2_t v1780 = vrev64_f32(v1748); - float32x2_t v1792 = 
vrev64_f32(v1758); - float32x2_t v1810 = vrev64_f32(v1753); - float32x2_t v1907 = vrev64_f32(v1882); - float32x2_t v1919 = vrev64_f32(v1887); - float32x2_t v1931 = vrev64_f32(v1897); - float32x2_t v1949 = vrev64_f32(v1892); - float32x2_t v2046 = vrev64_f32(v2021); - float32x2_t v2058 = vrev64_f32(v2026); - float32x2_t v2070 = vrev64_f32(v2036); - float32x2_t v2088 = vrev64_f32(v2031); - float32x2_t v2185 = vrev64_f32(v2160); - float32x2_t v2197 = vrev64_f32(v2165); - float32x2_t v2209 = vrev64_f32(v2175); - float32x2_t v2227 = vrev64_f32(v2170); - float32x2_t v2324 = vrev64_f32(v2299); - float32x2_t v2336 = vrev64_f32(v2304); - float32x2_t v2348 = vrev64_f32(v2314); - float32x2_t v2366 = vrev64_f32(v2309); - float32x2_t v1769 = vmul_f32(v1768, v2477); - float32x2_t v1781 = vmul_f32(v1780, v2477); - float32x2_t v1793 = vmul_f32(v1792, v2477); - float32x2_t v1811 = vmul_f32(v1810, v2477); - float32x2_t v1908 = vmul_f32(v1907, v2477); - float32x2_t v1920 = vmul_f32(v1919, v2477); - float32x2_t v1932 = vmul_f32(v1931, v2477); - float32x2_t v1950 = vmul_f32(v1949, v2477); - float32x2_t v2047 = vmul_f32(v2046, v2477); - float32x2_t v2059 = vmul_f32(v2058, v2477); - float32x2_t v2071 = vmul_f32(v2070, v2477); - float32x2_t v2089 = vmul_f32(v2088, v2477); - float32x2_t v2186 = vmul_f32(v2185, v2477); - float32x2_t v2198 = vmul_f32(v2197, v2477); - float32x2_t v2210 = vmul_f32(v2209, v2477); - float32x2_t v2228 = vmul_f32(v2227, v2477); - float32x2_t v2325 = vmul_f32(v2324, v2477); - float32x2_t v2337 = vmul_f32(v2336, v2477); - float32x2_t v2349 = vmul_f32(v2348, v2477); - float32x2_t v2367 = vmul_f32(v2366, v2477); - float32x2_t v1770 = vadd_f32(v1769, v1743); - float32x2_t v1782 = vadd_f32(v1781, v1748); - float32x2_t v1794 = vadd_f32(v1793, v1758); - float32x2_t v1812 = vadd_f32(v1811, v1753); - float32x2_t v1909 = vadd_f32(v1908, v1882); - float32x2_t v1921 = vadd_f32(v1920, v1887); - float32x2_t v1933 = vadd_f32(v1932, v1897); - float32x2_t v1951 = vadd_f32(v1950, 
v1892); - float32x2_t v2048 = vadd_f32(v2047, v2021); - float32x2_t v2060 = vadd_f32(v2059, v2026); - float32x2_t v2072 = vadd_f32(v2071, v2036); - float32x2_t v2090 = vadd_f32(v2089, v2031); - float32x2_t v2187 = vadd_f32(v2186, v2160); - float32x2_t v2199 = vadd_f32(v2198, v2165); - float32x2_t v2211 = vadd_f32(v2210, v2175); - float32x2_t v2229 = vadd_f32(v2228, v2170); - float32x2_t v2326 = vadd_f32(v2325, v2299); - float32x2_t v2338 = vadd_f32(v2337, v2304); - float32x2_t v2350 = vadd_f32(v2349, v2314); - float32x2_t v2368 = vadd_f32(v2367, v2309); - float32x2_t v1795 = vsub_f32(v1770, v1794); - float32x2_t v1799 = vmul_f32(v1770, v3140); - float32x2_t v1813 = vsub_f32(v1782, v1812); - float32x2_t v1817 = vmul_f32(v1782, v3140); - float32x2_t v1934 = vsub_f32(v1909, v1933); - float32x2_t v1938 = vmul_f32(v1909, v3140); - float32x2_t v1952 = vsub_f32(v1921, v1951); - float32x2_t v1956 = vmul_f32(v1921, v3140); - float32x2_t v2073 = vsub_f32(v2048, v2072); - float32x2_t v2077 = vmul_f32(v2048, v3140); - float32x2_t v2091 = vsub_f32(v2060, v2090); - float32x2_t v2095 = vmul_f32(v2060, v3140); - float32x2_t v2212 = vsub_f32(v2187, v2211); - float32x2_t v2216 = vmul_f32(v2187, v3140); - float32x2_t v2230 = vsub_f32(v2199, v2229); - float32x2_t v2234 = vmul_f32(v2199, v3140); - float32x2_t v2351 = vsub_f32(v2326, v2350); - float32x2_t v2355 = vmul_f32(v2326, v3140); - float32x2_t v2369 = vsub_f32(v2338, v2368); - float32x2_t v2373 = vmul_f32(v2338, v3140); - float32x2_t v1800 = vsub_f32(v1799, v1795); - float32x2_t v1818 = vsub_f32(v1817, v1813); - float32x2_t v1829 = vmul_f32(v1813, v3089); - float32x2_t v1844 = vmul_f32(v1795, v3089); - float32x2_t v1939 = vsub_f32(v1938, v1934); - float32x2_t v1957 = vsub_f32(v1956, v1952); - float32x2_t v1968 = vmul_f32(v1952, v3089); - float32x2_t v1983 = vmul_f32(v1934, v3089); - float32x2_t v2078 = vsub_f32(v2077, v2073); - float32x2_t v2096 = vsub_f32(v2095, v2091); - float32x2_t v2107 = vmul_f32(v2091, v3089); - float32x2_t 
v2122 = vmul_f32(v2073, v3089); - float32x2_t v2217 = vsub_f32(v2216, v2212); - float32x2_t v2235 = vsub_f32(v2234, v2230); - float32x2_t v2246 = vmul_f32(v2230, v3089); - float32x2_t v2261 = vmul_f32(v2212, v3089); - float32x2_t v2356 = vsub_f32(v2355, v2351); - float32x2_t v2374 = vsub_f32(v2373, v2369); - float32x2_t v2385 = vmul_f32(v2369, v3089); - float32x2_t v2400 = vmul_f32(v2351, v3089); - float32x2_t v1819 = vadd_f32(v1800, v1818); - float32x2_t v1820 = vsub_f32(v1800, v1818); - float32x2_t v1830 = vadd_f32(v1795, v1829); - float32x2_t v1845 = vsub_f32(v1844, v1813); - float32x2_t v1958 = vadd_f32(v1939, v1957); - float32x2_t v1959 = vsub_f32(v1939, v1957); - float32x2_t v1969 = vadd_f32(v1934, v1968); - float32x2_t v1984 = vsub_f32(v1983, v1952); - float32x2_t v2097 = vadd_f32(v2078, v2096); - float32x2_t v2098 = vsub_f32(v2078, v2096); - float32x2_t v2108 = vadd_f32(v2073, v2107); - float32x2_t v2123 = vsub_f32(v2122, v2091); - float32x2_t v2236 = vadd_f32(v2217, v2235); - float32x2_t v2237 = vsub_f32(v2217, v2235); - float32x2_t v2247 = vadd_f32(v2212, v2246); - float32x2_t v2262 = vsub_f32(v2261, v2230); - float32x2_t v2375 = vadd_f32(v2356, v2374); - float32x2_t v2376 = vsub_f32(v2356, v2374); - float32x2_t v2386 = vadd_f32(v2351, v2385); - float32x2_t v2401 = vsub_f32(v2400, v2369); - float32x2_t v1824 = vmul_f32(v1819, v3069); - float32x2_t v1834 = vmul_f32(v1820, v3079); - float32x2_t v1846 = vadd_f32(v1738, v1819); - float32x2_t v1852 = vrev64_f32(v1830); - float32x2_t v1860 = vrev64_f32(v1845); - float32x2_t v1963 = vmul_f32(v1958, v3069); - float32x2_t v1973 = vmul_f32(v1959, v3079); - float32x2_t v1985 = vadd_f32(v1877, v1958); - float32x2_t v1991 = vrev64_f32(v1969); - float32x2_t v1999 = vrev64_f32(v1984); - float32x2_t v2102 = vmul_f32(v2097, v3069); - float32x2_t v2112 = vmul_f32(v2098, v3079); - float32x2_t v2124 = vadd_f32(v2016, v2097); - float32x2_t v2130 = vrev64_f32(v2108); - float32x2_t v2138 = vrev64_f32(v2123); - float32x2_t v2241 
= vmul_f32(v2236, v3069); - float32x2_t v2251 = vmul_f32(v2237, v3079); - float32x2_t v2263 = vadd_f32(v2155, v2236); - float32x2_t v2269 = vrev64_f32(v2247); - float32x2_t v2277 = vrev64_f32(v2262); - float32x2_t v2380 = vmul_f32(v2375, v3069); - float32x2_t v2390 = vmul_f32(v2376, v3079); - float32x2_t v2402 = vadd_f32(v2294, v2375); - float32x2_t v2408 = vrev64_f32(v2386); - float32x2_t v2416 = vrev64_f32(v2401); - float32x2_t v1825 = vsub_f32(v1738, v1824); - float32x2_t v1853 = vmul_f32(v1852, v3117); - float32x2_t v1861 = vmul_f32(v1860, v3117); - float32x2_t v1964 = vsub_f32(v1877, v1963); - float32x2_t v1992 = vmul_f32(v1991, v3117); - float32x2_t v2000 = vmul_f32(v1999, v3117); - float32x2_t v2103 = vsub_f32(v2016, v2102); - float32x2_t v2131 = vmul_f32(v2130, v3117); - float32x2_t v2139 = vmul_f32(v2138, v3117); - float32x2_t v2242 = vsub_f32(v2155, v2241); - float32x2_t v2270 = vmul_f32(v2269, v3117); - float32x2_t v2278 = vmul_f32(v2277, v3117); - float32x2_t v2381 = vsub_f32(v2294, v2380); - float32x2_t v2409 = vmul_f32(v2408, v3117); - float32x2_t v2417 = vmul_f32(v2416, v3117); - float32x2_t v2438 = vrev64_f32(v1985); - float32x2_t v2450 = vrev64_f32(v2124); - float32x2_t v2462 = vrev64_f32(v2402); - float32x2_t v2480 = vrev64_f32(v2263); - float32x2_t v1835 = vsub_f32(v1825, v1834); - float32x2_t v1839 = vmul_f32(v1825, v3140); - float32x2_t v1974 = vsub_f32(v1964, v1973); - float32x2_t v1978 = vmul_f32(v1964, v3140); - float32x2_t v2113 = vsub_f32(v2103, v2112); - float32x2_t v2117 = vmul_f32(v2103, v3140); - float32x2_t v2252 = vsub_f32(v2242, v2251); - float32x2_t v2256 = vmul_f32(v2242, v3140); - float32x2_t v2391 = vsub_f32(v2381, v2390); - float32x2_t v2395 = vmul_f32(v2381, v3140); - float32x2_t v2439 = vmul_f32(v2438, v2477); - float32x2_t v2451 = vmul_f32(v2450, v2477); - float32x2_t v2463 = vmul_f32(v2462, v2477); - float32x2_t v2481 = vmul_f32(v2480, v2477); - float32x2_t v1840 = vsub_f32(v1839, v1835); - float32x2_t v1862 = 
vsub_f32(v1835, v1861); - float32x2_t v1866 = vmul_f32(v1835, v3140); - float32x2_t v1979 = vsub_f32(v1978, v1974); - float32x2_t v2001 = vsub_f32(v1974, v2000); - float32x2_t v2005 = vmul_f32(v1974, v3140); - float32x2_t v2118 = vsub_f32(v2117, v2113); - float32x2_t v2140 = vsub_f32(v2113, v2139); - float32x2_t v2144 = vmul_f32(v2113, v3140); - float32x2_t v2257 = vsub_f32(v2256, v2252); - float32x2_t v2279 = vsub_f32(v2252, v2278); - float32x2_t v2283 = vmul_f32(v2252, v3140); - float32x2_t v2396 = vsub_f32(v2395, v2391); - float32x2_t v2418 = vsub_f32(v2391, v2417); - float32x2_t v2422 = vmul_f32(v2391, v3140); - float32x2_t v2440 = vadd_f32(v2439, v1985); - float32x2_t v2452 = vadd_f32(v2451, v2124); - float32x2_t v2464 = vadd_f32(v2463, v2402); - float32x2_t v2482 = vadd_f32(v2481, v2263); - float32x2_t v1854 = vsub_f32(v1840, v1853); - float32x2_t v1867 = vsub_f32(v1866, v1862); - float32x2_t v1871 = vmul_f32(v1840, v3140); - float32x2_t v1993 = vsub_f32(v1979, v1992); - float32x2_t v2006 = vsub_f32(v2005, v2001); - float32x2_t v2010 = vmul_f32(v1979, v3140); - float32x2_t v2132 = vsub_f32(v2118, v2131); - float32x2_t v2145 = vsub_f32(v2144, v2140); - float32x2_t v2149 = vmul_f32(v2118, v3140); - float32x2_t v2271 = vsub_f32(v2257, v2270); - float32x2_t v2284 = vsub_f32(v2283, v2279); - float32x2_t v2288 = vmul_f32(v2257, v3140); - float32x2_t v2410 = vsub_f32(v2396, v2409); - float32x2_t v2423 = vsub_f32(v2422, v2418); - float32x2_t v2427 = vmul_f32(v2396, v3140); - float32x2_t v2465 = vsub_f32(v2440, v2464); - float32x2_t v2469 = vmul_f32(v2440, v3140); - float32x2_t v2483 = vsub_f32(v2452, v2482); - float32x2_t v2487 = vmul_f32(v2452, v3140); - float32x2_t v2726 = vrev64_f32(v2001); - float32x2_t v2738 = vrev64_f32(v2140); - float32x2_t v2750 = vrev64_f32(v2418); - float32x2_t v2768 = vrev64_f32(v2279); - float32x2_t v1872 = vsub_f32(v1871, v1854); - float32x2_t v2011 = vsub_f32(v2010, v1993); - float32x2_t v2150 = vsub_f32(v2149, v2132); - float32x2_t 
v2289 = vsub_f32(v2288, v2271); - float32x2_t v2428 = vsub_f32(v2427, v2410); - float32x2_t v2470 = vsub_f32(v2469, v2465); - float32x2_t v2488 = vsub_f32(v2487, v2483); - float32x2_t v2499 = vmul_f32(v2483, v3089); - float32x2_t v2514 = vmul_f32(v2465, v3089); - float32x2_t v2582 = vrev64_f32(v1993); - float32x2_t v2594 = vrev64_f32(v2132); - float32x2_t v2606 = vrev64_f32(v2410); - float32x2_t v2624 = vrev64_f32(v2271); - float32x2_t v2727 = vmul_f32(v2726, v2725); - float32x2_t v2739 = vmul_f32(v2738, v3013); - float32x2_t v2751 = vmul_f32(v2750, v3025); - float32x2_t v2769 = vmul_f32(v2768, v2881); - float32x2_t v2870 = vrev64_f32(v2006); - float32x2_t v2882 = vrev64_f32(v2145); - float32x2_t v2894 = vrev64_f32(v2423); - float32x2_t v2912 = vrev64_f32(v2284); - float32x2_t v2489 = vadd_f32(v2470, v2488); - float32x2_t v2490 = vsub_f32(v2470, v2488); - float32x2_t v2500 = vadd_f32(v2465, v2499); - float32x2_t v2515 = vsub_f32(v2514, v2483); - float32x2_t v2583 = vmul_f32(v2582, v2581); - float32x2_t v2595 = vmul_f32(v2594, v2725); - float32x2_t v2607 = vmul_f32(v2606, v3013); - float32x2_t v2625 = vmul_f32(v2624, v2869); - float32x2_t v2728 = vfma_f32(v2727, v2001, v2719); - float32x2_t v2740 = vfma_f32(v2739, v2140, v3007); - float32x2_t v2752 = vfma_f32(v2751, v2418, v3019); - float32x2_t v2770 = vfma_f32(v2769, v2279, v2875); - float32x2_t v2871 = vmul_f32(v2870, v2869); - float32x2_t v2883 = vmul_f32(v2882, v2881); - float32x2_t v2895 = vmul_f32(v2894, v3055); - float32x2_t v2913 = vmul_f32(v2912, v2911); - float32x2_t v3014 = vrev64_f32(v2011); - float32x2_t v3026 = vrev64_f32(v2150); - float32x2_t v3038 = vrev64_f32(v2428); - float32x2_t v3056 = vrev64_f32(v2289); - float32x2_t v2494 = vmul_f32(v2489, v3069); - float32x2_t v2504 = vmul_f32(v2490, v3079); - float32x2_t v2516 = vadd_f32(v1846, v2489); - float32x2_t v2528 = vrev64_f32(v2500); - float32x2_t v2542 = vrev64_f32(v2515); - float32x2_t v2584 = vfma_f32(v2583, v1993, v2575); - float32x2_t v2596 = 
vfma_f32(v2595, v2132, v2719); - float32x2_t v2608 = vfma_f32(v2607, v2410, v3007); - float32x2_t v2626 = vfma_f32(v2625, v2271, v2863); - float32x2_t v2753 = vsub_f32(v2728, v2752); - float32x2_t v2757 = vmul_f32(v2728, v3140); - float32x2_t v2771 = vsub_f32(v2740, v2770); - float32x2_t v2775 = vmul_f32(v2740, v3140); - float32x2_t v2872 = vfma_f32(v2871, v2006, v2863); - float32x2_t v2884 = vfma_f32(v2883, v2145, v2875); - float32x2_t v2896 = vfma_f32(v2895, v2423, v3049); - float32x2_t v2914 = vfma_f32(v2913, v2284, v3031); - float32x2_t v3015 = vmul_f32(v3014, v3013); - float32x2_t v3027 = vmul_f32(v3026, v3025); - float32x2_t v3039 = vmul_f32(v3038, v3037); - float32x2_t v3057 = vmul_f32(v3056, v3055); - float32x2_t v2495 = vsub_f32(v1846, v2494); - int16x4_t v2519 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2516, 15), (int32x2_t){0, 0})); - float32x2_t v2529 = vmul_f32(v2528, v3117); - float32x2_t v2543 = vmul_f32(v2542, v3117); - float32x2_t v2609 = vsub_f32(v2584, v2608); - float32x2_t v2613 = vmul_f32(v2584, v3140); - float32x2_t v2627 = vsub_f32(v2596, v2626); - float32x2_t v2631 = vmul_f32(v2596, v3140); - float32x2_t v2758 = vsub_f32(v2757, v2753); - float32x2_t v2776 = vsub_f32(v2775, v2771); - float32x2_t v2787 = vmul_f32(v2771, v3089); - float32x2_t v2802 = vmul_f32(v2753, v3089); - float32x2_t v2897 = vsub_f32(v2872, v2896); - float32x2_t v2901 = vmul_f32(v2872, v3140); - float32x2_t v2915 = vsub_f32(v2884, v2914); - float32x2_t v2919 = vmul_f32(v2884, v3140); - float32x2_t v3016 = vfma_f32(v3015, v2011, v3007); - float32x2_t v3028 = vfma_f32(v3027, v2150, v3019); - float32x2_t v3040 = vfma_f32(v3039, v2428, v3031); - float32x2_t v3058 = vfma_f32(v3057, v2289, v3049); - float32x2_t v2505 = vsub_f32(v2495, v2504); - float32x2_t v2509 = vmul_f32(v2495, v3140); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v2519), 0); - float32x2_t v2614 = vsub_f32(v2613, v2609); - float32x2_t v2632 = vsub_f32(v2631, v2627); - float32x2_t v2643 = vmul_f32(v2627, v3089); 
- float32x2_t v2658 = vmul_f32(v2609, v3089); - float32x2_t v2777 = vadd_f32(v2758, v2776); - float32x2_t v2778 = vsub_f32(v2758, v2776); - float32x2_t v2788 = vadd_f32(v2753, v2787); - float32x2_t v2803 = vsub_f32(v2802, v2771); - float32x2_t v2902 = vsub_f32(v2901, v2897); - float32x2_t v2920 = vsub_f32(v2919, v2915); - float32x2_t v2931 = vmul_f32(v2915, v3089); - float32x2_t v2946 = vmul_f32(v2897, v3089); - float32x2_t v3041 = vsub_f32(v3016, v3040); - float32x2_t v3045 = vmul_f32(v3016, v3140); - float32x2_t v3059 = vsub_f32(v3028, v3058); - float32x2_t v3063 = vmul_f32(v3028, v3140); - float32x2_t v2510 = vsub_f32(v2509, v2505); - float32x2_t v2544 = vsub_f32(v2505, v2543); - float32x2_t v2554 = vmul_f32(v2505, v3140); - float32x2_t v2633 = vadd_f32(v2614, v2632); - float32x2_t v2634 = vsub_f32(v2614, v2632); - float32x2_t v2644 = vadd_f32(v2609, v2643); - float32x2_t v2659 = vsub_f32(v2658, v2627); - float32x2_t v2782 = vmul_f32(v2777, v3069); - float32x2_t v2792 = vmul_f32(v2778, v3079); - float32x2_t v2804 = vadd_f32(v1862, v2777); - float32x2_t v2816 = vrev64_f32(v2788); - float32x2_t v2830 = vrev64_f32(v2803); - float32x2_t v2921 = vadd_f32(v2902, v2920); - float32x2_t v2922 = vsub_f32(v2902, v2920); - float32x2_t v2932 = vadd_f32(v2897, v2931); - float32x2_t v2947 = vsub_f32(v2946, v2915); - float32x2_t v3046 = vsub_f32(v3045, v3041); - float32x2_t v3064 = vsub_f32(v3063, v3059); - float32x2_t v3075 = vmul_f32(v3059, v3089); - float32x2_t v3090 = vmul_f32(v3041, v3089); - float32x2_t v2530 = vsub_f32(v2510, v2529); - int16x4_t v2547 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2544, 15), (int32x2_t){0, 0})); - float32x2_t v2555 = vsub_f32(v2554, v2544); - float32x2_t v2565 = vmul_f32(v2510, v3140); - float32x2_t v2638 = vmul_f32(v2633, v3069); - float32x2_t v2648 = vmul_f32(v2634, v3079); - float32x2_t v2660 = vadd_f32(v1854, v2633); - float32x2_t v2672 = vrev64_f32(v2644); - float32x2_t v2686 = vrev64_f32(v2659); - float32x2_t v2783 = vsub_f32(v1862, 
v2782); - int16x4_t v2807 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2804, 15), (int32x2_t){0, 0})); - float32x2_t v2817 = vmul_f32(v2816, v3117); - float32x2_t v2831 = vmul_f32(v2830, v3117); - float32x2_t v2926 = vmul_f32(v2921, v3069); - float32x2_t v2936 = vmul_f32(v2922, v3079); - float32x2_t v2948 = vadd_f32(v1867, v2921); - float32x2_t v2960 = vrev64_f32(v2932); - float32x2_t v2974 = vrev64_f32(v2947); - float32x2_t v3065 = vadd_f32(v3046, v3064); - float32x2_t v3066 = vsub_f32(v3046, v3064); - float32x2_t v3076 = vadd_f32(v3041, v3075); - float32x2_t v3091 = vsub_f32(v3090, v3059); - int16x4_t v2533 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2530, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v2547), 0); - int16x4_t v2558 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2555, 15), (int32x2_t){0, 0})); - float32x2_t v2566 = vsub_f32(v2565, v2530); - float32x2_t v2639 = vsub_f32(v1854, v2638); - int16x4_t v2663 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2660, 15), (int32x2_t){0, 0})); - float32x2_t v2673 = vmul_f32(v2672, v3117); - float32x2_t v2687 = vmul_f32(v2686, v3117); - float32x2_t v2793 = vsub_f32(v2783, v2792); - float32x2_t v2797 = vmul_f32(v2783, v3140); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v2807), 0); - float32x2_t v2927 = vsub_f32(v1867, v2926); - int16x4_t v2951 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2948, 15), (int32x2_t){0, 0})); - float32x2_t v2961 = vmul_f32(v2960, v3117); - float32x2_t v2975 = vmul_f32(v2974, v3117); - float32x2_t v3070 = vmul_f32(v3065, v3069); - float32x2_t v3080 = vmul_f32(v3066, v3079); - float32x2_t v3092 = vadd_f32(v1872, v3065); - float32x2_t v3104 = vrev64_f32(v3076); - float32x2_t v3118 = vrev64_f32(v3091); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v2533), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v2558), 0); - int16x4_t v2569 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2566, 15), (int32x2_t){0, 0})); - float32x2_t v2649 = 
vsub_f32(v2639, v2648); - float32x2_t v2653 = vmul_f32(v2639, v3140); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v2663), 0); - float32x2_t v2798 = vsub_f32(v2797, v2793); - float32x2_t v2832 = vsub_f32(v2793, v2831); - float32x2_t v2842 = vmul_f32(v2793, v3140); - float32x2_t v2937 = vsub_f32(v2927, v2936); - float32x2_t v2941 = vmul_f32(v2927, v3140); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v2951), 0); - float32x2_t v3071 = vsub_f32(v1872, v3070); - int16x4_t v3095 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3092, 15), (int32x2_t){0, 0})); - float32x2_t v3105 = vmul_f32(v3104, v3117); - float32x2_t v3119 = vmul_f32(v3118, v3117); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v2569), 0); - float32x2_t v2654 = vsub_f32(v2653, v2649); - float32x2_t v2688 = vsub_f32(v2649, v2687); - float32x2_t v2698 = vmul_f32(v2649, v3140); - float32x2_t v2818 = vsub_f32(v2798, v2817); - int16x4_t v2835 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2832, 15), (int32x2_t){0, 0})); - float32x2_t v2843 = vsub_f32(v2842, v2832); - float32x2_t v2853 = vmul_f32(v2798, v3140); - float32x2_t v2942 = vsub_f32(v2941, v2937); - float32x2_t v2976 = vsub_f32(v2937, v2975); - float32x2_t v2986 = vmul_f32(v2937, v3140); - float32x2_t v3081 = vsub_f32(v3071, v3080); - float32x2_t v3085 = vmul_f32(v3071, v3140); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v3095), 0); - float32x2_t v2674 = vsub_f32(v2654, v2673); - int16x4_t v2691 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2688, 15), (int32x2_t){0, 0})); - float32x2_t v2699 = vsub_f32(v2698, v2688); - float32x2_t v2709 = vmul_f32(v2654, v3140); - int16x4_t v2821 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2818, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v2835), 0); - int16x4_t v2846 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2843, 15), (int32x2_t){0, 0})); - float32x2_t v2854 = vsub_f32(v2853, v2818); - float32x2_t v2962 = vsub_f32(v2942, v2961); - int16x4_t v2979 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2976, 15), (int32x2_t){0, 0})); - float32x2_t v2987 = vsub_f32(v2986, v2976); - float32x2_t v2997 = vmul_f32(v2942, v3140); - float32x2_t v3086 = vsub_f32(v3085, v3081); - float32x2_t v3120 = vsub_f32(v3081, v3119); - float32x2_t v3130 = vmul_f32(v3081, v3140); - int16x4_t v2677 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2674, 15), (int32x2_t){0, 0})); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v2691), 0); - int16x4_t v2702 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2699, 15), (int32x2_t){0, 0})); - float32x2_t v2710 = vsub_f32(v2709, v2674); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v2821), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v2846), 0); - int16x4_t v2857 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2854, 15), (int32x2_t){0, 0})); - int16x4_t v2965 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2962, 15), (int32x2_t){0, 0})); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v2979), 0); - int16x4_t v2990 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2987, 15), (int32x2_t){0, 0})); - float32x2_t v2998 = vsub_f32(v2997, v2962); - float32x2_t v3106 = vsub_f32(v3086, v3105); - int16x4_t v3123 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3120, 15), (int32x2_t){0, 0})); - float32x2_t v3131 = vsub_f32(v3130, v3120); - float32x2_t v3141 = vmul_f32(v3086, v3140); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v2677), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v2702), 0); - int16x4_t v2713 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2710, 15), (int32x2_t){0, 0})); - v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v2857), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v2965), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v2990), 0); - int16x4_t v3001 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2998, 15), (int32x2_t){0, 0})); - int16x4_t v3109 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3106, 15), (int32x2_t){0, 0})); - v6[ostride 
* 14] = vget_lane_s32(vreinterpret_s32_s16(v3123), 0); - int16x4_t v3134 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3131, 15), (int32x2_t){0, 0})); - float32x2_t v3142 = vsub_f32(v3141, v3106); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v2713), 0); - v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v3001), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v3109), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v3134), 0); - int16x4_t v3145 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3142, 15), (int32x2_t){0, 0})); - v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v3145), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v159 = v5[istride]; + float v856 = 9.6858316112863108e-01F; + float v859 = -2.4868988716485479e-01F; + float v860 = 2.4868988716485479e-01F; + float v1000 = 8.7630668004386358e-01F; + float v1003 = -4.8175367410171532e-01F; + float v1004 = 4.8175367410171532e-01F; + float v1144 = 7.2896862742141155e-01F; + float v1147 = -6.8454710592868862e-01F; + float v1148 = 6.8454710592868862e-01F; + float v1156 = 6.2790519529313527e-02F; + float v1159 = -9.9802672842827156e-01F; + float v1160 = 9.9802672842827156e-01F; + float v1288 = 5.3582679497899655e-01F; + float v1291 = -8.4432792550201508e-01F; + float v1292 = 8.4432792550201508e-01F; + float v1300 = -4.2577929156507272e-01F; + float v1303 = -9.0482705246601947e-01F; + float v1304 = 9.0482705246601947e-01F; + float v1312 = -6.3742398974868952e-01F; + float v1315 = 7.7051324277578936e-01F; + float v1316 = -7.7051324277578936e-01F; + float v1330 = -9.9211470131447776e-01F; + float v1333 = -1.2533323356430454e-01F; + float v1334 = 1.2533323356430454e-01F; + float v1350 = 2.5000000000000000e-01F; + float v1360 = 5.5901699437494745e-01F; + float v1370 = 6.1803398874989490e-01F; + float v1395 = 9.5105651629515353e-01F; + float v1396 = -9.5105651629515353e-01F; + float32x2_t v1398 = (float32x2_t){v4, v4}; + float v1421 = 2.0000000000000000e+00F; + 
float32x2_t v20 = v5[0]; + float32x2_t v857 = (float32x2_t){v856, v856}; + float32x2_t v861 = (float32x2_t){v859, v860}; + float32x2_t v1001 = (float32x2_t){v1000, v1000}; + float32x2_t v1005 = (float32x2_t){v1003, v1004}; + float32x2_t v1145 = (float32x2_t){v1144, v1144}; + float32x2_t v1149 = (float32x2_t){v1147, v1148}; + float32x2_t v1157 = (float32x2_t){v1156, v1156}; + float32x2_t v1161 = (float32x2_t){v1159, v1160}; + float32x2_t v1191 = (float32x2_t){v1316, v1315}; + float32x2_t v1289 = (float32x2_t){v1288, v1288}; + float32x2_t v1293 = (float32x2_t){v1291, v1292}; + float32x2_t v1301 = (float32x2_t){v1300, v1300}; + float32x2_t v1305 = (float32x2_t){v1303, v1304}; + float32x2_t v1313 = (float32x2_t){v1312, v1312}; + float32x2_t v1317 = (float32x2_t){v1315, v1316}; + float32x2_t v1331 = (float32x2_t){v1330, v1330}; + float32x2_t v1335 = (float32x2_t){v1333, v1334}; + float32x2_t v1351 = (float32x2_t){v1350, v1350}; + float32x2_t v1361 = (float32x2_t){v1360, v1360}; + float32x2_t v1371 = (float32x2_t){v1370, v1370}; + float32x2_t v1397 = (float32x2_t){v1395, v1396}; + float32x2_t v1422 = (float32x2_t){v1421, v1421}; + float32x2_t v25 = v5[istride * 5]; + float32x2_t v30 = v5[istride * 10]; + float32x2_t v35 = v5[istride * 15]; + float32x2_t v40 = v5[istride * 20]; + float32x2_t v164 = v5[istride * 6]; + float32x2_t v169 = v5[istride * 11]; + float32x2_t v174 = v5[istride * 16]; + float32x2_t v179 = v5[istride * 21]; + float32x2_t v298 = v5[istride * 2]; + float32x2_t v303 = v5[istride * 7]; + float32x2_t v308 = v5[istride * 12]; + float32x2_t v313 = v5[istride * 17]; + float32x2_t v318 = v5[istride * 22]; + float32x2_t v437 = v5[istride * 3]; + float32x2_t v442 = v5[istride * 8]; + float32x2_t v447 = v5[istride * 13]; + float32x2_t v452 = v5[istride * 18]; + float32x2_t v457 = v5[istride * 23]; + float32x2_t v576 = v5[istride * 4]; + float32x2_t v581 = v5[istride * 9]; + float32x2_t v586 = v5[istride * 14]; + float32x2_t v591 = v5[istride * 19]; + 
float32x2_t v596 = v5[istride * 24]; + float32x2_t v863 = vmul_f32(v1398, v861); + float32x2_t v1007 = vmul_f32(v1398, v1005); + float32x2_t v1151 = vmul_f32(v1398, v1149); + float32x2_t v1163 = vmul_f32(v1398, v1161); + float32x2_t v1193 = vmul_f32(v1398, v1191); + float32x2_t v1295 = vmul_f32(v1398, v1293); + float32x2_t v1307 = vmul_f32(v1398, v1305); + float32x2_t v1319 = vmul_f32(v1398, v1317); + float32x2_t v1337 = vmul_f32(v1398, v1335); + float32x2_t v1399 = vmul_f32(v1398, v1397); + float32x2_t v77 = vsub_f32(v25, v40); + float32x2_t v81 = vmul_f32(v25, v1422); + float32x2_t v95 = vsub_f32(v30, v35); + float32x2_t v99 = vmul_f32(v30, v1422); + float32x2_t v216 = vsub_f32(v164, v179); + float32x2_t v220 = vmul_f32(v164, v1422); + float32x2_t v234 = vsub_f32(v169, v174); + float32x2_t v238 = vmul_f32(v169, v1422); + float32x2_t v355 = vsub_f32(v303, v318); + float32x2_t v359 = vmul_f32(v303, v1422); + float32x2_t v373 = vsub_f32(v308, v313); + float32x2_t v377 = vmul_f32(v308, v1422); + float32x2_t v494 = vsub_f32(v442, v457); + float32x2_t v498 = vmul_f32(v442, v1422); + float32x2_t v512 = vsub_f32(v447, v452); + float32x2_t v516 = vmul_f32(v447, v1422); + float32x2_t v633 = vsub_f32(v581, v596); + float32x2_t v637 = vmul_f32(v581, v1422); + float32x2_t v651 = vsub_f32(v586, v591); + float32x2_t v655 = vmul_f32(v586, v1422); + float32x2_t v82 = vsub_f32(v81, v77); + float32x2_t v100 = vsub_f32(v99, v95); + float32x2_t v111 = vmul_f32(v95, v1371); + float32x2_t v126 = vmul_f32(v77, v1371); + float32x2_t v221 = vsub_f32(v220, v216); + float32x2_t v239 = vsub_f32(v238, v234); + float32x2_t v250 = vmul_f32(v234, v1371); + float32x2_t v265 = vmul_f32(v216, v1371); + float32x2_t v360 = vsub_f32(v359, v355); + float32x2_t v378 = vsub_f32(v377, v373); + float32x2_t v389 = vmul_f32(v373, v1371); + float32x2_t v404 = vmul_f32(v355, v1371); + float32x2_t v499 = vsub_f32(v498, v494); + float32x2_t v517 = vsub_f32(v516, v512); + float32x2_t v528 = vmul_f32(v512, v1371); 
+ float32x2_t v543 = vmul_f32(v494, v1371); + float32x2_t v638 = vsub_f32(v637, v633); + float32x2_t v656 = vsub_f32(v655, v651); + float32x2_t v667 = vmul_f32(v651, v1371); + float32x2_t v682 = vmul_f32(v633, v1371); + float32x2_t v101 = vadd_f32(v82, v100); + float32x2_t v102 = vsub_f32(v82, v100); + float32x2_t v112 = vadd_f32(v77, v111); + float32x2_t v127 = vsub_f32(v126, v95); + float32x2_t v240 = vadd_f32(v221, v239); + float32x2_t v241 = vsub_f32(v221, v239); + float32x2_t v251 = vadd_f32(v216, v250); + float32x2_t v266 = vsub_f32(v265, v234); + float32x2_t v379 = vadd_f32(v360, v378); + float32x2_t v380 = vsub_f32(v360, v378); + float32x2_t v390 = vadd_f32(v355, v389); + float32x2_t v405 = vsub_f32(v404, v373); + float32x2_t v518 = vadd_f32(v499, v517); + float32x2_t v519 = vsub_f32(v499, v517); + float32x2_t v529 = vadd_f32(v494, v528); + float32x2_t v544 = vsub_f32(v543, v512); + float32x2_t v657 = vadd_f32(v638, v656); + float32x2_t v658 = vsub_f32(v638, v656); + float32x2_t v668 = vadd_f32(v633, v667); + float32x2_t v683 = vsub_f32(v682, v651); + float32x2_t v106 = vmul_f32(v101, v1351); + float32x2_t v116 = vmul_f32(v102, v1361); + float32x2_t v128 = vadd_f32(v20, v101); + float32x2_t v134 = vrev64_f32(v112); + float32x2_t v142 = vrev64_f32(v127); + float32x2_t v245 = vmul_f32(v240, v1351); + float32x2_t v255 = vmul_f32(v241, v1361); + float32x2_t v267 = vadd_f32(v159, v240); + float32x2_t v273 = vrev64_f32(v251); + float32x2_t v281 = vrev64_f32(v266); + float32x2_t v384 = vmul_f32(v379, v1351); + float32x2_t v394 = vmul_f32(v380, v1361); + float32x2_t v406 = vadd_f32(v298, v379); + float32x2_t v412 = vrev64_f32(v390); + float32x2_t v420 = vrev64_f32(v405); + float32x2_t v523 = vmul_f32(v518, v1351); + float32x2_t v533 = vmul_f32(v519, v1361); + float32x2_t v545 = vadd_f32(v437, v518); + float32x2_t v551 = vrev64_f32(v529); + float32x2_t v559 = vrev64_f32(v544); + float32x2_t v662 = vmul_f32(v657, v1351); + float32x2_t v672 = vmul_f32(v658, v1361); + 
float32x2_t v684 = vadd_f32(v576, v657); + float32x2_t v690 = vrev64_f32(v668); + float32x2_t v698 = vrev64_f32(v683); + float32x2_t v107 = vsub_f32(v20, v106); + float32x2_t v135 = vmul_f32(v134, v1399); + float32x2_t v143 = vmul_f32(v142, v1399); + float32x2_t v246 = vsub_f32(v159, v245); + float32x2_t v274 = vmul_f32(v273, v1399); + float32x2_t v282 = vmul_f32(v281, v1399); + float32x2_t v385 = vsub_f32(v298, v384); + float32x2_t v413 = vmul_f32(v412, v1399); + float32x2_t v421 = vmul_f32(v420, v1399); + float32x2_t v524 = vsub_f32(v437, v523); + float32x2_t v552 = vmul_f32(v551, v1399); + float32x2_t v560 = vmul_f32(v559, v1399); + float32x2_t v663 = vsub_f32(v576, v662); + float32x2_t v691 = vmul_f32(v690, v1399); + float32x2_t v699 = vmul_f32(v698, v1399); + float32x2_t v747 = vsub_f32(v267, v684); + float32x2_t v751 = vmul_f32(v267, v1422); + float32x2_t v765 = vsub_f32(v406, v545); + float32x2_t v769 = vmul_f32(v406, v1422); + float32x2_t v117 = vsub_f32(v107, v116); + float32x2_t v121 = vmul_f32(v107, v1422); + float32x2_t v256 = vsub_f32(v246, v255); + float32x2_t v260 = vmul_f32(v246, v1422); + float32x2_t v395 = vsub_f32(v385, v394); + float32x2_t v399 = vmul_f32(v385, v1422); + float32x2_t v534 = vsub_f32(v524, v533); + float32x2_t v538 = vmul_f32(v524, v1422); + float32x2_t v673 = vsub_f32(v663, v672); + float32x2_t v677 = vmul_f32(v663, v1422); + float32x2_t v752 = vsub_f32(v751, v747); + float32x2_t v770 = vsub_f32(v769, v765); + float32x2_t v781 = vmul_f32(v765, v1371); + float32x2_t v796 = vmul_f32(v747, v1371); + float32x2_t v122 = vsub_f32(v121, v117); + float32x2_t v144 = vsub_f32(v117, v143); + float32x2_t v148 = vmul_f32(v117, v1422); + float32x2_t v261 = vsub_f32(v260, v256); + float32x2_t v283 = vsub_f32(v256, v282); + float32x2_t v287 = vmul_f32(v256, v1422); + float32x2_t v400 = vsub_f32(v399, v395); + float32x2_t v422 = vsub_f32(v395, v421); + float32x2_t v426 = vmul_f32(v395, v1422); + float32x2_t v539 = vsub_f32(v538, v534); + 
float32x2_t v561 = vsub_f32(v534, v560); + float32x2_t v565 = vmul_f32(v534, v1422); + float32x2_t v678 = vsub_f32(v677, v673); + float32x2_t v700 = vsub_f32(v673, v699); + float32x2_t v704 = vmul_f32(v673, v1422); + float32x2_t v771 = vadd_f32(v752, v770); + float32x2_t v772 = vsub_f32(v752, v770); + float32x2_t v782 = vadd_f32(v747, v781); + float32x2_t v797 = vsub_f32(v796, v765); + float32x2_t v136 = vsub_f32(v122, v135); + float32x2_t v149 = vsub_f32(v148, v144); + float32x2_t v153 = vmul_f32(v122, v1422); + float32x2_t v275 = vsub_f32(v261, v274); + float32x2_t v288 = vsub_f32(v287, v283); + float32x2_t v292 = vmul_f32(v261, v1422); + float32x2_t v414 = vsub_f32(v400, v413); + float32x2_t v427 = vsub_f32(v426, v422); + float32x2_t v431 = vmul_f32(v400, v1422); + float32x2_t v553 = vsub_f32(v539, v552); + float32x2_t v566 = vsub_f32(v565, v561); + float32x2_t v570 = vmul_f32(v539, v1422); + float32x2_t v692 = vsub_f32(v678, v691); + float32x2_t v705 = vsub_f32(v704, v700); + float32x2_t v709 = vmul_f32(v678, v1422); + float32x2_t v776 = vmul_f32(v771, v1351); + float32x2_t v786 = vmul_f32(v772, v1361); + float32x2_t v798 = vadd_f32(v128, v771); + float32x2_t v810 = vrev64_f32(v782); + float32x2_t v824 = vrev64_f32(v797); + float32x2_t v1008 = vrev64_f32(v283); + float32x2_t v1020 = vrev64_f32(v422); + float32x2_t v1032 = vrev64_f32(v700); + float32x2_t v1050 = vrev64_f32(v561); + float32x2_t v154 = vsub_f32(v153, v136); + float32x2_t v293 = vsub_f32(v292, v275); + float32x2_t v432 = vsub_f32(v431, v414); + float32x2_t v571 = vsub_f32(v570, v553); + float32x2_t v710 = vsub_f32(v709, v692); + float32x2_t v777 = vsub_f32(v128, v776); + int16x4_t v801 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v798, 15), (int32x2_t){0, 0})); + float32x2_t v811 = vmul_f32(v810, v1399); + float32x2_t v825 = vmul_f32(v824, v1399); + float32x2_t v864 = vrev64_f32(v275); + float32x2_t v876 = vrev64_f32(v414); + float32x2_t v888 = vrev64_f32(v692); + float32x2_t v906 = vrev64_f32(v553); 
+ float32x2_t v1009 = vmul_f32(v1008, v1007); + float32x2_t v1021 = vmul_f32(v1020, v1295); + float32x2_t v1033 = vmul_f32(v1032, v1307); + float32x2_t v1051 = vmul_f32(v1050, v1163); + float32x2_t v1152 = vrev64_f32(v288); + float32x2_t v1164 = vrev64_f32(v427); + float32x2_t v1176 = vrev64_f32(v705); + float32x2_t v1194 = vrev64_f32(v566); + float32x2_t v787 = vsub_f32(v777, v786); + float32x2_t v791 = vmul_f32(v777, v1422); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v801), 0); + float32x2_t v865 = vmul_f32(v864, v863); + float32x2_t v877 = vmul_f32(v876, v1007); + float32x2_t v889 = vmul_f32(v888, v1295); + float32x2_t v907 = vmul_f32(v906, v1151); + float32x2_t v1010 = vfma_f32(v1009, v283, v1001); + float32x2_t v1022 = vfma_f32(v1021, v422, v1289); + float32x2_t v1034 = vfma_f32(v1033, v700, v1301); + float32x2_t v1052 = vfma_f32(v1051, v561, v1157); + float32x2_t v1153 = vmul_f32(v1152, v1151); + float32x2_t v1165 = vmul_f32(v1164, v1163); + float32x2_t v1177 = vmul_f32(v1176, v1337); + float32x2_t v1195 = vmul_f32(v1194, v1193); + float32x2_t v1296 = vrev64_f32(v293); + float32x2_t v1308 = vrev64_f32(v432); + float32x2_t v1320 = vrev64_f32(v710); + float32x2_t v1338 = vrev64_f32(v571); + float32x2_t v792 = vsub_f32(v791, v787); + float32x2_t v826 = vsub_f32(v787, v825); + float32x2_t v836 = vmul_f32(v787, v1422); + float32x2_t v866 = vfma_f32(v865, v275, v857); + float32x2_t v878 = vfma_f32(v877, v414, v1001); + float32x2_t v890 = vfma_f32(v889, v692, v1289); + float32x2_t v908 = vfma_f32(v907, v553, v1145); + float32x2_t v1035 = vsub_f32(v1010, v1034); + float32x2_t v1039 = vmul_f32(v1010, v1422); + float32x2_t v1053 = vsub_f32(v1022, v1052); + float32x2_t v1057 = vmul_f32(v1022, v1422); + float32x2_t v1154 = vfma_f32(v1153, v288, v1145); + float32x2_t v1166 = vfma_f32(v1165, v427, v1157); + float32x2_t v1178 = vfma_f32(v1177, v705, v1331); + float32x2_t v1196 = vfma_f32(v1195, v566, v1313); + float32x2_t v1297 = vmul_f32(v1296, v1295); + float32x2_t 
v1309 = vmul_f32(v1308, v1307); + float32x2_t v1321 = vmul_f32(v1320, v1319); + float32x2_t v1339 = vmul_f32(v1338, v1337); + float32x2_t v812 = vsub_f32(v792, v811); + int16x4_t v829 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v826, 15), (int32x2_t){0, 0})); + float32x2_t v837 = vsub_f32(v836, v826); + float32x2_t v847 = vmul_f32(v792, v1422); + float32x2_t v891 = vsub_f32(v866, v890); + float32x2_t v895 = vmul_f32(v866, v1422); + float32x2_t v909 = vsub_f32(v878, v908); + float32x2_t v913 = vmul_f32(v878, v1422); + float32x2_t v1040 = vsub_f32(v1039, v1035); + float32x2_t v1058 = vsub_f32(v1057, v1053); + float32x2_t v1069 = vmul_f32(v1053, v1371); + float32x2_t v1084 = vmul_f32(v1035, v1371); + float32x2_t v1179 = vsub_f32(v1154, v1178); + float32x2_t v1183 = vmul_f32(v1154, v1422); + float32x2_t v1197 = vsub_f32(v1166, v1196); + float32x2_t v1201 = vmul_f32(v1166, v1422); + float32x2_t v1298 = vfma_f32(v1297, v293, v1289); + float32x2_t v1310 = vfma_f32(v1309, v432, v1301); + float32x2_t v1322 = vfma_f32(v1321, v710, v1313); + float32x2_t v1340 = vfma_f32(v1339, v571, v1331); + int16x4_t v815 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v812, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v829), 0); + int16x4_t v840 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v837, 15), (int32x2_t){0, 0})); + float32x2_t v848 = vsub_f32(v847, v812); + float32x2_t v896 = vsub_f32(v895, v891); + float32x2_t v914 = vsub_f32(v913, v909); + float32x2_t v925 = vmul_f32(v909, v1371); + float32x2_t v940 = vmul_f32(v891, v1371); + float32x2_t v1059 = vadd_f32(v1040, v1058); + float32x2_t v1060 = vsub_f32(v1040, v1058); + float32x2_t v1070 = vadd_f32(v1035, v1069); + float32x2_t v1085 = vsub_f32(v1084, v1053); + float32x2_t v1184 = vsub_f32(v1183, v1179); + float32x2_t v1202 = vsub_f32(v1201, v1197); + float32x2_t v1213 = vmul_f32(v1197, v1371); + float32x2_t v1228 = vmul_f32(v1179, v1371); + float32x2_t v1323 = vsub_f32(v1298, v1322); + float32x2_t v1327 = 
vmul_f32(v1298, v1422); + float32x2_t v1341 = vsub_f32(v1310, v1340); + float32x2_t v1345 = vmul_f32(v1310, v1422); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v815), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v840), 0); + int16x4_t v851 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v848, 15), (int32x2_t){0, 0})); + float32x2_t v915 = vadd_f32(v896, v914); + float32x2_t v916 = vsub_f32(v896, v914); + float32x2_t v926 = vadd_f32(v891, v925); + float32x2_t v941 = vsub_f32(v940, v909); + float32x2_t v1064 = vmul_f32(v1059, v1351); + float32x2_t v1074 = vmul_f32(v1060, v1361); + float32x2_t v1086 = vadd_f32(v144, v1059); + float32x2_t v1098 = vrev64_f32(v1070); + float32x2_t v1112 = vrev64_f32(v1085); + float32x2_t v1203 = vadd_f32(v1184, v1202); + float32x2_t v1204 = vsub_f32(v1184, v1202); + float32x2_t v1214 = vadd_f32(v1179, v1213); + float32x2_t v1229 = vsub_f32(v1228, v1197); + float32x2_t v1328 = vsub_f32(v1327, v1323); + float32x2_t v1346 = vsub_f32(v1345, v1341); + float32x2_t v1357 = vmul_f32(v1341, v1371); + float32x2_t v1372 = vmul_f32(v1323, v1371); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v851), 0); + float32x2_t v920 = vmul_f32(v915, v1351); + float32x2_t v930 = vmul_f32(v916, v1361); + float32x2_t v942 = vadd_f32(v136, v915); + float32x2_t v954 = vrev64_f32(v926); + float32x2_t v968 = vrev64_f32(v941); + float32x2_t v1065 = vsub_f32(v144, v1064); + int16x4_t v1089 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1086, 15), (int32x2_t){0, 0})); + float32x2_t v1099 = vmul_f32(v1098, v1399); + float32x2_t v1113 = vmul_f32(v1112, v1399); + float32x2_t v1208 = vmul_f32(v1203, v1351); + float32x2_t v1218 = vmul_f32(v1204, v1361); + float32x2_t v1230 = vadd_f32(v149, v1203); + float32x2_t v1242 = vrev64_f32(v1214); + float32x2_t v1256 = vrev64_f32(v1229); + float32x2_t v1347 = vadd_f32(v1328, v1346); + float32x2_t v1348 = vsub_f32(v1328, v1346); + float32x2_t v1358 = vadd_f32(v1323, v1357); + float32x2_t v1373 = 
vsub_f32(v1372, v1341); + float32x2_t v921 = vsub_f32(v136, v920); + int16x4_t v945 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v942, 15), (int32x2_t){0, 0})); + float32x2_t v955 = vmul_f32(v954, v1399); + float32x2_t v969 = vmul_f32(v968, v1399); + float32x2_t v1075 = vsub_f32(v1065, v1074); + float32x2_t v1079 = vmul_f32(v1065, v1422); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1089), 0); + float32x2_t v1209 = vsub_f32(v149, v1208); + int16x4_t v1233 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1230, 15), (int32x2_t){0, 0})); + float32x2_t v1243 = vmul_f32(v1242, v1399); + float32x2_t v1257 = vmul_f32(v1256, v1399); + float32x2_t v1352 = vmul_f32(v1347, v1351); + float32x2_t v1362 = vmul_f32(v1348, v1361); + float32x2_t v1374 = vadd_f32(v154, v1347); + float32x2_t v1386 = vrev64_f32(v1358); + float32x2_t v1400 = vrev64_f32(v1373); + float32x2_t v931 = vsub_f32(v921, v930); + float32x2_t v935 = vmul_f32(v921, v1422); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v945), 0); + float32x2_t v1080 = vsub_f32(v1079, v1075); + float32x2_t v1114 = vsub_f32(v1075, v1113); + float32x2_t v1124 = vmul_f32(v1075, v1422); + float32x2_t v1219 = vsub_f32(v1209, v1218); + float32x2_t v1223 = vmul_f32(v1209, v1422); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1233), 0); + float32x2_t v1353 = vsub_f32(v154, v1352); + int16x4_t v1377 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1374, 15), (int32x2_t){0, 0})); + float32x2_t v1387 = vmul_f32(v1386, v1399); + float32x2_t v1401 = vmul_f32(v1400, v1399); + float32x2_t v936 = vsub_f32(v935, v931); + float32x2_t v970 = vsub_f32(v931, v969); + float32x2_t v980 = vmul_f32(v931, v1422); + float32x2_t v1100 = vsub_f32(v1080, v1099); + int16x4_t v1117 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1114, 15), (int32x2_t){0, 0})); + float32x2_t v1125 = vsub_f32(v1124, v1114); + float32x2_t v1135 = vmul_f32(v1080, v1422); + float32x2_t v1224 = vsub_f32(v1223, v1219); + float32x2_t v1258 = vsub_f32(v1219, v1257); + 
float32x2_t v1268 = vmul_f32(v1219, v1422); + float32x2_t v1363 = vsub_f32(v1353, v1362); + float32x2_t v1367 = vmul_f32(v1353, v1422); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1377), 0); + float32x2_t v956 = vsub_f32(v936, v955); + int16x4_t v973 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v970, 15), (int32x2_t){0, 0})); + float32x2_t v981 = vsub_f32(v980, v970); + float32x2_t v991 = vmul_f32(v936, v1422); + int16x4_t v1103 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1100, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1117), 0); + int16x4_t v1128 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1125, 15), (int32x2_t){0, 0})); + float32x2_t v1136 = vsub_f32(v1135, v1100); + float32x2_t v1244 = vsub_f32(v1224, v1243); + int16x4_t v1261 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1258, 15), (int32x2_t){0, 0})); + float32x2_t v1269 = vsub_f32(v1268, v1258); + float32x2_t v1279 = vmul_f32(v1224, v1422); + float32x2_t v1368 = vsub_f32(v1367, v1363); + float32x2_t v1402 = vsub_f32(v1363, v1401); + float32x2_t v1412 = vmul_f32(v1363, v1422); + int16x4_t v959 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v956, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v973), 0); + int16x4_t v984 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v981, 15), (int32x2_t){0, 0})); + float32x2_t v992 = vsub_f32(v991, v956); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1103), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1128), 0); + int16x4_t v1139 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1136, 15), (int32x2_t){0, 0})); + int16x4_t v1247 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1244, 15), (int32x2_t){0, 0})); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1261), 0); + int16x4_t v1272 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1269, 15), (int32x2_t){0, 0})); + float32x2_t v1280 = vsub_f32(v1279, v1244); + float32x2_t v1388 = vsub_f32(v1368, v1387); + int16x4_t v1405 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1402, 15), (int32x2_t){0, 0})); + float32x2_t v1413 = vsub_f32(v1412, v1402); + float32x2_t v1423 = vmul_f32(v1368, v1422); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v959), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v984), 0); + int16x4_t v995 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v992, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1139), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1247), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1272), 0); + int16x4_t v1283 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1280, 15), (int32x2_t){0, 0})); + int16x4_t v1391 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1388, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1405), 0); + int16x4_t v1416 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1413, 15), (int32x2_t){0, 0})); + float32x2_t v1424 = vsub_f32(v1423, v1388); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v995), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1283), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1391), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1416), 0); + int16x4_t v1427 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1424, 15), (int32x2_t){0, 0})); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v1427), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -18586,7 +11424,6 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, float v1565 = 2.5000000000000000e-01F; float v1577 = 5.5901699437494745e-01F; float v1589 = 6.1803398874989490e-01F; - float v1619 = 0.0000000000000000e+00F; float v1620 = -9.5105651629515353e-01F; float v1650 = 2.0000000000000000e+00F; const float32x2_t *v1734 = &v5[v0]; @@ -18614,7 +11451,6 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int64_t v681 = v0 * 14; int64_t v688 = v0 * 19; int64_t v695 = v0 * 24; 
- float v883 = v4 * v1619; int64_t v944 = v2 * 5; int64_t v960 = v2 * 10; int64_t v974 = v2 * 15; @@ -18649,6 +11485,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int64_t v1642 = v2 * 19; int64_t v1656 = v2 * 24; const float32x2_t *v1670 = &v5[0]; + svfloat32_t v1992 = svdup_n_f32(0); int32_t *v2006 = &v6[0]; svfloat32_t v2049 = svdup_n_f32(v996); svfloat32_t v2113 = svdup_n_f32(v1163); @@ -18662,7 +11499,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v2253 = svdup_n_f32(v1577); svfloat32_t v2255 = svdup_n_f32(v1589); svfloat32_t v2295 = svdup_n_f32(v1650); - svfloat32_t v2316 = svreinterpret_f32_f64( + svfloat32_t v2340 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1734)[0])); const float32x2_t *v1679 = &v5[v26]; const float32x2_t *v1688 = &v5[v33]; @@ -18687,7 +11524,6 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, const float32x2_t *v1944 = &v5[v681]; const float32x2_t *v1953 = &v5[v688]; const float32x2_t *v1962 = &v5[v695]; - svfloat32_t v1992 = svdup_n_f32(v883); int32_t *v2016 = &v6[v944]; int32_t *v2026 = &v6[v960]; int32_t *v2036 = &v6[v974]; @@ -18721,94 +11557,84 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, int32_t *v2282 = &v6[v1628]; int32_t *v2292 = &v6[v1642]; int32_t *v2302 = &v6[v1656]; - svfloat32_t v2306 = svreinterpret_f32_f64( + svfloat32_t v2330 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1670)[0])); - svfloat32_t v2308 = svreinterpret_f32_f64( + svfloat32_t v2332 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1679)[0])); - svfloat32_t v2310 = svreinterpret_f32_f64( + svfloat32_t v2334 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1688)[0])); - svfloat32_t v2312 = svreinterpret_f32_f64( + svfloat32_t v2336 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1697)[0])); - svfloat32_t v2314 = 
svreinterpret_f32_f64( + svfloat32_t v2338 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1706)[0])); - svfloat32_t v2318 = svreinterpret_f32_f64( + svfloat32_t v2342 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1743)[0])); - svfloat32_t v2320 = svreinterpret_f32_f64( + svfloat32_t v2344 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1752)[0])); - svfloat32_t v2322 = svreinterpret_f32_f64( + svfloat32_t v2346 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1761)[0])); - svfloat32_t v2324 = svreinterpret_f32_f64( + svfloat32_t v2348 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1770)[0])); - svfloat32_t v2326 = svreinterpret_f32_f64( + svfloat32_t v2350 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1798)[0])); - svfloat32_t v2328 = svreinterpret_f32_f64( + svfloat32_t v2352 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1807)[0])); - svfloat32_t v2330 = svreinterpret_f32_f64( + svfloat32_t v2354 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1816)[0])); - svfloat32_t v2332 = svreinterpret_f32_f64( + svfloat32_t v2356 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1825)[0])); - svfloat32_t v2334 = svreinterpret_f32_f64( + svfloat32_t v2358 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1834)[0])); - svfloat32_t v2336 = svreinterpret_f32_f64( + svfloat32_t v2360 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1862)[0])); - svfloat32_t v2338 = svreinterpret_f32_f64( + svfloat32_t v2362 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1871)[0])); - svfloat32_t v2340 = svreinterpret_f32_f64( + svfloat32_t v2364 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1880)[0])); - svfloat32_t v2342 = svreinterpret_f32_f64( + svfloat32_t v2366 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1889)[0])); - svfloat32_t v2344 = 
svreinterpret_f32_f64( + svfloat32_t v2368 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1898)[0])); - svfloat32_t v2346 = svreinterpret_f32_f64( + svfloat32_t v2370 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1926)[0])); - svfloat32_t v2348 = svreinterpret_f32_f64( + svfloat32_t v2372 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1935)[0])); - svfloat32_t v2350 = svreinterpret_f32_f64( + svfloat32_t v2374 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1944)[0])); - svfloat32_t v2352 = svreinterpret_f32_f64( + svfloat32_t v2376 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1953)[0])); - svfloat32_t v2354 = svreinterpret_f32_f64( + svfloat32_t v2378 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1962)[0])); - svfloat32_t v65 = svcmla_f32_x(pred_full, v2308, v1992, v2308, 90); - svfloat32_t v78 = svcmla_f32_x(pred_full, v2310, v1992, v2310, 90); - svfloat32_t v91 = svcmla_f32_x(pred_full, v2314, v1992, v2314, 90); - svfloat32_t v111 = svcmla_f32_x(pred_full, v2312, v1992, v2312, 90); - svfloat32_t v227 = svcmla_f32_x(pred_full, v2318, v1992, v2318, 90); - svfloat32_t v240 = svcmla_f32_x(pred_full, v2320, v1992, v2320, 90); - svfloat32_t v253 = svcmla_f32_x(pred_full, v2324, v1992, v2324, 90); - svfloat32_t v273 = svcmla_f32_x(pred_full, v2322, v1992, v2322, 90); - svfloat32_t v389 = svcmla_f32_x(pred_full, v2328, v1992, v2328, 90); - svfloat32_t v402 = svcmla_f32_x(pred_full, v2330, v1992, v2330, 90); - svfloat32_t v415 = svcmla_f32_x(pred_full, v2334, v1992, v2334, 90); - svfloat32_t v435 = svcmla_f32_x(pred_full, v2332, v1992, v2332, 90); - svfloat32_t v551 = svcmla_f32_x(pred_full, v2338, v1992, v2338, 90); - svfloat32_t v564 = svcmla_f32_x(pred_full, v2340, v1992, v2340, 90); - svfloat32_t v577 = svcmla_f32_x(pred_full, v2344, v1992, v2344, 90); - svfloat32_t v597 = svcmla_f32_x(pred_full, v2342, v1992, v2342, 90); - svfloat32_t v713 = 
svcmla_f32_x(pred_full, v2348, v1992, v2348, 90); - svfloat32_t v726 = svcmla_f32_x(pred_full, v2350, v1992, v2350, 90); - svfloat32_t v739 = svcmla_f32_x(pred_full, v2354, v1992, v2354, 90); - svfloat32_t v759 = svcmla_f32_x(pred_full, v2352, v1992, v2352, 90); - svfloat32_t v92; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v65), "w"(v91)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v78), "w"(v111)); - svfloat32_t v254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v254) : "w"(v227), "w"(v253)); - svfloat32_t v274; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v274) : "w"(v240), "w"(v273)); - svfloat32_t v416; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v389), "w"(v415)); - svfloat32_t v436; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v402), "w"(v435)); - svfloat32_t v578; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v551), "w"(v577)); - svfloat32_t v598; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v564), "w"(v597)); - svfloat32_t v740; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v740) : "w"(v713), "w"(v739)); - svfloat32_t v760; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v760) : "w"(v726), "w"(v759)); + svfloat32_t v65 = svcmla_f32_x(pred_full, v2332, v1992, v2332, 90); + svfloat32_t v78 = svcmla_f32_x(pred_full, v2334, v1992, v2334, 90); + svfloat32_t v91 = svcmla_f32_x(pred_full, v2338, v1992, v2338, 90); + svfloat32_t v111 = svcmla_f32_x(pred_full, v2336, v1992, v2336, 90); + svfloat32_t v227 = svcmla_f32_x(pred_full, v2342, v1992, v2342, 90); + svfloat32_t v240 = svcmla_f32_x(pred_full, v2344, v1992, v2344, 90); + svfloat32_t v253 = svcmla_f32_x(pred_full, v2348, v1992, v2348, 90); + svfloat32_t v273 = svcmla_f32_x(pred_full, v2346, v1992, v2346, 90); + svfloat32_t v389 = svcmla_f32_x(pred_full, v2352, v1992, v2352, 90); + svfloat32_t v402 = svcmla_f32_x(pred_full, v2354, v1992, v2354, 90); + svfloat32_t v415 = svcmla_f32_x(pred_full, v2358, v1992, v2358, 90); + svfloat32_t v435 = svcmla_f32_x(pred_full, v2356, v1992, v2356, 90); + 
svfloat32_t v551 = svcmla_f32_x(pred_full, v2362, v1992, v2362, 90); + svfloat32_t v564 = svcmla_f32_x(pred_full, v2364, v1992, v2364, 90); + svfloat32_t v577 = svcmla_f32_x(pred_full, v2368, v1992, v2368, 90); + svfloat32_t v597 = svcmla_f32_x(pred_full, v2366, v1992, v2366, 90); + svfloat32_t v713 = svcmla_f32_x(pred_full, v2372, v1992, v2372, 90); + svfloat32_t v726 = svcmla_f32_x(pred_full, v2374, v1992, v2374, 90); + svfloat32_t v739 = svcmla_f32_x(pred_full, v2378, v1992, v2378, 90); + svfloat32_t v759 = svcmla_f32_x(pred_full, v2376, v1992, v2376, 90); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v65, v91); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v78, v111); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v227, v253); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v240, v273); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v389, v415); + svfloat32_t v436 = svsub_f32_x(svptrue_b32(), v402, v435); + svfloat32_t v578 = svsub_f32_x(svptrue_b32(), v551, v577); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v564, v597); + svfloat32_t v740 = svsub_f32_x(svptrue_b32(), v713, v739); + svfloat32_t v760 = svsub_f32_x(svptrue_b32(), v726, v759); svfloat32_t v98 = svnmls_f32_x(pred_full, v92, v65, v2295); svfloat32_t v118 = svnmls_f32_x(pred_full, v112, v78, v2295); svfloat32_t v260 = svnmls_f32_x(pred_full, v254, v227, v2295); @@ -18819,81 +11645,56 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v604 = svnmls_f32_x(pred_full, v598, v564, v2295); svfloat32_t v746 = svnmls_f32_x(pred_full, v740, v713, v2295); svfloat32_t v766 = svnmls_f32_x(pred_full, v760, v726, v2295); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v98), "w"(v118)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v98), "w"(v118)); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v98, v118); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v98, v118); svfloat32_t v132 = svmla_f32_x(pred_full, v92, v112, 
v2255); svfloat32_t v150 = svnmls_f32_x(pred_full, v112, v92, v2255); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v260), "w"(v280)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v260), "w"(v280)); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v260, v280); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v260, v280); svfloat32_t v294 = svmla_f32_x(pred_full, v254, v274, v2255); svfloat32_t v312 = svnmls_f32_x(pred_full, v274, v254, v2255); - svfloat32_t v443; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v422), "w"(v442)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v422), "w"(v442)); + svfloat32_t v443 = svadd_f32_x(svptrue_b32(), v422, v442); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v422, v442); svfloat32_t v456 = svmla_f32_x(pred_full, v416, v436, v2255); svfloat32_t v474 = svnmls_f32_x(pred_full, v436, v416, v2255); - svfloat32_t v605; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v584), "w"(v604)); - svfloat32_t v606; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v584), "w"(v604)); + svfloat32_t v605 = svadd_f32_x(svptrue_b32(), v584, v604); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v584, v604); svfloat32_t v618 = svmla_f32_x(pred_full, v578, v598, v2255); svfloat32_t v636 = svnmls_f32_x(pred_full, v598, v578, v2255); - svfloat32_t v767; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v746), "w"(v766)); - svfloat32_t v768; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v746), "w"(v766)); + svfloat32_t v767 = svadd_f32_x(svptrue_b32(), v746, v766); + svfloat32_t v768 = svsub_f32_x(svptrue_b32(), v746, v766); svfloat32_t v780 = svmla_f32_x(pred_full, v740, v760, v2255); svfloat32_t v798 = svnmls_f32_x(pred_full, v760, v740, v2255); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v2306), "w"(v119)); - svfloat32_t zero158; - asm volatile("mov %0.s, #0" : "=w"(zero158)); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v2330, v119); + svfloat32_t 
zero158 = svdup_n_f32(0); svfloat32_t v158 = svcmla_f32_x(pred_full, zero158, v2275, v132, 90); - svfloat32_t zero166; - asm volatile("mov %0.s, #0" : "=w"(zero166)); + svfloat32_t zero166 = svdup_n_f32(0); svfloat32_t v166 = svcmla_f32_x(pred_full, zero166, v2275, v150, 90); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v2316), "w"(v281)); - svfloat32_t zero320; - asm volatile("mov %0.s, #0" : "=w"(zero320)); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v2340, v281); + svfloat32_t zero320 = svdup_n_f32(0); svfloat32_t v320 = svcmla_f32_x(pred_full, zero320, v2275, v294, 90); - svfloat32_t zero328; - asm volatile("mov %0.s, #0" : "=w"(zero328)); + svfloat32_t zero328 = svdup_n_f32(0); svfloat32_t v328 = svcmla_f32_x(pred_full, zero328, v2275, v312, 90); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v2326), "w"(v443)); - svfloat32_t zero482; - asm volatile("mov %0.s, #0" : "=w"(zero482)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v2350, v443); + svfloat32_t zero482 = svdup_n_f32(0); svfloat32_t v482 = svcmla_f32_x(pred_full, zero482, v2275, v456, 90); - svfloat32_t zero490; - asm volatile("mov %0.s, #0" : "=w"(zero490)); + svfloat32_t zero490 = svdup_n_f32(0); svfloat32_t v490 = svcmla_f32_x(pred_full, zero490, v2275, v474, 90); - svfloat32_t v637; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v637) : "w"(v2336), "w"(v605)); - svfloat32_t zero644; - asm volatile("mov %0.s, #0" : "=w"(zero644)); + svfloat32_t v637 = svadd_f32_x(svptrue_b32(), v2360, v605); + svfloat32_t zero644 = svdup_n_f32(0); svfloat32_t v644 = svcmla_f32_x(pred_full, zero644, v2275, v618, 90); - svfloat32_t zero652; - asm volatile("mov %0.s, #0" : "=w"(zero652)); + svfloat32_t zero652 = svdup_n_f32(0); svfloat32_t v652 = svcmla_f32_x(pred_full, zero652, v2275, v636, 90); - svfloat32_t v799; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v2346), "w"(v767)); - svfloat32_t zero806; - asm volatile("mov %0.s, #0" : "=w"(zero806)); + svfloat32_t v799 = 
svadd_f32_x(svptrue_b32(), v2370, v767); + svfloat32_t zero806 = svdup_n_f32(0); svfloat32_t v806 = svcmla_f32_x(pred_full, zero806, v2275, v780, 90); - svfloat32_t zero814; - asm volatile("mov %0.s, #0" : "=w"(zero814)); + svfloat32_t zero814 = svdup_n_f32(0); svfloat32_t v814 = svcmla_f32_x(pred_full, zero814, v2275, v798, 90); - svfloat32_t v126 = svmls_f32_x(pred_full, v2306, v119, v2251); - svfloat32_t v288 = svmls_f32_x(pred_full, v2316, v281, v2251); - svfloat32_t v450 = svmls_f32_x(pred_full, v2326, v443, v2251); - svfloat32_t v612 = svmls_f32_x(pred_full, v2336, v605, v2251); - svfloat32_t v774 = svmls_f32_x(pred_full, v2346, v767, v2251); + svfloat32_t v126 = svmls_f32_x(pred_full, v2330, v119, v2251); + svfloat32_t v288 = svmls_f32_x(pred_full, v2340, v281, v2251); + svfloat32_t v450 = svmls_f32_x(pred_full, v2350, v443, v2251); + svfloat32_t v612 = svmls_f32_x(pred_full, v2360, v605, v2251); + svfloat32_t v774 = svmls_f32_x(pred_full, v2370, v767, v2251); svfloat32_t v138 = svmls_f32_x(pred_full, v126, v120, v2253); svfloat32_t v300 = svmls_f32_x(pred_full, v288, v282, v2253); svfloat32_t v462 = svmls_f32_x(pred_full, v450, v444, v2253); @@ -18904,124 +11705,85 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v866 = svcmla_f32_x(pred_full, v799, v1992, v799, 90); svfloat32_t v886 = svcmla_f32_x(pred_full, v637, v1992, v637, 90); svfloat32_t v144 = svnmls_f32_x(pred_full, v138, v126, v2295); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v138), "w"(v166)); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v138, v166); svfloat32_t v306 = svnmls_f32_x(pred_full, v300, v288, v2295); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v300), "w"(v328)); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v300, v328); svfloat32_t v468 = svnmls_f32_x(pred_full, v462, v450, v2295); - svfloat32_t v491; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v462), "w"(v490)); + svfloat32_t 
v491 = svsub_f32_x(svptrue_b32(), v462, v490); svfloat32_t v630 = svnmls_f32_x(pred_full, v624, v612, v2295); - svfloat32_t v653; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v653) : "w"(v624), "w"(v652)); + svfloat32_t v653 = svsub_f32_x(svptrue_b32(), v624, v652); svfloat32_t v792 = svnmls_f32_x(pred_full, v786, v774, v2295); - svfloat32_t v815; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v815) : "w"(v786), "w"(v814)); - svfloat32_t v867; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v867) : "w"(v840), "w"(v866)); - svfloat32_t v887; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v887) : "w"(v853), "w"(v886)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v144), "w"(v158)); + svfloat32_t v815 = svsub_f32_x(svptrue_b32(), v786, v814); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v840, v866); + svfloat32_t v887 = svsub_f32_x(svptrue_b32(), v853, v886); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v144, v158); svfloat32_t v173 = svnmls_f32_x(pred_full, v167, v138, v2295); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v306), "w"(v320)); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v306, v320); svfloat32_t v335 = svnmls_f32_x(pred_full, v329, v300, v2295); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v468), "w"(v482)); + svfloat32_t v483 = svsub_f32_x(svptrue_b32(), v468, v482); svfloat32_t v497 = svnmls_f32_x(pred_full, v491, v462, v2295); - svfloat32_t v645; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v645) : "w"(v630), "w"(v644)); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v630, v644); svfloat32_t v659 = svnmls_f32_x(pred_full, v653, v624, v2295); - svfloat32_t v807; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v807) : "w"(v792), "w"(v806)); + svfloat32_t v807 = svsub_f32_x(svptrue_b32(), v792, v806); svfloat32_t v821 = svnmls_f32_x(pred_full, v815, v786, v2295); svfloat32_t v873 = svnmls_f32_x(pred_full, v867, v840, v2295); svfloat32_t v893 = svnmls_f32_x(pred_full, v887, v853, v2295); - svfloat32_t v1166; - asm("fmul %0.s, %1.s, 
%2.s" : "=w"(v1166) : "w"(v329), "w"(v2113)); - svfloat32_t v1179; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1179) : "w"(v491), "w"(v2241)); - svfloat32_t v1192; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1192) : "w"(v815), "w"(v2243)); - svfloat32_t v1212; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1212) : "w"(v653), "w"(v2179)); + svfloat32_t v1166 = svmul_f32_x(svptrue_b32(), v329, v2113); + svfloat32_t v1179 = svmul_f32_x(svptrue_b32(), v491, v2241); + svfloat32_t v1192 = svmul_f32_x(svptrue_b32(), v815, v2243); + svfloat32_t v1212 = svmul_f32_x(svptrue_b32(), v653, v2179); svfloat32_t v179 = svnmls_f32_x(pred_full, v159, v144, v2295); svfloat32_t v341 = svnmls_f32_x(pred_full, v321, v306, v2295); svfloat32_t v503 = svnmls_f32_x(pred_full, v483, v468, v2295); svfloat32_t v665 = svnmls_f32_x(pred_full, v645, v630, v2295); svfloat32_t v827 = svnmls_f32_x(pred_full, v807, v792, v2295); - svfloat32_t v894; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v894) : "w"(v873), "w"(v893)); - svfloat32_t v895; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v895) : "w"(v873), "w"(v893)); + svfloat32_t v894 = svadd_f32_x(svptrue_b32(), v873, v893); + svfloat32_t v895 = svsub_f32_x(svptrue_b32(), v873, v893); svfloat32_t v907 = svmla_f32_x(pred_full, v867, v887, v2255); svfloat32_t v925 = svnmls_f32_x(pred_full, v887, v867, v2255); - svfloat32_t v999; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v999) : "w"(v321), "w"(v2049)); - svfloat32_t v1012; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1012) : "w"(v483), "w"(v2113)); - svfloat32_t v1025; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1025) : "w"(v807), "w"(v2241)); - svfloat32_t v1045; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1045) : "w"(v645), "w"(v2177)); + svfloat32_t v999 = svmul_f32_x(svptrue_b32(), v321, v2049); + svfloat32_t v1012 = svmul_f32_x(svptrue_b32(), v483, v2113); + svfloat32_t v1025 = svmul_f32_x(svptrue_b32(), v807, v2241); + svfloat32_t v1045 = svmul_f32_x(svptrue_b32(), v645, v2177); svfloat32_t v1174 = svcmla_f32_x(pred_full, v1166, v2114, v329, 90); 
svfloat32_t v1187 = svcmla_f32_x(pred_full, v1179, v2242, v491, 90); svfloat32_t v1200 = svcmla_f32_x(pred_full, v1192, v2244, v815, 90); svfloat32_t v1220 = svcmla_f32_x(pred_full, v1212, v2180, v653, 90); - svfloat32_t v1333; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1333) : "w"(v335), "w"(v2177)); - svfloat32_t v1346; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1346) : "w"(v497), "w"(v2179)); - svfloat32_t v1359; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1359) : "w"(v821), "w"(v2248)); - svfloat32_t v1379; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1379) : "w"(v659), "w"(v2245)); - svfloat32_t v926; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v926) : "w"(v151), "w"(v894)); - svfloat32_t zero941; - asm volatile("mov %0.s, #0" : "=w"(zero941)); + svfloat32_t v1333 = svmul_f32_x(svptrue_b32(), v335, v2177); + svfloat32_t v1346 = svmul_f32_x(svptrue_b32(), v497, v2179); + svfloat32_t v1359 = svmul_f32_x(svptrue_b32(), v821, v2248); + svfloat32_t v1379 = svmul_f32_x(svptrue_b32(), v659, v2245); + svfloat32_t v926 = svadd_f32_x(svptrue_b32(), v151, v894); + svfloat32_t zero941 = svdup_n_f32(0); svfloat32_t v941 = svcmla_f32_x(pred_full, zero941, v2275, v907, 90); - svfloat32_t zero957; - asm volatile("mov %0.s, #0" : "=w"(zero957)); + svfloat32_t zero957 = svdup_n_f32(0); svfloat32_t v957 = svcmla_f32_x(pred_full, zero957, v2275, v925, 90); svfloat32_t v1007 = svcmla_f32_x(pred_full, v999, v2050, v321, 90); svfloat32_t v1020 = svcmla_f32_x(pred_full, v1012, v2114, v483, 90); svfloat32_t v1033 = svcmla_f32_x(pred_full, v1025, v2242, v807, 90); svfloat32_t v1053 = svcmla_f32_x(pred_full, v1045, v2178, v645, 90); - svfloat32_t v1201; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1201) : "w"(v1174), "w"(v1200)); - svfloat32_t v1221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1221) : "w"(v1187), "w"(v1220)); + svfloat32_t v1201 = svsub_f32_x(svptrue_b32(), v1174, v1200); + svfloat32_t v1221 = svsub_f32_x(svptrue_b32(), v1187, v1220); svfloat32_t v1341 = svcmla_f32_x(pred_full, v1333, v2178, v335, 90); 
svfloat32_t v1354 = svcmla_f32_x(pred_full, v1346, v2180, v497, 90); svfloat32_t v1367 = svcmla_f32_x(pred_full, v1359, v2249, v821, 90); svfloat32_t v1387 = svcmla_f32_x(pred_full, v1379, v2185, v659, 90); - svfloat32_t v1500; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1500) : "w"(v341), "w"(v2241)); - svfloat32_t v1513; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1513) : "w"(v503), "w"(v2243)); - svfloat32_t v1526; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1526) : "w"(v827), "w"(v2245)); - svfloat32_t v1546; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1546) : "w"(v665), "w"(v2248)); + svfloat32_t v1500 = svmul_f32_x(svptrue_b32(), v341, v2241); + svfloat32_t v1513 = svmul_f32_x(svptrue_b32(), v503, v2243); + svfloat32_t v1526 = svmul_f32_x(svptrue_b32(), v827, v2245); + svfloat32_t v1546 = svmul_f32_x(svptrue_b32(), v665, v2248); svfloat32_t v901 = svmls_f32_x(pred_full, v151, v894, v2251); svint16_t v929 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v926, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1034; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1034) : "w"(v1007), "w"(v1033)); - svfloat32_t v1054; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1054) : "w"(v1020), "w"(v1053)); + svfloat32_t v1034 = svsub_f32_x(svptrue_b32(), v1007, v1033); + svfloat32_t v1054 = svsub_f32_x(svptrue_b32(), v1020, v1053); svfloat32_t v1207 = svnmls_f32_x(pred_full, v1201, v1174, v2295); svfloat32_t v1227 = svnmls_f32_x(pred_full, v1221, v1187, v2295); - svfloat32_t v1368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1368) : "w"(v1341), "w"(v1367)); - svfloat32_t v1388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1388) : "w"(v1354), "w"(v1387)); + svfloat32_t v1368 = svsub_f32_x(svptrue_b32(), v1341, v1367); + svfloat32_t v1388 = svsub_f32_x(svptrue_b32(), v1354, v1387); svfloat32_t v1508 = svcmla_f32_x(pred_full, v1500, v2242, v341, 90); svfloat32_t v1521 = svcmla_f32_x(pred_full, v1513, v2244, v503, 90); 
svfloat32_t v1534 = svcmla_f32_x(pred_full, v1526, v2246, v827, 90); @@ -19029,59 +11791,43 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svfloat32_t v913 = svmls_f32_x(pred_full, v901, v895, v2253); svfloat32_t v1040 = svnmls_f32_x(pred_full, v1034, v1007, v2295); svfloat32_t v1060 = svnmls_f32_x(pred_full, v1054, v1020, v2295); - svfloat32_t v1228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1228) : "w"(v1207), "w"(v1227)); - svfloat32_t v1229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1229) : "w"(v1207), "w"(v1227)); + svfloat32_t v1228 = svadd_f32_x(svptrue_b32(), v1207, v1227); + svfloat32_t v1229 = svsub_f32_x(svptrue_b32(), v1207, v1227); svfloat32_t v1241 = svmla_f32_x(pred_full, v1201, v1221, v2255); svfloat32_t v1259 = svnmls_f32_x(pred_full, v1221, v1201, v2255); svfloat32_t v1374 = svnmls_f32_x(pred_full, v1368, v1341, v2295); svfloat32_t v1394 = svnmls_f32_x(pred_full, v1388, v1354, v2295); - svfloat32_t v1535; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1535) : "w"(v1508), "w"(v1534)); - svfloat32_t v1555; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1555) : "w"(v1521), "w"(v1554)); + svfloat32_t v1535 = svsub_f32_x(svptrue_b32(), v1508, v1534); + svfloat32_t v1555 = svsub_f32_x(svptrue_b32(), v1521, v1554); svst1w_u64(pred_full, (unsigned *)(v2006), svreinterpret_u64_s16(v929)); svfloat32_t v919 = svnmls_f32_x(pred_full, v913, v901, v2295); - svfloat32_t v958; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v958) : "w"(v913), "w"(v957)); - svfloat32_t v1061; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1061) : "w"(v1040), "w"(v1060)); - svfloat32_t v1062; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1062) : "w"(v1040), "w"(v1060)); + svfloat32_t v958 = svsub_f32_x(svptrue_b32(), v913, v957); + svfloat32_t v1061 = svadd_f32_x(svptrue_b32(), v1040, v1060); + svfloat32_t v1062 = svsub_f32_x(svptrue_b32(), v1040, v1060); svfloat32_t v1074 = svmla_f32_x(pred_full, v1034, v1054, v2255); svfloat32_t v1092 = svnmls_f32_x(pred_full, v1054, v1034, v2255); - svfloat32_t 
v1260; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1260) : "w"(v167), "w"(v1228)); - svfloat32_t zero1275; - asm volatile("mov %0.s, #0" : "=w"(zero1275)); + svfloat32_t v1260 = svadd_f32_x(svptrue_b32(), v167, v1228); + svfloat32_t zero1275 = svdup_n_f32(0); svfloat32_t v1275 = svcmla_f32_x(pred_full, zero1275, v2275, v1241, 90); - svfloat32_t zero1291; - asm volatile("mov %0.s, #0" : "=w"(zero1291)); + svfloat32_t zero1291 = svdup_n_f32(0); svfloat32_t v1291 = svcmla_f32_x(pred_full, zero1291, v2275, v1259, 90); - svfloat32_t v1395; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1395) : "w"(v1374), "w"(v1394)); - svfloat32_t v1396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1396) : "w"(v1374), "w"(v1394)); + svfloat32_t v1395 = svadd_f32_x(svptrue_b32(), v1374, v1394); + svfloat32_t v1396 = svsub_f32_x(svptrue_b32(), v1374, v1394); svfloat32_t v1408 = svmla_f32_x(pred_full, v1368, v1388, v2255); svfloat32_t v1426 = svnmls_f32_x(pred_full, v1388, v1368, v2255); svfloat32_t v1541 = svnmls_f32_x(pred_full, v1535, v1508, v2295); svfloat32_t v1561 = svnmls_f32_x(pred_full, v1555, v1521, v2295); - svfloat32_t v942; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v942) : "w"(v919), "w"(v941)); + svfloat32_t v942 = svsub_f32_x(svptrue_b32(), v919, v941); svint16_t v961 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v958, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v972 = svnmls_f32_x(pred_full, v958, v913, v2295); - svfloat32_t v1093; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1093) : "w"(v159), "w"(v1061)); - svfloat32_t zero1108; - asm volatile("mov %0.s, #0" : "=w"(zero1108)); + svfloat32_t v1093 = svadd_f32_x(svptrue_b32(), v159, v1061); + svfloat32_t zero1108 = svdup_n_f32(0); svfloat32_t v1108 = svcmla_f32_x(pred_full, zero1108, v2275, v1074, 90); - svfloat32_t zero1124; - asm volatile("mov %0.s, #0" : "=w"(zero1124)); + svfloat32_t zero1124 = svdup_n_f32(0); svfloat32_t v1124 = 
svcmla_f32_x(pred_full, zero1124, v2275, v1092, 90); svfloat32_t v1235 = svmls_f32_x(pred_full, v167, v1228, v2251); svint16_t v1263 = @@ -19090,18 +11836,13 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1260, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1427) : "w"(v173), "w"(v1395)); - svfloat32_t zero1442; - asm volatile("mov %0.s, #0" : "=w"(zero1442)); + svfloat32_t v1427 = svadd_f32_x(svptrue_b32(), v173, v1395); + svfloat32_t zero1442 = svdup_n_f32(0); svfloat32_t v1442 = svcmla_f32_x(pred_full, zero1442, v2275, v1408, 90); - svfloat32_t zero1458; - asm volatile("mov %0.s, #0" : "=w"(zero1458)); + svfloat32_t zero1458 = svdup_n_f32(0); svfloat32_t v1458 = svcmla_f32_x(pred_full, zero1458, v2275, v1426, 90); - svfloat32_t v1562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1562) : "w"(v1541), "w"(v1561)); - svfloat32_t v1563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1563) : "w"(v1541), "w"(v1561)); + svfloat32_t v1562 = svadd_f32_x(svptrue_b32(), v1541, v1561); + svfloat32_t v1563 = svsub_f32_x(svptrue_b32(), v1541, v1561); svfloat32_t v1575 = svmla_f32_x(pred_full, v1535, v1555, v2255); svfloat32_t v1593 = svnmls_f32_x(pred_full, v1555, v1535, v2255); svint16_t v945 = svtbl_s16( @@ -19130,13 +11871,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1427, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1594; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1594) : "w"(v179), "w"(v1562)); - svfloat32_t zero1609; - asm volatile("mov %0.s, #0" : "=w"(zero1609)); + svfloat32_t v1594 = svadd_f32_x(svptrue_b32(), v179, v1562); + svfloat32_t zero1609 = svdup_n_f32(0); svfloat32_t v1609 = svcmla_f32_x(pred_full, zero1609, v2275, v1575, 90); - svfloat32_t zero1625; - asm 
volatile("mov %0.s, #0" : "=w"(zero1625)); + svfloat32_t zero1625 = svdup_n_f32(0); svfloat32_t v1625 = svcmla_f32_x(pred_full, zero1625, v2275, v1593, 90); svst1w_u64(pred_full, (unsigned *)(v2026), svreinterpret_u64_s16(v961)); svst1w_u64(pred_full, (unsigned *)(v2134), svreinterpret_u64_s16(v1263)); @@ -19147,8 +11885,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v1080 = svmls_f32_x(pred_full, v1068, v1062, v2253); svfloat32_t v1253 = svnmls_f32_x(pred_full, v1247, v1235, v2295); - svfloat32_t v1292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1292) : "w"(v1247), "w"(v1291)); + svfloat32_t v1292 = svsub_f32_x(svptrue_b32(), v1247, v1291); svfloat32_t v1414 = svmls_f32_x(pred_full, v1402, v1396, v2253); svfloat32_t v1569 = svmls_f32_x(pred_full, v179, v1562, v2251); svint16_t v1597 = @@ -19162,10 +11899,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svst1w_u64(pred_full, (unsigned *)(v2070), svreinterpret_u64_s16(v1096)); svst1w_u64(pred_full, (unsigned *)(v2198), svreinterpret_u64_s16(v1430)); svfloat32_t v1086 = svnmls_f32_x(pred_full, v1080, v1068, v2295); - svfloat32_t v1125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1125) : "w"(v1080), "w"(v1124)); - svfloat32_t v1276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1276) : "w"(v1253), "w"(v1275)); + svfloat32_t v1125 = svsub_f32_x(svptrue_b32(), v1080, v1124); + svfloat32_t v1276 = svsub_f32_x(svptrue_b32(), v1253, v1275); svint16_t v1295 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -19174,13 +11909,11 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, 0x0000000000040004ULL))); svfloat32_t v1306 = svnmls_f32_x(pred_full, v1292, v1247, v2295); svfloat32_t v1420 = svnmls_f32_x(pred_full, v1414, v1402, v2295); - svfloat32_t v1459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1459) : "w"(v1414), "w"(v1458)); + svfloat32_t v1459 = 
svsub_f32_x(svptrue_b32(), v1414, v1458); svfloat32_t v1581 = svmls_f32_x(pred_full, v1569, v1563, v2253); svst1w_u64(pred_full, (unsigned *)(v2046), svreinterpret_u64_s16(v989)); svst1w_u64(pred_full, (unsigned *)(v2262), svreinterpret_u64_s16(v1597)); - svfloat32_t v1109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1109) : "w"(v1086), "w"(v1108)); + svfloat32_t v1109 = svsub_f32_x(svptrue_b32(), v1086, v1108); svint16_t v1128 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -19201,8 +11934,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v1320 = svnmls_f32_x(pred_full, v1276, v1253, v2295); - svfloat32_t v1443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1443) : "w"(v1420), "w"(v1442)); + svfloat32_t v1443 = svsub_f32_x(svptrue_b32(), v1420, v1442); svint16_t v1462 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -19211,8 +11943,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, 0x0000000000040004ULL))); svfloat32_t v1473 = svnmls_f32_x(pred_full, v1459, v1414, v2295); svfloat32_t v1587 = svnmls_f32_x(pred_full, v1581, v1569, v2295); - svfloat32_t v1626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1626) : "w"(v1581), "w"(v1625)); + svfloat32_t v1626 = svsub_f32_x(svptrue_b32(), v1581, v1625); svst1w_u64(pred_full, (unsigned *)(v2154), svreinterpret_u64_s16(v1295)); svint16_t v1112 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( @@ -19246,8 +11977,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu25(const armral_cmplx_f32_t *restrict x, svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v1487 = svnmls_f32_x(pred_full, v1443, v1420, v2295); - svfloat32_t v1610; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1610) : "w"(v1587), "w"(v1609)); + svfloat32_t v1610 = svsub_f32_x(svptrue_b32(), v1587, v1609); svint16_t v1629 = 
svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -19315,1009 +12045,478 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float v4 = dir; const float32x2_t *v5 = (const float32x2_t *)x; int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v1209 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v914 = 7.0710678118654757e-01F; - float v927 = -7.0710678118654746e-01F; - float v988 = 5.5557023301960229e-01F; - float v1005 = -1.9509032201612861e-01F; - float v1067 = 9.2387953251128674e-01F; - float v1075 = -9.2387953251128685e-01F; - float v1079 = 3.8268343236508967e-01F; - float v1080 = -3.8268343236508967e-01F; - float v1136 = 1.9509032201612833e-01F; - float v1140 = -9.8078528040323043e-01F; - float v1141 = 9.8078528040323043e-01F; - float v1149 = -5.5557023301960218e-01F; - float v1153 = 8.3146961230254524e-01F; - float v1154 = -8.3146961230254524e-01F; - float v1165 = -1.0000000000000000e+00F; - float v1166 = 1.0000000000000000e+00F; - float32x2_t v1168 = (float32x2_t){v4, v4}; - const float32x2_t *v2357 = &v5[istride]; - int32_t *v2538 = &v6[ostride]; - float32x2_t v693 = (float32x2_t){v1141, v1141}; - float32x2_t v767 = (float32x2_t){v1067, v1067}; - float32x2_t v772 = (float32x2_t){v1080, v1079}; - float32x2_t v841 = (float32x2_t){v1153, v1153}; - float32x2_t v846 = (float32x2_t){v1149, v988}; - float32x2_t v854 = (float32x2_t){v1005, v1005}; - float32x2_t v915 = (float32x2_t){v914, v914}; - float32x2_t v928 = (float32x2_t){v927, v927}; - float32x2_t v933 = (float32x2_t){v1166, v1165}; - float32x2_t v989 = (float32x2_t){v988, v988}; - float32x2_t v994 = (float32x2_t){v1154, v1153}; - float32x2_t v1002 = (float32x2_t){v1140, v1140}; - float32x2_t v1007 = (float32x2_t){v1005, v1136}; - float32x2_t v1063 = (float32x2_t){v1079, v1079}; - float32x2_t v1068 = (float32x2_t){v1075, v1067}; - float32x2_t v1076 = (float32x2_t){v1075, v1075}; - float32x2_t v1081 = (float32x2_t){v1079, v1080}; - 
float32x2_t v1137 = (float32x2_t){v1136, v1136}; - float32x2_t v1142 = (float32x2_t){v1140, v1141}; - float32x2_t v1150 = (float32x2_t){v1149, v1149}; - float32x2_t v1155 = (float32x2_t){v1153, v1154}; - float32x2_t v1167 = (float32x2_t){v1165, v1166}; - const float32x2_t *v2213 = &v5[0]; - int32_t *v2502 = &v6[0]; - float32x4_t v2817 = vld1q_f32((const float32_t *)v2357); - float32x4_t v694 = vcombine_f32(v693, v693); - float32x4_t v768 = vcombine_f32(v767, v767); - float32x2_t v774 = vmul_f32(v1168, v772); - float32x4_t v842 = vcombine_f32(v841, v841); - float32x2_t v848 = vmul_f32(v1168, v846); - float32x4_t v855 = vcombine_f32(v854, v854); - float32x4_t v916 = vcombine_f32(v915, v915); - float32x4_t v929 = vcombine_f32(v928, v928); - float32x2_t v935 = vmul_f32(v1168, v933); - float32x4_t v990 = vcombine_f32(v989, v989); - float32x2_t v996 = vmul_f32(v1168, v994); - float32x4_t v1003 = vcombine_f32(v1002, v1002); - float32x2_t v1009 = vmul_f32(v1168, v1007); - float32x4_t v1064 = vcombine_f32(v1063, v1063); - float32x2_t v1070 = vmul_f32(v1168, v1068); - float32x4_t v1077 = vcombine_f32(v1076, v1076); - float32x2_t v1083 = vmul_f32(v1168, v1081); - float32x4_t v1138 = vcombine_f32(v1137, v1137); - float32x2_t v1144 = vmul_f32(v1168, v1142); - float32x4_t v1151 = vcombine_f32(v1150, v1150); - float32x2_t v1157 = vmul_f32(v1168, v1155); - float32x2_t v1169 = vmul_f32(v1168, v1167); - const float32x2_t *v2222 = &v5[istride * 16]; - const float32x2_t *v2231 = &v5[istride * 8]; - const float32x2_t *v2240 = &v5[istride * 24]; - const float32x2_t *v2249 = &v5[istride * 4]; - const float32x2_t *v2258 = &v5[istride * 20]; - const float32x2_t *v2267 = &v5[istride * 12]; - const float32x2_t *v2276 = &v5[istride * 28]; - const float32x2_t *v2285 = &v5[istride * 2]; - const float32x2_t *v2294 = &v5[istride * 18]; - const float32x2_t *v2303 = &v5[istride * 10]; - const float32x2_t *v2312 = &v5[istride * 26]; - const float32x2_t *v2321 = &v5[istride * 6]; - const float32x2_t 
*v2330 = &v5[istride * 22]; - const float32x2_t *v2339 = &v5[istride * 14]; - const float32x2_t *v2348 = &v5[istride * 30]; - const float32x2_t *v2366 = &v5[istride * 17]; - const float32x2_t *v2375 = &v5[istride * 9]; - const float32x2_t *v2384 = &v5[istride * 25]; - const float32x2_t *v2393 = &v5[istride * 5]; - const float32x2_t *v2402 = &v5[istride * 21]; - const float32x2_t *v2411 = &v5[istride * 13]; - const float32x2_t *v2420 = &v5[istride * 29]; - const float32x2_t *v2429 = &v5[istride * 3]; - const float32x2_t *v2438 = &v5[istride * 19]; - const float32x2_t *v2447 = &v5[istride * 11]; - const float32x2_t *v2456 = &v5[istride * 27]; - const float32x2_t *v2465 = &v5[istride * 7]; - const float32x2_t *v2474 = &v5[istride * 23]; - const float32x2_t *v2483 = &v5[istride * 15]; - const float32x2_t *v2492 = &v5[istride * 31]; - int32_t *v2511 = &v6[ostride * 8]; - int32_t *v2520 = &v6[ostride * 16]; - int32_t *v2529 = &v6[ostride * 24]; - int32_t *v2547 = &v6[ostride * 9]; - int32_t *v2556 = &v6[ostride * 17]; - int32_t *v2565 = &v6[ostride * 25]; - int32_t *v2574 = &v6[ostride * 2]; - int32_t *v2583 = &v6[ostride * 10]; - int32_t *v2592 = &v6[ostride * 18]; - int32_t *v2601 = &v6[ostride * 26]; - int32_t *v2610 = &v6[ostride * 3]; - int32_t *v2619 = &v6[ostride * 11]; - int32_t *v2628 = &v6[ostride * 19]; - int32_t *v2637 = &v6[ostride * 27]; - int32_t *v2646 = &v6[ostride * 4]; - int32_t *v2655 = &v6[ostride * 12]; - int32_t *v2664 = &v6[ostride * 20]; - int32_t *v2673 = &v6[ostride * 28]; - int32_t *v2682 = &v6[ostride * 5]; - int32_t *v2691 = &v6[ostride * 13]; - int32_t *v2700 = &v6[ostride * 21]; - int32_t *v2709 = &v6[ostride * 29]; - int32_t *v2718 = &v6[ostride * 6]; - int32_t *v2727 = &v6[ostride * 14]; - int32_t *v2736 = &v6[ostride * 22]; - int32_t *v2745 = &v6[ostride * 30]; - int32_t *v2754 = &v6[ostride * 7]; - int32_t *v2763 = &v6[ostride * 15]; - int32_t *v2772 = &v6[ostride * 23]; - int32_t *v2781 = &v6[ostride * 31]; - float32x4_t v2785 = 
vld1q_f32((const float32_t *)v2213); - float32x4_t v776 = vcombine_f32(v774, v774); - float32x4_t v850 = vcombine_f32(v848, v848); - float32x4_t v937 = vcombine_f32(v935, v935); - float32x4_t v998 = vcombine_f32(v996, v996); - float32x4_t v1011 = vcombine_f32(v1009, v1009); - float32x4_t v1072 = vcombine_f32(v1070, v1070); - float32x4_t v1085 = vcombine_f32(v1083, v1083); - float32x4_t v1146 = vcombine_f32(v1144, v1144); - float32x4_t v1159 = vcombine_f32(v1157, v1157); - float32x4_t v1171 = vcombine_f32(v1169, v1169); - float32x4_t v2787 = vld1q_f32((const float32_t *)v2222); - float32x4_t v2789 = vld1q_f32((const float32_t *)v2231); - float32x4_t v2791 = vld1q_f32((const float32_t *)v2240); - float32x4_t v2793 = vld1q_f32((const float32_t *)v2249); - float32x4_t v2795 = vld1q_f32((const float32_t *)v2258); - float32x4_t v2797 = vld1q_f32((const float32_t *)v2267); - float32x4_t v2799 = vld1q_f32((const float32_t *)v2276); - float32x4_t v2801 = vld1q_f32((const float32_t *)v2285); - float32x4_t v2803 = vld1q_f32((const float32_t *)v2294); - float32x4_t v2805 = vld1q_f32((const float32_t *)v2303); - float32x4_t v2807 = vld1q_f32((const float32_t *)v2312); - float32x4_t v2809 = vld1q_f32((const float32_t *)v2321); - float32x4_t v2811 = vld1q_f32((const float32_t *)v2330); - float32x4_t v2813 = vld1q_f32((const float32_t *)v2339); - float32x4_t v2815 = vld1q_f32((const float32_t *)v2348); - float32x4_t v2819 = vld1q_f32((const float32_t *)v2366); - float32x4_t v2821 = vld1q_f32((const float32_t *)v2375); - float32x4_t v2823 = vld1q_f32((const float32_t *)v2384); - float32x4_t v2825 = vld1q_f32((const float32_t *)v2393); - float32x4_t v2827 = vld1q_f32((const float32_t *)v2402); - float32x4_t v2829 = vld1q_f32((const float32_t *)v2411); - float32x4_t v2831 = vld1q_f32((const float32_t *)v2420); - float32x4_t v2833 = vld1q_f32((const float32_t *)v2429); - float32x4_t v2835 = vld1q_f32((const float32_t *)v2438); - float32x4_t v2837 = vld1q_f32((const float32_t *)v2447); 
- float32x4_t v2839 = vld1q_f32((const float32_t *)v2456); - float32x4_t v2841 = vld1q_f32((const float32_t *)v2465); - float32x4_t v2843 = vld1q_f32((const float32_t *)v2474); - float32x4_t v2845 = vld1q_f32((const float32_t *)v2483); - float32x4_t v2847 = vld1q_f32((const float32_t *)v2492); - float32x4_t v35 = vaddq_f32(v2785, v2787); - float32x4_t v36 = vsubq_f32(v2785, v2787); - float32x4_t v51 = vaddq_f32(v2789, v2791); - float32x4_t v52 = vsubq_f32(v2789, v2791); - float32x4_t v79 = vaddq_f32(v2793, v2795); - float32x4_t v80 = vsubq_f32(v2793, v2795); - float32x4_t v95 = vaddq_f32(v2797, v2799); - float32x4_t v96 = vsubq_f32(v2797, v2799); - float32x4_t v167 = vaddq_f32(v2801, v2803); - float32x4_t v168 = vsubq_f32(v2801, v2803); - float32x4_t v183 = vaddq_f32(v2805, v2807); - float32x4_t v184 = vsubq_f32(v2805, v2807); - float32x4_t v211 = vaddq_f32(v2809, v2811); - float32x4_t v212 = vsubq_f32(v2809, v2811); - float32x4_t v227 = vaddq_f32(v2813, v2815); - float32x4_t v228 = vsubq_f32(v2813, v2815); - float32x4_t v395 = vaddq_f32(v2817, v2819); - float32x4_t v396 = vsubq_f32(v2817, v2819); - float32x4_t v411 = vaddq_f32(v2821, v2823); - float32x4_t v412 = vsubq_f32(v2821, v2823); - float32x4_t v439 = vaddq_f32(v2825, v2827); - float32x4_t v440 = vsubq_f32(v2825, v2827); - float32x4_t v455 = vaddq_f32(v2829, v2831); - float32x4_t v456 = vsubq_f32(v2829, v2831); - float32x4_t v527 = vaddq_f32(v2833, v2835); - float32x4_t v528 = vsubq_f32(v2833, v2835); - float32x4_t v543 = vaddq_f32(v2837, v2839); - float32x4_t v544 = vsubq_f32(v2837, v2839); - float32x4_t v571 = vaddq_f32(v2841, v2843); - float32x4_t v572 = vsubq_f32(v2841, v2843); - float32x4_t v587 = vaddq_f32(v2845, v2847); - float32x4_t v588 = vsubq_f32(v2845, v2847); - float32x4_t v58 = vrev64q_f32(v52); - float32x4_t v61 = vaddq_f32(v35, v51); - float32x4_t v62 = vsubq_f32(v35, v51); - float32x4_t v97 = vaddq_f32(v79, v95); - float32x4_t v98 = vsubq_f32(v79, v95); - float32x4_t v115 = vmulq_f32(v80, 
v916); - float32x4_t v128 = vmulq_f32(v96, v929); - float32x4_t v190 = vrev64q_f32(v184); - float32x4_t v193 = vaddq_f32(v167, v183); - float32x4_t v194 = vsubq_f32(v167, v183); - float32x4_t v234 = vrev64q_f32(v228); - float32x4_t v237 = vaddq_f32(v211, v227); - float32x4_t v238 = vsubq_f32(v211, v227); - float32x4_t v418 = vrev64q_f32(v412); - float32x4_t v421 = vaddq_f32(v395, v411); - float32x4_t v422 = vsubq_f32(v395, v411); - float32x4_t v457 = vaddq_f32(v439, v455); - float32x4_t v458 = vsubq_f32(v439, v455); - float32x4_t v475 = vmulq_f32(v440, v916); - float32x4_t v488 = vmulq_f32(v456, v929); - float32x4_t v550 = vrev64q_f32(v544); - float32x4_t v553 = vaddq_f32(v527, v543); - float32x4_t v554 = vsubq_f32(v527, v543); - float32x4_t v589 = vaddq_f32(v571, v587); - float32x4_t v590 = vsubq_f32(v571, v587); - float32x4_t v607 = vmulq_f32(v572, v916); - float32x4_t v620 = vmulq_f32(v588, v929); - float32x4_t v60 = vmulq_f32(v58, v937); - float32x4_t v104 = vrev64q_f32(v98); - float32x4_t v107 = vaddq_f32(v61, v97); - float32x4_t v108 = vsubq_f32(v61, v97); - float32x4_t v121 = vrev64q_f32(v115); - float32x4_t v134 = vrev64q_f32(v128); - float32x4_t v192 = vmulq_f32(v190, v937); - float32x4_t v236 = vmulq_f32(v234, v937); - float32x4_t v241 = vaddq_f32(v193, v237); - float32x4_t v242 = vsubq_f32(v193, v237); - float32x4_t v301 = vmulq_f32(v194, v916); - float32x4_t v314 = vmulq_f32(v238, v929); - float32x4_t v420 = vmulq_f32(v418, v937); - float32x4_t v464 = vrev64q_f32(v458); - float32x4_t v467 = vaddq_f32(v421, v457); - float32x4_t v468 = vsubq_f32(v421, v457); - float32x4_t v481 = vrev64q_f32(v475); - float32x4_t v494 = vrev64q_f32(v488); - float32x4_t v552 = vmulq_f32(v550, v937); - float32x4_t v596 = vrev64q_f32(v590); - float32x4_t v599 = vaddq_f32(v553, v589); - float32x4_t v600 = vsubq_f32(v553, v589); - float32x4_t v613 = vrev64q_f32(v607); - float32x4_t v626 = vrev64q_f32(v620); - float32x4_t v63 = vsubq_f32(v36, v60); - float32x4_t v64 = 
vaddq_f32(v36, v60); - float32x4_t v106 = vmulq_f32(v104, v937); - float32x4_t v123 = vmulq_f32(v121, v1171); - float32x4_t v136 = vmulq_f32(v134, v937); - float32x4_t v195 = vsubq_f32(v168, v192); - float32x4_t v196 = vaddq_f32(v168, v192); - float32x4_t v239 = vsubq_f32(v212, v236); - float32x4_t v240 = vaddq_f32(v212, v236); - float32x4_t v248 = vrev64q_f32(v242); - float32x4_t v251 = vaddq_f32(v107, v241); - float32x4_t v252 = vsubq_f32(v107, v241); - float32x4_t v307 = vrev64q_f32(v301); - float32x4_t v320 = vrev64q_f32(v314); - float32x4_t v423 = vsubq_f32(v396, v420); - float32x4_t v424 = vaddq_f32(v396, v420); - float32x4_t v466 = vmulq_f32(v464, v937); - float32x4_t v483 = vmulq_f32(v481, v1171); - float32x4_t v496 = vmulq_f32(v494, v937); - float32x4_t v555 = vsubq_f32(v528, v552); - float32x4_t v556 = vaddq_f32(v528, v552); - float32x4_t v598 = vmulq_f32(v596, v937); - float32x4_t v615 = vmulq_f32(v613, v1171); - float32x4_t v628 = vmulq_f32(v626, v937); - float32x4_t v645 = vaddq_f32(v467, v599); - float32x4_t v646 = vsubq_f32(v467, v599); - float32x4_t v917 = vmulq_f32(v468, v916); - float32x4_t v930 = vmulq_f32(v600, v929); - float32x4_t v109 = vsubq_f32(v62, v106); - float32x4_t v110 = vaddq_f32(v62, v106); - float32x4_t v137 = vaddq_f32(v115, v123); - float32x4_t v138 = vaddq_f32(v128, v136); - float32x4_t v250 = vmulq_f32(v248, v937); - float32x4_t v259 = vmulq_f32(v195, v768); - float32x4_t v265 = vrev64q_f32(v195); - float32x4_t v272 = vmulq_f32(v239, v1064); - float32x4_t v278 = vrev64q_f32(v239); - float32x4_t v309 = vmulq_f32(v307, v1171); - float32x4_t v322 = vmulq_f32(v320, v937); - float32x4_t v343 = vmulq_f32(v196, v1064); - float32x4_t v349 = vrev64q_f32(v196); - float32x4_t v356 = vmulq_f32(v240, v1077); - float32x4_t v362 = vrev64q_f32(v240); - float32x4_t v469 = vsubq_f32(v422, v466); - float32x4_t v470 = vaddq_f32(v422, v466); - float32x4_t v497 = vaddq_f32(v475, v483); - float32x4_t v498 = vaddq_f32(v488, v496); - float32x4_t v601 = 
vsubq_f32(v554, v598); - float32x4_t v602 = vaddq_f32(v554, v598); - float32x4_t v629 = vaddq_f32(v607, v615); - float32x4_t v630 = vaddq_f32(v620, v628); - float32x4_t v652 = vrev64q_f32(v646); - float32x4_t v655 = vaddq_f32(v251, v645); - float32x4_t v656 = vsubq_f32(v251, v645); - float32x4_t v923 = vrev64q_f32(v917); - float32x4_t v936 = vrev64q_f32(v930); - float32x4_t v139 = vaddq_f32(v137, v138); - float32x4_t v140 = vsubq_f32(v138, v137); - float32x4_t v253 = vsubq_f32(v108, v250); - float32x4_t v254 = vaddq_f32(v108, v250); - float32x4_t v323 = vaddq_f32(v301, v309); - float32x4_t v324 = vaddq_f32(v314, v322); - float32x4_t v499 = vaddq_f32(v497, v498); - float32x4_t v500 = vsubq_f32(v498, v497); - float32x4_t v631 = vaddq_f32(v629, v630); - float32x4_t v632 = vsubq_f32(v630, v629); - float32x4_t v654 = vmulq_f32(v652, v937); - int16x4_t v661 = vqmovn_s32(vcvtq_n_s32_f32(v655, 15)); - int16x4_t v677 = vqmovn_s32(vcvtq_n_s32_f32(v656, 15)); - float32x4_t v769 = vmulq_f32(v469, v768); - float32x4_t v775 = vrev64q_f32(v469); - float32x4_t v782 = vmulq_f32(v601, v1064); - float32x4_t v788 = vrev64q_f32(v601); - float32x4_t v925 = vmulq_f32(v923, v1171); - float32x4_t v938 = vmulq_f32(v936, v937); - float32x4_t v1065 = vmulq_f32(v470, v1064); - float32x4_t v1071 = vrev64q_f32(v470); - float32x4_t v1078 = vmulq_f32(v602, v1077); - float32x4_t v1084 = vrev64q_f32(v602); - float32x4_t v146 = vrev64q_f32(v140); - float32x4_t v149 = vaddq_f32(v63, v139); - float32x4_t v150 = vsubq_f32(v63, v139); - float32x4_t v281 = vfmaq_f32(v259, v265, v776); - float32x4_t v282 = vfmaq_f32(v272, v278, v1072); - float32x4_t v325 = vaddq_f32(v323, v324); - float32x4_t v326 = vsubq_f32(v324, v323); - float32x4_t v365 = vfmaq_f32(v343, v349, v1072); - float32x4_t v366 = vfmaq_f32(v356, v362, v1085); - float32x4_t v506 = vrev64q_f32(v500); - float32x4_t v509 = vaddq_f32(v423, v499); - float32x4_t v510 = vsubq_f32(v423, v499); - float32x4_t v638 = vrev64q_f32(v632); - float32x4_t v641 
= vaddq_f32(v555, v631); - float32x4_t v642 = vsubq_f32(v555, v631); - float32x4_t v657 = vsubq_f32(v252, v654); - float32x4_t v658 = vaddq_f32(v252, v654); - float32x4_t v939 = vaddq_f32(v917, v925); - float32x4_t v940 = vaddq_f32(v930, v938); - vst1_s16((int16_t *)v2502, v661); - vst1_s16((int16_t *)v2520, v677); - float32x4_t v148 = vmulq_f32(v146, v1171); - float32x4_t v283 = vaddq_f32(v281, v282); - float32x4_t v284 = vsubq_f32(v282, v281); - float32x4_t v332 = vrev64q_f32(v326); - float32x4_t v335 = vaddq_f32(v109, v325); - float32x4_t v336 = vsubq_f32(v109, v325); - float32x4_t v367 = vaddq_f32(v365, v366); - float32x4_t v368 = vsubq_f32(v366, v365); - float32x4_t v508 = vmulq_f32(v506, v1171); - float32x4_t v640 = vmulq_f32(v638, v1171); - int16x4_t v669 = vqmovn_s32(vcvtq_n_s32_f32(v657, 15)); - int16x4_t v685 = vqmovn_s32(vcvtq_n_s32_f32(v658, 15)); - float32x4_t v695 = vmulq_f32(v509, v694); - float32x4_t v701 = vrev64q_f32(v509); - float32x4_t v708 = vmulq_f32(v641, v842); - float32x4_t v714 = vrev64q_f32(v641); - float32x4_t v791 = vfmaq_f32(v769, v775, v776); - float32x4_t v792 = vfmaq_f32(v782, v788, v1072); - float32x4_t v941 = vaddq_f32(v939, v940); - float32x4_t v942 = vsubq_f32(v940, v939); - float32x4_t v991 = vmulq_f32(v510, v990); - float32x4_t v997 = vrev64q_f32(v510); - float32x4_t v1004 = vmulq_f32(v642, v1003); - float32x4_t v1010 = vrev64q_f32(v642); - float32x4_t v1087 = vfmaq_f32(v1065, v1071, v1072); - float32x4_t v1088 = vfmaq_f32(v1078, v1084, v1085); - float32x4_t v151 = vsubq_f32(v64, v148); - float32x4_t v152 = vaddq_f32(v64, v148); - float32x4_t v290 = vrev64q_f32(v284); - float32x4_t v293 = vaddq_f32(v149, v283); - float32x4_t v294 = vsubq_f32(v149, v283); - float32x4_t v334 = vmulq_f32(v332, v1171); - float32x4_t v374 = vrev64q_f32(v368); - float32x4_t v511 = vsubq_f32(v424, v508); - float32x4_t v512 = vaddq_f32(v424, v508); - float32x4_t v643 = vsubq_f32(v556, v640); - float32x4_t v644 = vaddq_f32(v556, v640); - float32x4_t 
v793 = vaddq_f32(v791, v792); - float32x4_t v794 = vsubq_f32(v792, v791); - float32x4_t v948 = vrev64q_f32(v942); - float32x4_t v951 = vaddq_f32(v253, v941); - float32x4_t v952 = vsubq_f32(v253, v941); - float32x4_t v1089 = vaddq_f32(v1087, v1088); - float32x4_t v1090 = vsubq_f32(v1088, v1087); - vst1_s16((int16_t *)v2511, v669); - vst1_s16((int16_t *)v2529, v685); - float32x4_t v292 = vmulq_f32(v290, v1171); - float32x4_t v337 = vsubq_f32(v110, v334); - float32x4_t v338 = vaddq_f32(v110, v334); - float32x4_t v376 = vmulq_f32(v374, v1171); - float32x4_t v377 = vaddq_f32(v151, v367); - float32x4_t v378 = vsubq_f32(v151, v367); - float32x4_t v717 = vfmaq_f32(v695, v701, v1011); - float32x4_t v718 = vfmaq_f32(v708, v714, v850); - float32x4_t v800 = vrev64q_f32(v794); - float32x4_t v803 = vaddq_f32(v335, v793); - float32x4_t v804 = vsubq_f32(v335, v793); - float32x4_t v843 = vmulq_f32(v511, v842); - float32x4_t v849 = vrev64q_f32(v511); - float32x4_t v856 = vmulq_f32(v643, v855); - float32x4_t v862 = vrev64q_f32(v643); - float32x4_t v950 = vmulq_f32(v948, v1171); - int16x4_t v957 = vqmovn_s32(vcvtq_n_s32_f32(v951, 15)); - int16x4_t v973 = vqmovn_s32(vcvtq_n_s32_f32(v952, 15)); - float32x4_t v1013 = vfmaq_f32(v991, v997, v998); - float32x4_t v1014 = vfmaq_f32(v1004, v1010, v1011); - float32x4_t v1096 = vrev64q_f32(v1090); - float32x4_t v1139 = vmulq_f32(v512, v1138); - float32x4_t v1145 = vrev64q_f32(v512); - float32x4_t v1152 = vmulq_f32(v644, v1151); - float32x4_t v1158 = vrev64q_f32(v644); - float32x4_t v295 = vsubq_f32(v150, v292); - float32x4_t v296 = vaddq_f32(v150, v292); - float32x4_t v379 = vsubq_f32(v152, v376); - float32x4_t v380 = vaddq_f32(v152, v376); - float32x4_t v719 = vaddq_f32(v717, v718); - float32x4_t v720 = vsubq_f32(v718, v717); - float32x4_t v802 = vmulq_f32(v800, v1171); - int16x4_t v809 = vqmovn_s32(vcvtq_n_s32_f32(v803, 15)); - int16x4_t v825 = vqmovn_s32(vcvtq_n_s32_f32(v804, 15)); - float32x4_t v953 = vsubq_f32(v254, v950); - float32x4_t 
v954 = vaddq_f32(v254, v950); - float32x4_t v1015 = vaddq_f32(v1013, v1014); - float32x4_t v1016 = vsubq_f32(v1014, v1013); - float32x4_t v1098 = vmulq_f32(v1096, v1171); - float32x4_t v1099 = vaddq_f32(v337, v1089); - float32x4_t v1100 = vsubq_f32(v337, v1089); - vst1_s16((int16_t *)v2646, v957); - vst1_s16((int16_t *)v2664, v973); - float32x4_t v726 = vrev64q_f32(v720); - float32x4_t v729 = vaddq_f32(v293, v719); - float32x4_t v730 = vsubq_f32(v293, v719); - float32x4_t v805 = vsubq_f32(v336, v802); - float32x4_t v806 = vaddq_f32(v336, v802); - float32x4_t v865 = vfmaq_f32(v843, v849, v850); - float32x4_t v866 = vfmaq_f32(v856, v862, v1146); - int16x4_t v965 = vqmovn_s32(vcvtq_n_s32_f32(v953, 15)); - int16x4_t v981 = vqmovn_s32(vcvtq_n_s32_f32(v954, 15)); - float32x4_t v1022 = vrev64q_f32(v1016); - float32x4_t v1025 = vaddq_f32(v295, v1015); - float32x4_t v1026 = vsubq_f32(v295, v1015); - float32x4_t v1101 = vsubq_f32(v338, v1098); - float32x4_t v1102 = vaddq_f32(v338, v1098); - int16x4_t v1105 = vqmovn_s32(vcvtq_n_s32_f32(v1099, 15)); - int16x4_t v1121 = vqmovn_s32(vcvtq_n_s32_f32(v1100, 15)); - float32x4_t v1161 = vfmaq_f32(v1139, v1145, v1146); - float32x4_t v1162 = vfmaq_f32(v1152, v1158, v1159); - vst1_s16((int16_t *)v2574, v809); - vst1_s16((int16_t *)v2592, v825); - float32x4_t v728 = vmulq_f32(v726, v1171); - int16x4_t v735 = vqmovn_s32(vcvtq_n_s32_f32(v729, 15)); - int16x4_t v751 = vqmovn_s32(vcvtq_n_s32_f32(v730, 15)); - int16x4_t v817 = vqmovn_s32(vcvtq_n_s32_f32(v805, 15)); - int16x4_t v833 = vqmovn_s32(vcvtq_n_s32_f32(v806, 15)); - float32x4_t v867 = vaddq_f32(v865, v866); - float32x4_t v868 = vsubq_f32(v866, v865); - float32x4_t v1024 = vmulq_f32(v1022, v1171); - int16x4_t v1031 = vqmovn_s32(vcvtq_n_s32_f32(v1025, 15)); - int16x4_t v1047 = vqmovn_s32(vcvtq_n_s32_f32(v1026, 15)); - int16x4_t v1113 = vqmovn_s32(vcvtq_n_s32_f32(v1101, 15)); - int16x4_t v1129 = vqmovn_s32(vcvtq_n_s32_f32(v1102, 15)); - float32x4_t v1163 = vaddq_f32(v1161, v1162); - 
float32x4_t v1164 = vsubq_f32(v1162, v1161); - vst1_s16((int16_t *)v2655, v965); - vst1_s16((int16_t *)v2673, v981); - vst1_s16((int16_t *)v2718, v1105); - vst1_s16((int16_t *)v2736, v1121); - float32x4_t v731 = vsubq_f32(v294, v728); - float32x4_t v732 = vaddq_f32(v294, v728); - float32x4_t v874 = vrev64q_f32(v868); - float32x4_t v877 = vaddq_f32(v377, v867); - float32x4_t v878 = vsubq_f32(v377, v867); - float32x4_t v1027 = vsubq_f32(v296, v1024); - float32x4_t v1028 = vaddq_f32(v296, v1024); - float32x4_t v1170 = vrev64q_f32(v1164); - float32x4_t v1173 = vaddq_f32(v379, v1163); - float32x4_t v1174 = vsubq_f32(v379, v1163); - vst1_s16((int16_t *)v2538, v735); - vst1_s16((int16_t *)v2556, v751); - vst1_s16((int16_t *)v2583, v817); - vst1_s16((int16_t *)v2601, v833); - vst1_s16((int16_t *)v2682, v1031); - vst1_s16((int16_t *)v2700, v1047); - vst1_s16((int16_t *)v2727, v1113); - vst1_s16((int16_t *)v2745, v1129); - int16x4_t v743 = vqmovn_s32(vcvtq_n_s32_f32(v731, 15)); - int16x4_t v759 = vqmovn_s32(vcvtq_n_s32_f32(v732, 15)); - float32x4_t v876 = vmulq_f32(v874, v1171); - int16x4_t v883 = vqmovn_s32(vcvtq_n_s32_f32(v877, 15)); - int16x4_t v899 = vqmovn_s32(vcvtq_n_s32_f32(v878, 15)); - int16x4_t v1039 = vqmovn_s32(vcvtq_n_s32_f32(v1027, 15)); - int16x4_t v1055 = vqmovn_s32(vcvtq_n_s32_f32(v1028, 15)); - float32x4_t v1172 = vmulq_f32(v1170, v1171); - int16x4_t v1179 = vqmovn_s32(vcvtq_n_s32_f32(v1173, 15)); - int16x4_t v1195 = vqmovn_s32(vcvtq_n_s32_f32(v1174, 15)); - float32x4_t v879 = vsubq_f32(v378, v876); - float32x4_t v880 = vaddq_f32(v378, v876); - float32x4_t v1175 = vsubq_f32(v380, v1172); - float32x4_t v1176 = vaddq_f32(v380, v1172); - vst1_s16((int16_t *)v2547, v743); - vst1_s16((int16_t *)v2565, v759); - vst1_s16((int16_t *)v2610, v883); - vst1_s16((int16_t *)v2628, v899); - vst1_s16((int16_t *)v2691, v1039); - vst1_s16((int16_t *)v2709, v1055); - vst1_s16((int16_t *)v2754, v1179); - vst1_s16((int16_t *)v2772, v1195); - int16x4_t v891 = 
vqmovn_s32(vcvtq_n_s32_f32(v879, 15)); - int16x4_t v907 = vqmovn_s32(vcvtq_n_s32_f32(v880, 15)); - int16x4_t v1187 = vqmovn_s32(vcvtq_n_s32_f32(v1175, 15)); - int16x4_t v1203 = vqmovn_s32(vcvtq_n_s32_f32(v1176, 15)); - vst1_s16((int16_t *)v2619, v891); - vst1_s16((int16_t *)v2637, v907); - vst1_s16((int16_t *)v2763, v1187); - vst1_s16((int16_t *)v2781, v1203); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v1209 * 2; j < howmany; j += 1) { - float32x2_t v1524 = v5[istride]; - float v1959 = 7.0710678118654757e-01F; - float v1970 = -7.0710678118654746e-01F; - float v2020 = 5.5557023301960229e-01F; - float v2034 = -1.9509032201612861e-01F; - float v2085 = 9.2387953251128674e-01F; - float v2092 = -9.2387953251128685e-01F; - float v2095 = 3.8268343236508967e-01F; - float v2096 = -3.8268343236508967e-01F; - float v2142 = 1.9509032201612833e-01F; - float v2145 = -9.8078528040323043e-01F; - float v2146 = 9.8078528040323043e-01F; - float v2153 = -5.5557023301960218e-01F; - float v2156 = 8.3146961230254524e-01F; - float v2157 = -8.3146961230254524e-01F; - float v2167 = -1.0000000000000000e+00F; - float v2168 = 1.0000000000000000e+00F; - float32x2_t v2170 = (float32x2_t){v4, v4}; - float32x2_t v1221 = v5[0]; - float32x2_t v1777 = (float32x2_t){v2146, v2146}; - float32x2_t v1838 = (float32x2_t){v2085, v2085}; - float32x2_t v1842 = (float32x2_t){v2096, v2095}; - float32x2_t v1899 = (float32x2_t){v2156, v2156}; - float32x2_t v1903 = (float32x2_t){v2153, v2020}; - float32x2_t v1910 = (float32x2_t){v2034, v2034}; - float32x2_t v1960 = (float32x2_t){v1959, v1959}; - float32x2_t v1971 = (float32x2_t){v1970, v1970}; - float32x2_t v1975 = (float32x2_t){v2168, v2167}; - float32x2_t v2021 = (float32x2_t){v2020, v2020}; - float32x2_t v2025 = (float32x2_t){v2157, v2156}; - float32x2_t v2032 = (float32x2_t){v2145, v2145}; - float32x2_t v2036 = (float32x2_t){v2034, v2142}; - float32x2_t v2082 = (float32x2_t){v2095, v2095}; - float32x2_t v2086 = (float32x2_t){v2092, v2085}; - float32x2_t 
v2093 = (float32x2_t){v2092, v2092}; - float32x2_t v2097 = (float32x2_t){v2095, v2096}; - float32x2_t v2143 = (float32x2_t){v2142, v2142}; - float32x2_t v2147 = (float32x2_t){v2145, v2146}; - float32x2_t v2154 = (float32x2_t){v2153, v2153}; - float32x2_t v2158 = (float32x2_t){v2156, v2157}; - float32x2_t v2169 = (float32x2_t){v2167, v2168}; - float32x2_t v1226 = v5[istride * 16]; - float32x2_t v1233 = v5[istride * 8]; - float32x2_t v1238 = v5[istride * 24]; - float32x2_t v1256 = v5[istride * 4]; - float32x2_t v1261 = v5[istride * 20]; - float32x2_t v1268 = v5[istride * 12]; - float32x2_t v1273 = v5[istride * 28]; - float32x2_t v1330 = v5[istride * 2]; - float32x2_t v1335 = v5[istride * 18]; - float32x2_t v1342 = v5[istride * 10]; - float32x2_t v1347 = v5[istride * 26]; - float32x2_t v1365 = v5[istride * 6]; - float32x2_t v1370 = v5[istride * 22]; - float32x2_t v1377 = v5[istride * 14]; - float32x2_t v1382 = v5[istride * 30]; - float32x2_t v1529 = v5[istride * 17]; - float32x2_t v1536 = v5[istride * 9]; - float32x2_t v1541 = v5[istride * 25]; - float32x2_t v1559 = v5[istride * 5]; - float32x2_t v1564 = v5[istride * 21]; - float32x2_t v1571 = v5[istride * 13]; - float32x2_t v1576 = v5[istride * 29]; - float32x2_t v1633 = v5[istride * 3]; - float32x2_t v1638 = v5[istride * 19]; - float32x2_t v1645 = v5[istride * 11]; - float32x2_t v1650 = v5[istride * 27]; - float32x2_t v1668 = v5[istride * 7]; - float32x2_t v1673 = v5[istride * 23]; - float32x2_t v1680 = v5[istride * 15]; - float32x2_t v1685 = v5[istride * 31]; - float32x2_t v1844 = vmul_f32(v2170, v1842); - float32x2_t v1905 = vmul_f32(v2170, v1903); - float32x2_t v1977 = vmul_f32(v2170, v1975); - float32x2_t v2027 = vmul_f32(v2170, v2025); - float32x2_t v2038 = vmul_f32(v2170, v2036); - float32x2_t v2088 = vmul_f32(v2170, v2086); - float32x2_t v2099 = vmul_f32(v2170, v2097); - float32x2_t v2149 = vmul_f32(v2170, v2147); - float32x2_t v2160 = vmul_f32(v2170, v2158); - float32x2_t v2171 = vmul_f32(v2170, v2169); - 
float32x2_t v1227 = vadd_f32(v1221, v1226); - float32x2_t v1228 = vsub_f32(v1221, v1226); - float32x2_t v1239 = vadd_f32(v1233, v1238); - float32x2_t v1240 = vsub_f32(v1233, v1238); - float32x2_t v1262 = vadd_f32(v1256, v1261); - float32x2_t v1263 = vsub_f32(v1256, v1261); - float32x2_t v1274 = vadd_f32(v1268, v1273); - float32x2_t v1275 = vsub_f32(v1268, v1273); - float32x2_t v1336 = vadd_f32(v1330, v1335); - float32x2_t v1337 = vsub_f32(v1330, v1335); - float32x2_t v1348 = vadd_f32(v1342, v1347); - float32x2_t v1349 = vsub_f32(v1342, v1347); - float32x2_t v1371 = vadd_f32(v1365, v1370); - float32x2_t v1372 = vsub_f32(v1365, v1370); - float32x2_t v1383 = vadd_f32(v1377, v1382); - float32x2_t v1384 = vsub_f32(v1377, v1382); - float32x2_t v1530 = vadd_f32(v1524, v1529); - float32x2_t v1531 = vsub_f32(v1524, v1529); - float32x2_t v1542 = vadd_f32(v1536, v1541); - float32x2_t v1543 = vsub_f32(v1536, v1541); - float32x2_t v1565 = vadd_f32(v1559, v1564); - float32x2_t v1566 = vsub_f32(v1559, v1564); - float32x2_t v1577 = vadd_f32(v1571, v1576); - float32x2_t v1578 = vsub_f32(v1571, v1576); - float32x2_t v1639 = vadd_f32(v1633, v1638); - float32x2_t v1640 = vsub_f32(v1633, v1638); - float32x2_t v1651 = vadd_f32(v1645, v1650); - float32x2_t v1652 = vsub_f32(v1645, v1650); - float32x2_t v1674 = vadd_f32(v1668, v1673); - float32x2_t v1675 = vsub_f32(v1668, v1673); - float32x2_t v1686 = vadd_f32(v1680, v1685); - float32x2_t v1687 = vsub_f32(v1680, v1685); - float32x2_t v1246 = vrev64_f32(v1240); - float32x2_t v1248 = vadd_f32(v1227, v1239); - float32x2_t v1249 = vsub_f32(v1227, v1239); - float32x2_t v1276 = vadd_f32(v1262, v1274); - float32x2_t v1277 = vsub_f32(v1262, v1274); - float32x2_t v1292 = vmul_f32(v1263, v1960); - float32x2_t v1303 = vmul_f32(v1275, v1971); - float32x2_t v1355 = vrev64_f32(v1349); - float32x2_t v1357 = vadd_f32(v1336, v1348); - float32x2_t v1358 = vsub_f32(v1336, v1348); - float32x2_t v1390 = vrev64_f32(v1384); - float32x2_t v1392 = vadd_f32(v1371, 
v1383); - float32x2_t v1393 = vsub_f32(v1371, v1383); - float32x2_t v1549 = vrev64_f32(v1543); - float32x2_t v1551 = vadd_f32(v1530, v1542); - float32x2_t v1552 = vsub_f32(v1530, v1542); - float32x2_t v1579 = vadd_f32(v1565, v1577); - float32x2_t v1580 = vsub_f32(v1565, v1577); - float32x2_t v1595 = vmul_f32(v1566, v1960); - float32x2_t v1606 = vmul_f32(v1578, v1971); - float32x2_t v1658 = vrev64_f32(v1652); - float32x2_t v1660 = vadd_f32(v1639, v1651); - float32x2_t v1661 = vsub_f32(v1639, v1651); - float32x2_t v1688 = vadd_f32(v1674, v1686); - float32x2_t v1689 = vsub_f32(v1674, v1686); - float32x2_t v1704 = vmul_f32(v1675, v1960); - float32x2_t v1715 = vmul_f32(v1687, v1971); - float32x2_t v1247 = vmul_f32(v1246, v1977); - float32x2_t v1283 = vrev64_f32(v1277); - float32x2_t v1285 = vadd_f32(v1248, v1276); - float32x2_t v1286 = vsub_f32(v1248, v1276); - float32x2_t v1298 = vrev64_f32(v1292); - float32x2_t v1309 = vrev64_f32(v1303); - float32x2_t v1356 = vmul_f32(v1355, v1977); - float32x2_t v1391 = vmul_f32(v1390, v1977); - float32x2_t v1396 = vadd_f32(v1357, v1392); - float32x2_t v1397 = vsub_f32(v1357, v1392); - float32x2_t v1449 = vmul_f32(v1358, v1960); - float32x2_t v1460 = vmul_f32(v1393, v1971); - float32x2_t v1550 = vmul_f32(v1549, v1977); - float32x2_t v1586 = vrev64_f32(v1580); - float32x2_t v1588 = vadd_f32(v1551, v1579); - float32x2_t v1589 = vsub_f32(v1551, v1579); - float32x2_t v1601 = vrev64_f32(v1595); - float32x2_t v1612 = vrev64_f32(v1606); - float32x2_t v1659 = vmul_f32(v1658, v1977); - float32x2_t v1695 = vrev64_f32(v1689); - float32x2_t v1697 = vadd_f32(v1660, v1688); - float32x2_t v1698 = vsub_f32(v1660, v1688); - float32x2_t v1710 = vrev64_f32(v1704); - float32x2_t v1721 = vrev64_f32(v1715); - float32x2_t v1250 = vsub_f32(v1228, v1247); - float32x2_t v1251 = vadd_f32(v1228, v1247); - float32x2_t v1284 = vmul_f32(v1283, v1977); - float32x2_t v1299 = vmul_f32(v1298, v2171); - float32x2_t v1310 = vmul_f32(v1309, v1977); - float32x2_t v1359 = 
vsub_f32(v1337, v1356); - float32x2_t v1360 = vadd_f32(v1337, v1356); - float32x2_t v1394 = vsub_f32(v1372, v1391); - float32x2_t v1395 = vadd_f32(v1372, v1391); - float32x2_t v1403 = vrev64_f32(v1397); - float32x2_t v1405 = vadd_f32(v1285, v1396); - float32x2_t v1406 = vsub_f32(v1285, v1396); - float32x2_t v1455 = vrev64_f32(v1449); - float32x2_t v1466 = vrev64_f32(v1460); - float32x2_t v1553 = vsub_f32(v1531, v1550); - float32x2_t v1554 = vadd_f32(v1531, v1550); - float32x2_t v1587 = vmul_f32(v1586, v1977); - float32x2_t v1602 = vmul_f32(v1601, v2171); - float32x2_t v1613 = vmul_f32(v1612, v1977); - float32x2_t v1662 = vsub_f32(v1640, v1659); - float32x2_t v1663 = vadd_f32(v1640, v1659); - float32x2_t v1696 = vmul_f32(v1695, v1977); - float32x2_t v1711 = vmul_f32(v1710, v2171); - float32x2_t v1722 = vmul_f32(v1721, v1977); - float32x2_t v1738 = vadd_f32(v1588, v1697); - float32x2_t v1739 = vsub_f32(v1588, v1697); - float32x2_t v1961 = vmul_f32(v1589, v1960); - float32x2_t v1972 = vmul_f32(v1698, v1971); - float32x2_t v1287 = vsub_f32(v1249, v1284); - float32x2_t v1288 = vadd_f32(v1249, v1284); - float32x2_t v1311 = vadd_f32(v1292, v1299); - float32x2_t v1312 = vadd_f32(v1303, v1310); - float32x2_t v1404 = vmul_f32(v1403, v1977); - float32x2_t v1412 = vmul_f32(v1359, v1838); - float32x2_t v1418 = vrev64_f32(v1359); - float32x2_t v1423 = vmul_f32(v1394, v2082); - float32x2_t v1429 = vrev64_f32(v1394); - float32x2_t v1456 = vmul_f32(v1455, v2171); - float32x2_t v1467 = vmul_f32(v1466, v1977); - float32x2_t v1486 = vmul_f32(v1360, v2082); - float32x2_t v1492 = vrev64_f32(v1360); - float32x2_t v1497 = vmul_f32(v1395, v2093); - float32x2_t v1503 = vrev64_f32(v1395); - float32x2_t v1590 = vsub_f32(v1552, v1587); - float32x2_t v1591 = vadd_f32(v1552, v1587); - float32x2_t v1614 = vadd_f32(v1595, v1602); - float32x2_t v1615 = vadd_f32(v1606, v1613); - float32x2_t v1699 = vsub_f32(v1661, v1696); - float32x2_t v1700 = vadd_f32(v1661, v1696); - float32x2_t v1723 = 
vadd_f32(v1704, v1711); - float32x2_t v1724 = vadd_f32(v1715, v1722); - float32x2_t v1745 = vrev64_f32(v1739); - float32x2_t v1747 = vadd_f32(v1405, v1738); - float32x2_t v1748 = vsub_f32(v1405, v1738); - float32x2_t v1967 = vrev64_f32(v1961); - float32x2_t v1978 = vrev64_f32(v1972); - float32x2_t v1313 = vadd_f32(v1311, v1312); - float32x2_t v1314 = vsub_f32(v1312, v1311); - float32x2_t v1407 = vsub_f32(v1286, v1404); - float32x2_t v1408 = vadd_f32(v1286, v1404); - float32x2_t v1468 = vadd_f32(v1449, v1456); - float32x2_t v1469 = vadd_f32(v1460, v1467); - float32x2_t v1616 = vadd_f32(v1614, v1615); - float32x2_t v1617 = vsub_f32(v1615, v1614); - float32x2_t v1725 = vadd_f32(v1723, v1724); - float32x2_t v1726 = vsub_f32(v1724, v1723); - float32x2_t v1746 = vmul_f32(v1745, v1977); - int16x4_t v1753 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1747, 15), (int32x2_t){0, 0})); - int16x4_t v1765 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1748, 15), (int32x2_t){0, 0})); - float32x2_t v1839 = vmul_f32(v1590, v1838); - float32x2_t v1845 = vrev64_f32(v1590); - float32x2_t v1850 = vmul_f32(v1699, v2082); - float32x2_t v1856 = vrev64_f32(v1699); - float32x2_t v1968 = vmul_f32(v1967, v2171); - float32x2_t v1979 = vmul_f32(v1978, v1977); - float32x2_t v2083 = vmul_f32(v1591, v2082); - float32x2_t v2089 = vrev64_f32(v1591); - float32x2_t v2094 = vmul_f32(v1700, v2093); - float32x2_t v2100 = vrev64_f32(v1700); - float32x2_t v1320 = vrev64_f32(v1314); - float32x2_t v1322 = vadd_f32(v1250, v1313); - float32x2_t v1323 = vsub_f32(v1250, v1313); - float32x2_t v1431 = vfma_f32(v1412, v1418, v1844); - float32x2_t v1432 = vfma_f32(v1423, v1429, v2088); - float32x2_t v1470 = vadd_f32(v1468, v1469); - float32x2_t v1471 = vsub_f32(v1469, v1468); - float32x2_t v1505 = vfma_f32(v1486, v1492, v2088); - float32x2_t v1506 = vfma_f32(v1497, v1503, v2099); - float32x2_t v1623 = vrev64_f32(v1617); - float32x2_t v1625 = vadd_f32(v1553, v1616); - float32x2_t v1626 = vsub_f32(v1553, v1616); - 
float32x2_t v1732 = vrev64_f32(v1726); - float32x2_t v1734 = vadd_f32(v1662, v1725); - float32x2_t v1735 = vsub_f32(v1662, v1725); - float32x2_t v1749 = vsub_f32(v1406, v1746); - float32x2_t v1750 = vadd_f32(v1406, v1746); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1753), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1765), 0); - float32x2_t v1980 = vadd_f32(v1961, v1968); - float32x2_t v1981 = vadd_f32(v1972, v1979); - float32x2_t v1321 = vmul_f32(v1320, v2171); - float32x2_t v1433 = vadd_f32(v1431, v1432); - float32x2_t v1434 = vsub_f32(v1432, v1431); - float32x2_t v1477 = vrev64_f32(v1471); - float32x2_t v1479 = vadd_f32(v1287, v1470); - float32x2_t v1480 = vsub_f32(v1287, v1470); - float32x2_t v1507 = vadd_f32(v1505, v1506); - float32x2_t v1508 = vsub_f32(v1506, v1505); - float32x2_t v1624 = vmul_f32(v1623, v2171); - float32x2_t v1733 = vmul_f32(v1732, v2171); - int16x4_t v1759 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1749, 15), (int32x2_t){0, 0})); - int16x4_t v1771 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1750, 15), (int32x2_t){0, 0})); - float32x2_t v1778 = vmul_f32(v1625, v1777); - float32x2_t v1784 = vrev64_f32(v1625); - float32x2_t v1789 = vmul_f32(v1734, v1899); - float32x2_t v1795 = vrev64_f32(v1734); - float32x2_t v1858 = vfma_f32(v1839, v1845, v1844); - float32x2_t v1859 = vfma_f32(v1850, v1856, v2088); - float32x2_t v1982 = vadd_f32(v1980, v1981); - float32x2_t v1983 = vsub_f32(v1981, v1980); - float32x2_t v2022 = vmul_f32(v1626, v2021); - float32x2_t v2028 = vrev64_f32(v1626); - float32x2_t v2033 = vmul_f32(v1735, v2032); - float32x2_t v2039 = vrev64_f32(v1735); - float32x2_t v2102 = vfma_f32(v2083, v2089, v2088); - float32x2_t v2103 = vfma_f32(v2094, v2100, v2099); - float32x2_t v1324 = vsub_f32(v1251, v1321); - float32x2_t v1325 = vadd_f32(v1251, v1321); - float32x2_t v1440 = vrev64_f32(v1434); - float32x2_t v1442 = vadd_f32(v1322, v1433); - float32x2_t v1443 = vsub_f32(v1322, v1433); - float32x2_t v1478 = vmul_f32(v1477, 
v2171); - float32x2_t v1514 = vrev64_f32(v1508); - float32x2_t v1627 = vsub_f32(v1554, v1624); - float32x2_t v1628 = vadd_f32(v1554, v1624); - float32x2_t v1736 = vsub_f32(v1663, v1733); - float32x2_t v1737 = vadd_f32(v1663, v1733); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1759), 0); - v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v1771), 0); - float32x2_t v1860 = vadd_f32(v1858, v1859); - float32x2_t v1861 = vsub_f32(v1859, v1858); - float32x2_t v1989 = vrev64_f32(v1983); - float32x2_t v1991 = vadd_f32(v1407, v1982); - float32x2_t v1992 = vsub_f32(v1407, v1982); - float32x2_t v2104 = vadd_f32(v2102, v2103); - float32x2_t v2105 = vsub_f32(v2103, v2102); - float32x2_t v1441 = vmul_f32(v1440, v2171); - float32x2_t v1481 = vsub_f32(v1288, v1478); - float32x2_t v1482 = vadd_f32(v1288, v1478); - float32x2_t v1515 = vmul_f32(v1514, v2171); - float32x2_t v1516 = vadd_f32(v1324, v1507); - float32x2_t v1517 = vsub_f32(v1324, v1507); - float32x2_t v1797 = vfma_f32(v1778, v1784, v2038); - float32x2_t v1798 = vfma_f32(v1789, v1795, v1905); - float32x2_t v1867 = vrev64_f32(v1861); - float32x2_t v1869 = vadd_f32(v1479, v1860); - float32x2_t v1870 = vsub_f32(v1479, v1860); - float32x2_t v1900 = vmul_f32(v1627, v1899); - float32x2_t v1906 = vrev64_f32(v1627); - float32x2_t v1911 = vmul_f32(v1736, v1910); - float32x2_t v1917 = vrev64_f32(v1736); - float32x2_t v1990 = vmul_f32(v1989, v2171); - int16x4_t v1997 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1991, 15), (int32x2_t){0, 0})); - int16x4_t v2009 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1992, 15), (int32x2_t){0, 0})); - float32x2_t v2041 = vfma_f32(v2022, v2028, v2027); - float32x2_t v2042 = vfma_f32(v2033, v2039, v2038); - float32x2_t v2111 = vrev64_f32(v2105); - float32x2_t v2144 = vmul_f32(v1628, v2143); - float32x2_t v2150 = vrev64_f32(v1628); - float32x2_t v2155 = vmul_f32(v1737, v2154); - float32x2_t v2161 = vrev64_f32(v1737); - float32x2_t v1444 = vsub_f32(v1323, v1441); - float32x2_t v1445 = 
vadd_f32(v1323, v1441); - float32x2_t v1518 = vsub_f32(v1325, v1515); - float32x2_t v1519 = vadd_f32(v1325, v1515); - float32x2_t v1799 = vadd_f32(v1797, v1798); - float32x2_t v1800 = vsub_f32(v1798, v1797); - float32x2_t v1868 = vmul_f32(v1867, v2171); - int16x4_t v1875 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1869, 15), (int32x2_t){0, 0})); - int16x4_t v1887 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1870, 15), (int32x2_t){0, 0})); - float32x2_t v1993 = vsub_f32(v1408, v1990); - float32x2_t v1994 = vadd_f32(v1408, v1990); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1997), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v2009), 0); - float32x2_t v2043 = vadd_f32(v2041, v2042); - float32x2_t v2044 = vsub_f32(v2042, v2041); - float32x2_t v2112 = vmul_f32(v2111, v2171); - float32x2_t v2113 = vadd_f32(v1481, v2104); - float32x2_t v2114 = vsub_f32(v1481, v2104); - float32x2_t v1806 = vrev64_f32(v1800); - float32x2_t v1808 = vadd_f32(v1442, v1799); - float32x2_t v1809 = vsub_f32(v1442, v1799); - float32x2_t v1871 = vsub_f32(v1480, v1868); - float32x2_t v1872 = vadd_f32(v1480, v1868); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1875), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1887), 0); - float32x2_t v1919 = vfma_f32(v1900, v1906, v1905); - float32x2_t v1920 = vfma_f32(v1911, v1917, v2149); - int16x4_t v2003 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1993, 15), (int32x2_t){0, 0})); - int16x4_t v2015 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1994, 15), (int32x2_t){0, 0})); - float32x2_t v2050 = vrev64_f32(v2044); - float32x2_t v2052 = vadd_f32(v1444, v2043); - float32x2_t v2053 = vsub_f32(v1444, v2043); - float32x2_t v2115 = vsub_f32(v1482, v2112); - float32x2_t v2116 = vadd_f32(v1482, v2112); - int16x4_t v2119 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2113, 15), (int32x2_t){0, 0})); - int16x4_t v2131 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2114, 15), (int32x2_t){0, 0})); - float32x2_t v2163 = 
vfma_f32(v2144, v2150, v2149); - float32x2_t v2164 = vfma_f32(v2155, v2161, v2160); - float32x2_t v1807 = vmul_f32(v1806, v2171); - int16x4_t v1814 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1808, 15), (int32x2_t){0, 0})); - int16x4_t v1826 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1809, 15), (int32x2_t){0, 0})); - int16x4_t v1881 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1871, 15), (int32x2_t){0, 0})); - int16x4_t v1893 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1872, 15), (int32x2_t){0, 0})); - float32x2_t v1921 = vadd_f32(v1919, v1920); - float32x2_t v1922 = vsub_f32(v1920, v1919); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v2003), 0); - v6[ostride * 28] = vget_lane_s32(vreinterpret_s32_s16(v2015), 0); - float32x2_t v2051 = vmul_f32(v2050, v2171); - int16x4_t v2058 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2052, 15), (int32x2_t){0, 0})); - int16x4_t v2070 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2053, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v2119), 0); - int16x4_t v2125 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2115, 15), (int32x2_t){0, 0})); - v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v2131), 0); - int16x4_t v2137 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2116, 15), (int32x2_t){0, 0})); - float32x2_t v2165 = vadd_f32(v2163, v2164); - float32x2_t v2166 = vsub_f32(v2164, v2163); - float32x2_t v1810 = vsub_f32(v1443, v1807); - float32x2_t v1811 = vadd_f32(v1443, v1807); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1814), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1826), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1881), 0); - v6[ostride * 26] = vget_lane_s32(vreinterpret_s32_s16(v1893), 0); - float32x2_t v1928 = vrev64_f32(v1922); - float32x2_t v1930 = vadd_f32(v1516, v1921); - float32x2_t v1931 = vsub_f32(v1516, v1921); - float32x2_t v2054 = vsub_f32(v1445, v2051); - float32x2_t v2055 = vadd_f32(v1445, v2051); - v6[ostride * 5] = 
vget_lane_s32(vreinterpret_s32_s16(v2058), 0); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v2070), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v2125), 0); - v6[ostride * 30] = vget_lane_s32(vreinterpret_s32_s16(v2137), 0); - float32x2_t v2172 = vrev64_f32(v2166); - float32x2_t v2174 = vadd_f32(v1518, v2165); - float32x2_t v2175 = vsub_f32(v1518, v2165); - int16x4_t v1820 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1810, 15), (int32x2_t){0, 0})); - int16x4_t v1832 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1811, 15), (int32x2_t){0, 0})); - float32x2_t v1929 = vmul_f32(v1928, v2171); - int16x4_t v1936 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1930, 15), (int32x2_t){0, 0})); - int16x4_t v1948 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1931, 15), (int32x2_t){0, 0})); - int16x4_t v2064 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2054, 15), (int32x2_t){0, 0})); - int16x4_t v2076 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2055, 15), (int32x2_t){0, 0})); - float32x2_t v2173 = vmul_f32(v2172, v2171); - int16x4_t v2180 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2174, 15), (int32x2_t){0, 0})); - int16x4_t v2192 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2175, 15), (int32x2_t){0, 0})); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1820), 0); - v6[ostride * 25] = vget_lane_s32(vreinterpret_s32_s16(v1832), 0); - float32x2_t v1932 = vsub_f32(v1517, v1929); - float32x2_t v1933 = vadd_f32(v1517, v1929); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1936), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1948), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v2064), 0); - v6[ostride * 29] = vget_lane_s32(vreinterpret_s32_s16(v2076), 0); - float32x2_t v2176 = vsub_f32(v1519, v2173); - float32x2_t v2177 = vadd_f32(v1519, v2173); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v2180), 0); - v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v2192), 0); - int16x4_t v1942 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1932, 15), (int32x2_t){0, 0})); - int16x4_t v1954 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1933, 15), (int32x2_t){0, 0})); - int16x4_t v2186 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2176, 15), (int32x2_t){0, 0})); - int16x4_t v2198 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2177, 15), (int32x2_t){0, 0})); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1942), 0); - v6[ostride * 27] = vget_lane_s32(vreinterpret_s32_s16(v1954), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v2186), 0); - v6[ostride * 31] = vget_lane_s32(vreinterpret_s32_s16(v2198), 0); + for (int j = 0; j < howmany; j += 1) { + float32x2_t v323 = v5[istride]; + float v758 = 7.0710678118654757e-01F; + float v769 = -7.0710678118654746e-01F; + float v819 = 5.5557023301960229e-01F; + float v833 = -1.9509032201612861e-01F; + float v884 = 9.2387953251128674e-01F; + float v891 = -9.2387953251128685e-01F; + float v894 = 3.8268343236508967e-01F; + float v895 = -3.8268343236508967e-01F; + float v941 = 1.9509032201612833e-01F; + float v944 = -9.8078528040323043e-01F; + float v945 = 9.8078528040323043e-01F; + float v952 = -5.5557023301960218e-01F; + float v955 = 8.3146961230254524e-01F; + float v956 = -8.3146961230254524e-01F; + float v966 = -1.0000000000000000e+00F; + float v967 = 1.0000000000000000e+00F; + float32x2_t v969 = (float32x2_t){v4, v4}; + float32x2_t v20 = v5[0]; + float32x2_t v576 = (float32x2_t){v945, v945}; + float32x2_t v637 = (float32x2_t){v884, v884}; + float32x2_t v641 = (float32x2_t){v895, v894}; + float32x2_t v698 = (float32x2_t){v955, v955}; + float32x2_t v702 = (float32x2_t){v952, v819}; + float32x2_t v709 = (float32x2_t){v833, v833}; + float32x2_t v759 = (float32x2_t){v758, v758}; + float32x2_t v770 = (float32x2_t){v769, v769}; + float32x2_t v774 = (float32x2_t){v967, v966}; + float32x2_t v820 = (float32x2_t){v819, v819}; + float32x2_t v824 = (float32x2_t){v956, v955}; + float32x2_t v831 = (float32x2_t){v944, v944}; + 
float32x2_t v835 = (float32x2_t){v833, v941}; + float32x2_t v881 = (float32x2_t){v894, v894}; + float32x2_t v885 = (float32x2_t){v891, v884}; + float32x2_t v892 = (float32x2_t){v891, v891}; + float32x2_t v896 = (float32x2_t){v894, v895}; + float32x2_t v942 = (float32x2_t){v941, v941}; + float32x2_t v946 = (float32x2_t){v944, v945}; + float32x2_t v953 = (float32x2_t){v952, v952}; + float32x2_t v957 = (float32x2_t){v955, v956}; + float32x2_t v968 = (float32x2_t){v966, v967}; + float32x2_t v25 = v5[istride * 16]; + float32x2_t v32 = v5[istride * 8]; + float32x2_t v37 = v5[istride * 24]; + float32x2_t v55 = v5[istride * 4]; + float32x2_t v60 = v5[istride * 20]; + float32x2_t v67 = v5[istride * 12]; + float32x2_t v72 = v5[istride * 28]; + float32x2_t v129 = v5[istride * 2]; + float32x2_t v134 = v5[istride * 18]; + float32x2_t v141 = v5[istride * 10]; + float32x2_t v146 = v5[istride * 26]; + float32x2_t v164 = v5[istride * 6]; + float32x2_t v169 = v5[istride * 22]; + float32x2_t v176 = v5[istride * 14]; + float32x2_t v181 = v5[istride * 30]; + float32x2_t v328 = v5[istride * 17]; + float32x2_t v335 = v5[istride * 9]; + float32x2_t v340 = v5[istride * 25]; + float32x2_t v358 = v5[istride * 5]; + float32x2_t v363 = v5[istride * 21]; + float32x2_t v370 = v5[istride * 13]; + float32x2_t v375 = v5[istride * 29]; + float32x2_t v432 = v5[istride * 3]; + float32x2_t v437 = v5[istride * 19]; + float32x2_t v444 = v5[istride * 11]; + float32x2_t v449 = v5[istride * 27]; + float32x2_t v467 = v5[istride * 7]; + float32x2_t v472 = v5[istride * 23]; + float32x2_t v479 = v5[istride * 15]; + float32x2_t v484 = v5[istride * 31]; + float32x2_t v643 = vmul_f32(v969, v641); + float32x2_t v704 = vmul_f32(v969, v702); + float32x2_t v776 = vmul_f32(v969, v774); + float32x2_t v826 = vmul_f32(v969, v824); + float32x2_t v837 = vmul_f32(v969, v835); + float32x2_t v887 = vmul_f32(v969, v885); + float32x2_t v898 = vmul_f32(v969, v896); + float32x2_t v948 = vmul_f32(v969, v946); + float32x2_t v959 = 
vmul_f32(v969, v957); + float32x2_t v970 = vmul_f32(v969, v968); + float32x2_t v26 = vadd_f32(v20, v25); + float32x2_t v27 = vsub_f32(v20, v25); + float32x2_t v38 = vadd_f32(v32, v37); + float32x2_t v39 = vsub_f32(v32, v37); + float32x2_t v61 = vadd_f32(v55, v60); + float32x2_t v62 = vsub_f32(v55, v60); + float32x2_t v73 = vadd_f32(v67, v72); + float32x2_t v74 = vsub_f32(v67, v72); + float32x2_t v135 = vadd_f32(v129, v134); + float32x2_t v136 = vsub_f32(v129, v134); + float32x2_t v147 = vadd_f32(v141, v146); + float32x2_t v148 = vsub_f32(v141, v146); + float32x2_t v170 = vadd_f32(v164, v169); + float32x2_t v171 = vsub_f32(v164, v169); + float32x2_t v182 = vadd_f32(v176, v181); + float32x2_t v183 = vsub_f32(v176, v181); + float32x2_t v329 = vadd_f32(v323, v328); + float32x2_t v330 = vsub_f32(v323, v328); + float32x2_t v341 = vadd_f32(v335, v340); + float32x2_t v342 = vsub_f32(v335, v340); + float32x2_t v364 = vadd_f32(v358, v363); + float32x2_t v365 = vsub_f32(v358, v363); + float32x2_t v376 = vadd_f32(v370, v375); + float32x2_t v377 = vsub_f32(v370, v375); + float32x2_t v438 = vadd_f32(v432, v437); + float32x2_t v439 = vsub_f32(v432, v437); + float32x2_t v450 = vadd_f32(v444, v449); + float32x2_t v451 = vsub_f32(v444, v449); + float32x2_t v473 = vadd_f32(v467, v472); + float32x2_t v474 = vsub_f32(v467, v472); + float32x2_t v485 = vadd_f32(v479, v484); + float32x2_t v486 = vsub_f32(v479, v484); + float32x2_t v45 = vrev64_f32(v39); + float32x2_t v47 = vadd_f32(v26, v38); + float32x2_t v48 = vsub_f32(v26, v38); + float32x2_t v75 = vadd_f32(v61, v73); + float32x2_t v76 = vsub_f32(v61, v73); + float32x2_t v91 = vmul_f32(v62, v759); + float32x2_t v102 = vmul_f32(v74, v770); + float32x2_t v154 = vrev64_f32(v148); + float32x2_t v156 = vadd_f32(v135, v147); + float32x2_t v157 = vsub_f32(v135, v147); + float32x2_t v189 = vrev64_f32(v183); + float32x2_t v191 = vadd_f32(v170, v182); + float32x2_t v192 = vsub_f32(v170, v182); + float32x2_t v348 = vrev64_f32(v342); + float32x2_t 
v350 = vadd_f32(v329, v341); + float32x2_t v351 = vsub_f32(v329, v341); + float32x2_t v378 = vadd_f32(v364, v376); + float32x2_t v379 = vsub_f32(v364, v376); + float32x2_t v394 = vmul_f32(v365, v759); + float32x2_t v405 = vmul_f32(v377, v770); + float32x2_t v457 = vrev64_f32(v451); + float32x2_t v459 = vadd_f32(v438, v450); + float32x2_t v460 = vsub_f32(v438, v450); + float32x2_t v487 = vadd_f32(v473, v485); + float32x2_t v488 = vsub_f32(v473, v485); + float32x2_t v503 = vmul_f32(v474, v759); + float32x2_t v514 = vmul_f32(v486, v770); + float32x2_t v46 = vmul_f32(v45, v776); + float32x2_t v82 = vrev64_f32(v76); + float32x2_t v84 = vadd_f32(v47, v75); + float32x2_t v85 = vsub_f32(v47, v75); + float32x2_t v97 = vrev64_f32(v91); + float32x2_t v108 = vrev64_f32(v102); + float32x2_t v155 = vmul_f32(v154, v776); + float32x2_t v190 = vmul_f32(v189, v776); + float32x2_t v195 = vadd_f32(v156, v191); + float32x2_t v196 = vsub_f32(v156, v191); + float32x2_t v248 = vmul_f32(v157, v759); + float32x2_t v259 = vmul_f32(v192, v770); + float32x2_t v349 = vmul_f32(v348, v776); + float32x2_t v385 = vrev64_f32(v379); + float32x2_t v387 = vadd_f32(v350, v378); + float32x2_t v388 = vsub_f32(v350, v378); + float32x2_t v400 = vrev64_f32(v394); + float32x2_t v411 = vrev64_f32(v405); + float32x2_t v458 = vmul_f32(v457, v776); + float32x2_t v494 = vrev64_f32(v488); + float32x2_t v496 = vadd_f32(v459, v487); + float32x2_t v497 = vsub_f32(v459, v487); + float32x2_t v509 = vrev64_f32(v503); + float32x2_t v520 = vrev64_f32(v514); + float32x2_t v49 = vsub_f32(v27, v46); + float32x2_t v50 = vadd_f32(v27, v46); + float32x2_t v83 = vmul_f32(v82, v776); + float32x2_t v98 = vmul_f32(v97, v970); + float32x2_t v109 = vmul_f32(v108, v776); + float32x2_t v158 = vsub_f32(v136, v155); + float32x2_t v159 = vadd_f32(v136, v155); + float32x2_t v193 = vsub_f32(v171, v190); + float32x2_t v194 = vadd_f32(v171, v190); + float32x2_t v202 = vrev64_f32(v196); + float32x2_t v204 = vadd_f32(v84, v195); + float32x2_t 
v205 = vsub_f32(v84, v195); + float32x2_t v254 = vrev64_f32(v248); + float32x2_t v265 = vrev64_f32(v259); + float32x2_t v352 = vsub_f32(v330, v349); + float32x2_t v353 = vadd_f32(v330, v349); + float32x2_t v386 = vmul_f32(v385, v776); + float32x2_t v401 = vmul_f32(v400, v970); + float32x2_t v412 = vmul_f32(v411, v776); + float32x2_t v461 = vsub_f32(v439, v458); + float32x2_t v462 = vadd_f32(v439, v458); + float32x2_t v495 = vmul_f32(v494, v776); + float32x2_t v510 = vmul_f32(v509, v970); + float32x2_t v521 = vmul_f32(v520, v776); + float32x2_t v537 = vadd_f32(v387, v496); + float32x2_t v538 = vsub_f32(v387, v496); + float32x2_t v760 = vmul_f32(v388, v759); + float32x2_t v771 = vmul_f32(v497, v770); + float32x2_t v86 = vsub_f32(v48, v83); + float32x2_t v87 = vadd_f32(v48, v83); + float32x2_t v110 = vadd_f32(v91, v98); + float32x2_t v111 = vadd_f32(v102, v109); + float32x2_t v203 = vmul_f32(v202, v776); + float32x2_t v211 = vmul_f32(v158, v637); + float32x2_t v217 = vrev64_f32(v158); + float32x2_t v222 = vmul_f32(v193, v881); + float32x2_t v228 = vrev64_f32(v193); + float32x2_t v255 = vmul_f32(v254, v970); + float32x2_t v266 = vmul_f32(v265, v776); + float32x2_t v285 = vmul_f32(v159, v881); + float32x2_t v291 = vrev64_f32(v159); + float32x2_t v296 = vmul_f32(v194, v892); + float32x2_t v302 = vrev64_f32(v194); + float32x2_t v389 = vsub_f32(v351, v386); + float32x2_t v390 = vadd_f32(v351, v386); + float32x2_t v413 = vadd_f32(v394, v401); + float32x2_t v414 = vadd_f32(v405, v412); + float32x2_t v498 = vsub_f32(v460, v495); + float32x2_t v499 = vadd_f32(v460, v495); + float32x2_t v522 = vadd_f32(v503, v510); + float32x2_t v523 = vadd_f32(v514, v521); + float32x2_t v544 = vrev64_f32(v538); + float32x2_t v546 = vadd_f32(v204, v537); + float32x2_t v547 = vsub_f32(v204, v537); + float32x2_t v766 = vrev64_f32(v760); + float32x2_t v777 = vrev64_f32(v771); + float32x2_t v112 = vadd_f32(v110, v111); + float32x2_t v113 = vsub_f32(v111, v110); + float32x2_t v206 = vsub_f32(v85, 
v203); + float32x2_t v207 = vadd_f32(v85, v203); + float32x2_t v267 = vadd_f32(v248, v255); + float32x2_t v268 = vadd_f32(v259, v266); + float32x2_t v415 = vadd_f32(v413, v414); + float32x2_t v416 = vsub_f32(v414, v413); + float32x2_t v524 = vadd_f32(v522, v523); + float32x2_t v525 = vsub_f32(v523, v522); + float32x2_t v545 = vmul_f32(v544, v776); + int16x4_t v552 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v546, 15), (int32x2_t){0, 0})); + int16x4_t v564 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v547, 15), (int32x2_t){0, 0})); + float32x2_t v638 = vmul_f32(v389, v637); + float32x2_t v644 = vrev64_f32(v389); + float32x2_t v649 = vmul_f32(v498, v881); + float32x2_t v655 = vrev64_f32(v498); + float32x2_t v767 = vmul_f32(v766, v970); + float32x2_t v778 = vmul_f32(v777, v776); + float32x2_t v882 = vmul_f32(v390, v881); + float32x2_t v888 = vrev64_f32(v390); + float32x2_t v893 = vmul_f32(v499, v892); + float32x2_t v899 = vrev64_f32(v499); + float32x2_t v119 = vrev64_f32(v113); + float32x2_t v121 = vadd_f32(v49, v112); + float32x2_t v122 = vsub_f32(v49, v112); + float32x2_t v230 = vfma_f32(v211, v217, v643); + float32x2_t v231 = vfma_f32(v222, v228, v887); + float32x2_t v269 = vadd_f32(v267, v268); + float32x2_t v270 = vsub_f32(v268, v267); + float32x2_t v304 = vfma_f32(v285, v291, v887); + float32x2_t v305 = vfma_f32(v296, v302, v898); + float32x2_t v422 = vrev64_f32(v416); + float32x2_t v424 = vadd_f32(v352, v415); + float32x2_t v425 = vsub_f32(v352, v415); + float32x2_t v531 = vrev64_f32(v525); + float32x2_t v533 = vadd_f32(v461, v524); + float32x2_t v534 = vsub_f32(v461, v524); + float32x2_t v548 = vsub_f32(v205, v545); + float32x2_t v549 = vadd_f32(v205, v545); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v552), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v564), 0); + float32x2_t v779 = vadd_f32(v760, v767); + float32x2_t v780 = vadd_f32(v771, v778); + float32x2_t v120 = vmul_f32(v119, v970); + float32x2_t v232 = vadd_f32(v230, v231); + float32x2_t 
v233 = vsub_f32(v231, v230); + float32x2_t v276 = vrev64_f32(v270); + float32x2_t v278 = vadd_f32(v86, v269); + float32x2_t v279 = vsub_f32(v86, v269); + float32x2_t v306 = vadd_f32(v304, v305); + float32x2_t v307 = vsub_f32(v305, v304); + float32x2_t v423 = vmul_f32(v422, v970); + float32x2_t v532 = vmul_f32(v531, v970); + int16x4_t v558 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v548, 15), (int32x2_t){0, 0})); + int16x4_t v570 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v549, 15), (int32x2_t){0, 0})); + float32x2_t v577 = vmul_f32(v424, v576); + float32x2_t v583 = vrev64_f32(v424); + float32x2_t v588 = vmul_f32(v533, v698); + float32x2_t v594 = vrev64_f32(v533); + float32x2_t v657 = vfma_f32(v638, v644, v643); + float32x2_t v658 = vfma_f32(v649, v655, v887); + float32x2_t v781 = vadd_f32(v779, v780); + float32x2_t v782 = vsub_f32(v780, v779); + float32x2_t v821 = vmul_f32(v425, v820); + float32x2_t v827 = vrev64_f32(v425); + float32x2_t v832 = vmul_f32(v534, v831); + float32x2_t v838 = vrev64_f32(v534); + float32x2_t v901 = vfma_f32(v882, v888, v887); + float32x2_t v902 = vfma_f32(v893, v899, v898); + float32x2_t v123 = vsub_f32(v50, v120); + float32x2_t v124 = vadd_f32(v50, v120); + float32x2_t v239 = vrev64_f32(v233); + float32x2_t v241 = vadd_f32(v121, v232); + float32x2_t v242 = vsub_f32(v121, v232); + float32x2_t v277 = vmul_f32(v276, v970); + float32x2_t v313 = vrev64_f32(v307); + float32x2_t v426 = vsub_f32(v353, v423); + float32x2_t v427 = vadd_f32(v353, v423); + float32x2_t v535 = vsub_f32(v462, v532); + float32x2_t v536 = vadd_f32(v462, v532); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v558), 0); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v570), 0); + float32x2_t v659 = vadd_f32(v657, v658); + float32x2_t v660 = vsub_f32(v658, v657); + float32x2_t v788 = vrev64_f32(v782); + float32x2_t v790 = vadd_f32(v206, v781); + float32x2_t v791 = vsub_f32(v206, v781); + float32x2_t v903 = vadd_f32(v901, v902); + float32x2_t v904 = 
vsub_f32(v902, v901); + float32x2_t v240 = vmul_f32(v239, v970); + float32x2_t v280 = vsub_f32(v87, v277); + float32x2_t v281 = vadd_f32(v87, v277); + float32x2_t v314 = vmul_f32(v313, v970); + float32x2_t v315 = vadd_f32(v123, v306); + float32x2_t v316 = vsub_f32(v123, v306); + float32x2_t v596 = vfma_f32(v577, v583, v837); + float32x2_t v597 = vfma_f32(v588, v594, v704); + float32x2_t v666 = vrev64_f32(v660); + float32x2_t v668 = vadd_f32(v278, v659); + float32x2_t v669 = vsub_f32(v278, v659); + float32x2_t v699 = vmul_f32(v426, v698); + float32x2_t v705 = vrev64_f32(v426); + float32x2_t v710 = vmul_f32(v535, v709); + float32x2_t v716 = vrev64_f32(v535); + float32x2_t v789 = vmul_f32(v788, v970); + int16x4_t v796 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v790, 15), (int32x2_t){0, 0})); + int16x4_t v808 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v791, 15), (int32x2_t){0, 0})); + float32x2_t v840 = vfma_f32(v821, v827, v826); + float32x2_t v841 = vfma_f32(v832, v838, v837); + float32x2_t v910 = vrev64_f32(v904); + float32x2_t v943 = vmul_f32(v427, v942); + float32x2_t v949 = vrev64_f32(v427); + float32x2_t v954 = vmul_f32(v536, v953); + float32x2_t v960 = vrev64_f32(v536); + float32x2_t v243 = vsub_f32(v122, v240); + float32x2_t v244 = vadd_f32(v122, v240); + float32x2_t v317 = vsub_f32(v124, v314); + float32x2_t v318 = vadd_f32(v124, v314); + float32x2_t v598 = vadd_f32(v596, v597); + float32x2_t v599 = vsub_f32(v597, v596); + float32x2_t v667 = vmul_f32(v666, v970); + int16x4_t v674 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v668, 15), (int32x2_t){0, 0})); + int16x4_t v686 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v669, 15), (int32x2_t){0, 0})); + float32x2_t v792 = vsub_f32(v207, v789); + float32x2_t v793 = vadd_f32(v207, v789); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v796), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v808), 0); + float32x2_t v842 = vadd_f32(v840, v841); + float32x2_t v843 = vsub_f32(v841, v840); + float32x2_t 
v911 = vmul_f32(v910, v970); + float32x2_t v912 = vadd_f32(v280, v903); + float32x2_t v913 = vsub_f32(v280, v903); + float32x2_t v605 = vrev64_f32(v599); + float32x2_t v607 = vadd_f32(v241, v598); + float32x2_t v608 = vsub_f32(v241, v598); + float32x2_t v670 = vsub_f32(v279, v667); + float32x2_t v671 = vadd_f32(v279, v667); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v674), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v686), 0); + float32x2_t v718 = vfma_f32(v699, v705, v704); + float32x2_t v719 = vfma_f32(v710, v716, v948); + int16x4_t v802 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v792, 15), (int32x2_t){0, 0})); + int16x4_t v814 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v793, 15), (int32x2_t){0, 0})); + float32x2_t v849 = vrev64_f32(v843); + float32x2_t v851 = vadd_f32(v243, v842); + float32x2_t v852 = vsub_f32(v243, v842); + float32x2_t v914 = vsub_f32(v281, v911); + float32x2_t v915 = vadd_f32(v281, v911); + int16x4_t v918 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v912, 15), (int32x2_t){0, 0})); + int16x4_t v930 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v913, 15), (int32x2_t){0, 0})); + float32x2_t v962 = vfma_f32(v943, v949, v948); + float32x2_t v963 = vfma_f32(v954, v960, v959); + float32x2_t v606 = vmul_f32(v605, v970); + int16x4_t v613 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v607, 15), (int32x2_t){0, 0})); + int16x4_t v625 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v608, 15), (int32x2_t){0, 0})); + int16x4_t v680 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v670, 15), (int32x2_t){0, 0})); + int16x4_t v692 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v671, 15), (int32x2_t){0, 0})); + float32x2_t v720 = vadd_f32(v718, v719); + float32x2_t v721 = vsub_f32(v719, v718); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v802), 0); + v6[ostride * 28] = vget_lane_s32(vreinterpret_s32_s16(v814), 0); + float32x2_t v850 = vmul_f32(v849, v970); + int16x4_t v857 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v851, 15), (int32x2_t){0, 0})); 
+ int16x4_t v869 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v852, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v918), 0); + int16x4_t v924 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v914, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v930), 0); + int16x4_t v936 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v915, 15), (int32x2_t){0, 0})); + float32x2_t v964 = vadd_f32(v962, v963); + float32x2_t v965 = vsub_f32(v963, v962); + float32x2_t v609 = vsub_f32(v242, v606); + float32x2_t v610 = vadd_f32(v242, v606); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v613), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v625), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v680), 0); + v6[ostride * 26] = vget_lane_s32(vreinterpret_s32_s16(v692), 0); + float32x2_t v727 = vrev64_f32(v721); + float32x2_t v729 = vadd_f32(v315, v720); + float32x2_t v730 = vsub_f32(v315, v720); + float32x2_t v853 = vsub_f32(v244, v850); + float32x2_t v854 = vadd_f32(v244, v850); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v857), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v869), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v924), 0); + v6[ostride * 30] = vget_lane_s32(vreinterpret_s32_s16(v936), 0); + float32x2_t v971 = vrev64_f32(v965); + float32x2_t v973 = vadd_f32(v317, v964); + float32x2_t v974 = vsub_f32(v317, v964); + int16x4_t v619 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v609, 15), (int32x2_t){0, 0})); + int16x4_t v631 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v610, 15), (int32x2_t){0, 0})); + float32x2_t v728 = vmul_f32(v727, v970); + int16x4_t v735 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v729, 15), (int32x2_t){0, 0})); + int16x4_t v747 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v730, 15), (int32x2_t){0, 0})); + int16x4_t v863 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v853, 15), (int32x2_t){0, 0})); + int16x4_t v875 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v854, 15), (int32x2_t){0, 0})); + float32x2_t v972 = vmul_f32(v971, v970); + int16x4_t v979 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v973, 15), (int32x2_t){0, 0})); + int16x4_t v991 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v974, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v619), 0); + v6[ostride * 25] = vget_lane_s32(vreinterpret_s32_s16(v631), 0); + float32x2_t v731 = vsub_f32(v316, v728); + float32x2_t v732 = vadd_f32(v316, v728); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v735), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v747), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v863), 0); + v6[ostride * 29] = vget_lane_s32(vreinterpret_s32_s16(v875), 0); + float32x2_t v975 = vsub_f32(v318, v972); + float32x2_t v976 = vadd_f32(v318, v972); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v979), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v991), 0); + int16x4_t v741 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v731, 15), (int32x2_t){0, 0})); + int16x4_t v753 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v732, 15), (int32x2_t){0, 0})); + int16x4_t v985 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v975, 15), (int32x2_t){0, 0})); + int16x4_t v997 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v976, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v741), 0); + v6[ostride * 27] = vget_lane_s32(vreinterpret_s32_s16(v753), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v985), 0); + v6[ostride * 31] = vget_lane_s32(vreinterpret_s32_s16(v997), 0); v5 += 1 * 1; v6 += 1 * 1; } @@ -20355,7 +12554,6 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, float v1092 = 9.8078528040323043e-01F; float v1099 = -5.5557023301960218e-01F; float v1104 = -8.3146961230254524e-01F; - float v1115 = 1.0000000000000000e+00F; const float32x2_t *v1333 = &v5[v0]; int32_t *v1534 = &v6[v2]; int64_t 
v26 = v0 * 16; @@ -20423,7 +12621,6 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, int64_t v1079 = v2 * 30; float v1095 = v4 * v1092; float v1107 = v4 * v1104; - float v1118 = v4 * v1115; int64_t v1126 = v2 * 7; int64_t v1134 = v2 * 15; int64_t v1142 = v2 * 23; @@ -20442,6 +12639,7 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v1730 = svdup_n_f32(v1028); svfloat32_t v1769 = svdup_n_f32(v1087); svfloat32_t v1771 = svdup_n_f32(v1099); + svfloat32_t v1773 = svdup_n_f32(v4); svfloat32_t v1843 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1333)[0])); const float32x2_t *v1173 = &v5[v26]; @@ -20509,7 +12707,6 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, int32_t *v1766 = &v6[v1079]; svfloat32_t v1770 = svdup_n_f32(v1095); svfloat32_t v1772 = svdup_n_f32(v1107); - svfloat32_t v1773 = svdup_n_f32(v1118); int32_t *v1780 = &v6[v1126]; int32_t *v1789 = &v6[v1134]; int32_t *v1798 = &v6[v1142]; @@ -20576,293 +12773,162 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svld1_f64(pred_full, &((const double *)v1467)[0])); svfloat32_t v1873 = svreinterpret_f32_f64( svld1_f64(pred_full, &((const double *)v1476)[0])); - svfloat32_t v32; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v32) : "w"(v1811), "w"(v1813)); - svfloat32_t v33; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v33) : "w"(v1811), "w"(v1813)); - svfloat32_t v48; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v48) : "w"(v1815), "w"(v1817)); - svfloat32_t v49; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v49) : "w"(v1815), "w"(v1817)); - svfloat32_t v75; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v75) : "w"(v1819), "w"(v1821)); - svfloat32_t v76; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v76) : "w"(v1819), "w"(v1821)); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v1823), "w"(v1825)); - svfloat32_t v92; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v1823), "w"(v1825)); - 
svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v1827), "w"(v1829)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v1827), "w"(v1829)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v1831), "w"(v1833)); - svfloat32_t v176; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v1831), "w"(v1833)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v1835), "w"(v1837)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v1835), "w"(v1837)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v1839), "w"(v1841)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v1839), "w"(v1841)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v1843), "w"(v1845)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v1843), "w"(v1845)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v1847), "w"(v1849)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v1847), "w"(v1849)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v1851), "w"(v1853)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v1851), "w"(v1853)); - svfloat32_t v434; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v1855), "w"(v1857)); - svfloat32_t v435; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v1855), "w"(v1857)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v1859), "w"(v1861)); - svfloat32_t v503; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v1859), "w"(v1861)); - svfloat32_t v518; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v518) : "w"(v1863), "w"(v1865)); - svfloat32_t v519; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v519) : "w"(v1863), "w"(v1865)); - svfloat32_t v545; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v1867), "w"(v1869)); - svfloat32_t v546; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v1867), "w"(v1869)); - svfloat32_t v561; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v1871), "w"(v1873)); - svfloat32_t v562; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v1871), "w"(v1873)); - svfloat32_t zero56; - asm volatile("mov %0.s, #0" : "=w"(zero56)); + svfloat32_t v32 = svadd_f32_x(svptrue_b32(), v1811, v1813); + svfloat32_t v33 = svsub_f32_x(svptrue_b32(), v1811, v1813); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v1815, v1817); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v1815, v1817); + svfloat32_t v75 = svadd_f32_x(svptrue_b32(), v1819, v1821); + svfloat32_t v76 = svsub_f32_x(svptrue_b32(), v1819, v1821); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v1823, v1825); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v1823, v1825); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v1827, v1829); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v1827, v1829); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v1831, v1833); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v1831, v1833); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v1835, v1837); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v1835, v1837); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v1839, v1841); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v1839, v1841); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v1843, v1845); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v1843, v1845); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v1847, v1849); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v1847, v1849); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v1851, v1853); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v1851, v1853); + svfloat32_t v434 = svadd_f32_x(svptrue_b32(), v1855, v1857); + svfloat32_t v435 = svsub_f32_x(svptrue_b32(), v1855, v1857); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v1859, v1861); + svfloat32_t v503 = svsub_f32_x(svptrue_b32(), v1859, v1861); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v1863, v1865); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v1863, v1865); + svfloat32_t 
v545 = svadd_f32_x(svptrue_b32(), v1867, v1869); + svfloat32_t v546 = svsub_f32_x(svptrue_b32(), v1867, v1869); + svfloat32_t v561 = svadd_f32_x(svptrue_b32(), v1871, v1873); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v1871, v1873); + svfloat32_t zero56 = svdup_n_f32(0); svfloat32_t v56 = svcmla_f32_x(pred_full, zero56, v1649, v49, 90); - svfloat32_t v57; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v57) : "w"(v32), "w"(v48)); - svfloat32_t v58; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v58) : "w"(v32), "w"(v48)); - svfloat32_t v93; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v75), "w"(v91)); - svfloat32_t v94; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v75), "w"(v91)); - svfloat32_t v110; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v76), "w"(v1646)); - svfloat32_t v122; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v92), "w"(v1648)); - svfloat32_t zero183; - asm volatile("mov %0.s, #0" : "=w"(zero183)); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), v32, v48); + svfloat32_t v93 = svadd_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v94 = svsub_f32_x(svptrue_b32(), v75, v91); + svfloat32_t v110 = svmul_f32_x(svptrue_b32(), v76, v1646); + svfloat32_t v122 = svmul_f32_x(svptrue_b32(), v92, v1648); + svfloat32_t zero183 = svdup_n_f32(0); svfloat32_t v183 = svcmla_f32_x(pred_full, zero183, v1649, v176, 90); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v159), "w"(v175)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v159), "w"(v175)); - svfloat32_t zero226; - asm volatile("mov %0.s, #0" : "=w"(zero226)); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v159, v175); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v159, v175); + svfloat32_t zero226 = svdup_n_f32(0); svfloat32_t v226 = svcmla_f32_x(pred_full, zero226, v1649, v219, 90); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v202), "w"(v218)); - svfloat32_t v228; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v228) : "w"(v202), "w"(v218)); - svfloat32_t zero399; - asm volatile("mov %0.s, #0" : "=w"(zero399)); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v202, v218); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v202, v218); + svfloat32_t zero399 = svdup_n_f32(0); svfloat32_t v399 = svcmla_f32_x(pred_full, zero399, v1649, v392, 90); - svfloat32_t v400; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v375), "w"(v391)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v375), "w"(v391)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v418), "w"(v434)); - svfloat32_t v437; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v418), "w"(v434)); - svfloat32_t v453; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v419), "w"(v1646)); - svfloat32_t v465; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v435), "w"(v1648)); - svfloat32_t zero526; - asm volatile("mov %0.s, #0" : "=w"(zero526)); + svfloat32_t v400 = svadd_f32_x(svptrue_b32(), v375, v391); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v375, v391); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v418, v434); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v418, v434); + svfloat32_t v453 = svmul_f32_x(svptrue_b32(), v419, v1646); + svfloat32_t v465 = svmul_f32_x(svptrue_b32(), v435, v1648); + svfloat32_t zero526 = svdup_n_f32(0); svfloat32_t v526 = svcmla_f32_x(pred_full, zero526, v1649, v519, 90); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v502), "w"(v518)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v502), "w"(v518)); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v545), "w"(v561)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v545), "w"(v561)); - svfloat32_t v580; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v546), "w"(v1646)); - svfloat32_t v592; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v592) : "w"(v562), "w"(v1648)); - svfloat32_t v59; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v59) : "w"(v33), "w"(v56)); - svfloat32_t v60; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v60) : "w"(v33), "w"(v56)); - svfloat32_t zero101; - asm volatile("mov %0.s, #0" : "=w"(zero101)); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v502, v518); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v502, v518); + svfloat32_t v563 = svadd_f32_x(svptrue_b32(), v545, v561); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v545, v561); + svfloat32_t v580 = svmul_f32_x(svptrue_b32(), v546, v1646); + svfloat32_t v592 = svmul_f32_x(svptrue_b32(), v562, v1648); + svfloat32_t v59 = svsub_f32_x(svptrue_b32(), v33, v56); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v33, v56); + svfloat32_t zero101 = svdup_n_f32(0); svfloat32_t v101 = svcmla_f32_x(pred_full, zero101, v1649, v94, 90); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v57), "w"(v93)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v57), "w"(v93)); - svfloat32_t v186; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v160), "w"(v183)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v160), "w"(v183)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v203), "w"(v226)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v203), "w"(v226)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v184), "w"(v227)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v184), "w"(v227)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v185), "w"(v1646)); - svfloat32_t v299; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v228), "w"(v1648)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v376), "w"(v399)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v376), "w"(v399)); - svfloat32_t zero444; - asm volatile("mov %0.s, #0" : "=w"(zero444)); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v57, v93); + svfloat32_t v103 = 
svsub_f32_x(svptrue_b32(), v57, v93); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v160, v183); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v160, v183); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v203, v226); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v203, v226); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v184, v227); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v184, v227); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v185, v1646); + svfloat32_t v299 = svmul_f32_x(svptrue_b32(), v228, v1648); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v376, v399); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v376, v399); + svfloat32_t zero444 = svdup_n_f32(0); svfloat32_t v444 = svcmla_f32_x(pred_full, zero444, v1649, v437, 90); - svfloat32_t v445; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v400), "w"(v436)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v400), "w"(v436)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v503), "w"(v526)); - svfloat32_t v530; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v503), "w"(v526)); - svfloat32_t zero571; - asm volatile("mov %0.s, #0" : "=w"(zero571)); + svfloat32_t v445 = svadd_f32_x(svptrue_b32(), v400, v436); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v400, v436); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v503, v526); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v503, v526); + svfloat32_t zero571 = svdup_n_f32(0); svfloat32_t v571 = svcmla_f32_x(pred_full, zero571, v1649, v564, 90); - svfloat32_t v572; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v572) : "w"(v527), "w"(v563)); - svfloat32_t v573; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v573) : "w"(v527), "w"(v563)); - svfloat32_t v104; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v58), "w"(v101)); - svfloat32_t v105; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v58), "w"(v101)); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v527, v563); + svfloat32_t v573 = svsub_f32_x(svptrue_b32(), 
v527, v563); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v58, v101); + svfloat32_t v105 = svadd_f32_x(svptrue_b32(), v58, v101); svfloat32_t v130 = svcmla_f32_x(pred_full, v110, v1773, v110, 90); svfloat32_t v131 = svcmla_f32_x(pred_full, v122, v1649, v122, 90); - svfloat32_t zero239; - asm volatile("mov %0.s, #0" : "=w"(zero239)); + svfloat32_t zero239 = svdup_n_f32(0); svfloat32_t v239 = svcmla_f32_x(pred_full, zero239, v1649, v232, 90); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v102), "w"(v231)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v102), "w"(v231)); - svfloat32_t v248; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v186), "w"(v1564)); - svfloat32_t v260; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v229), "w"(v1728)); - svfloat32_t v326; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v187), "w"(v1728)); - svfloat32_t v338; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v230), "w"(v1730)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v401), "w"(v444)); - svfloat32_t v448; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v401), "w"(v444)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v102, v231); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v102, v231); + svfloat32_t v248 = svmul_f32_x(svptrue_b32(), v186, v1564); + svfloat32_t v260 = svmul_f32_x(svptrue_b32(), v229, v1728); + svfloat32_t v326 = svmul_f32_x(svptrue_b32(), v187, v1728); + svfloat32_t v338 = svmul_f32_x(svptrue_b32(), v230, v1730); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v401, v444); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v401, v444); svfloat32_t v473 = svcmla_f32_x(pred_full, v453, v1773, v453, 90); svfloat32_t v474 = svcmla_f32_x(pred_full, v465, v1649, v465, 90); - svfloat32_t v574; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v528), "w"(v571)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v528), "w"(v571)); + svfloat32_t v574 = 
svsub_f32_x(svptrue_b32(), v528, v571); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v528, v571); svfloat32_t v600 = svcmla_f32_x(pred_full, v580, v1773, v580, 90); svfloat32_t v601 = svcmla_f32_x(pred_full, v592, v1649, v592, 90); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v445), "w"(v572)); - svfloat32_t v616; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v445), "w"(v572)); - svfloat32_t v877; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v877) : "w"(v446), "w"(v1646)); - svfloat32_t v889; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v889) : "w"(v573), "w"(v1648)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v130), "w"(v131)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v131), "w"(v130)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v103), "w"(v239)); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v103), "w"(v239)); + svfloat32_t v615 = svadd_f32_x(svptrue_b32(), v445, v572); + svfloat32_t v616 = svsub_f32_x(svptrue_b32(), v445, v572); + svfloat32_t v877 = svmul_f32_x(svptrue_b32(), v446, v1646); + svfloat32_t v889 = svmul_f32_x(svptrue_b32(), v573, v1648); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v130, v131); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v131, v130); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v103, v239); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v103, v239); svfloat32_t v268 = svcmla_f32_x(pred_full, v248, v1565, v186, 90); svfloat32_t v269 = svcmla_f32_x(pred_full, v260, v1729, v229, 90); svfloat32_t v307 = svcmla_f32_x(pred_full, v287, v1773, v287, 90); svfloat32_t v308 = svcmla_f32_x(pred_full, v299, v1649, v299, 90); svfloat32_t v346 = svcmla_f32_x(pred_full, v326, v1729, v187, 90); svfloat32_t v347 = svcmla_f32_x(pred_full, v338, v1731, v230, 90); - svfloat32_t v475; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v473), "w"(v474)); - svfloat32_t v476; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v476) : "w"(v474), 
"w"(v473)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v600), "w"(v601)); - svfloat32_t v603; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v601), "w"(v600)); - svfloat32_t zero623; - asm volatile("mov %0.s, #0" : "=w"(zero623)); + svfloat32_t v475 = svadd_f32_x(svptrue_b32(), v473, v474); + svfloat32_t v476 = svsub_f32_x(svptrue_b32(), v474, v473); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v600, v601); + svfloat32_t v603 = svsub_f32_x(svptrue_b32(), v601, v600); + svfloat32_t zero623 = svdup_n_f32(0); svfloat32_t v623 = svcmla_f32_x(pred_full, zero623, v1649, v616, 90); - svfloat32_t v624; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v240), "w"(v615)); - svfloat32_t v625; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v240), "w"(v615)); - svfloat32_t v735; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v735) : "w"(v447), "w"(v1564)); - svfloat32_t v747; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v747) : "w"(v574), "w"(v1728)); - svfloat32_t v1019; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1019) : "w"(v448), "w"(v1728)); - svfloat32_t v1031; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1031) : "w"(v575), "w"(v1730)); - svfloat32_t zero140; - asm volatile("mov %0.s, #0" : "=w"(zero140)); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v240, v615); + svfloat32_t v625 = svsub_f32_x(svptrue_b32(), v240, v615); + svfloat32_t v735 = svmul_f32_x(svptrue_b32(), v447, v1564); + svfloat32_t v747 = svmul_f32_x(svptrue_b32(), v574, v1728); + svfloat32_t v1019 = svmul_f32_x(svptrue_b32(), v448, v1728); + svfloat32_t v1031 = svmul_f32_x(svptrue_b32(), v575, v1730); + svfloat32_t zero140 = svdup_n_f32(0); svfloat32_t v140 = svcmla_f32_x(pred_full, zero140, v1773, v133, 90); - svfloat32_t v141; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v59), "w"(v132)); - svfloat32_t v142; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v59), "w"(v132)); - svfloat32_t v270; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v270) : "w"(v268), "w"(v269)); - svfloat32_t v271; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v271) : "w"(v269), "w"(v268)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v307), "w"(v308)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v308), "w"(v307)); - svfloat32_t v348; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v348) : "w"(v346), "w"(v347)); - svfloat32_t v349; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v349) : "w"(v347), "w"(v346)); - svfloat32_t zero483; - asm volatile("mov %0.s, #0" : "=w"(zero483)); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v59, v132); + svfloat32_t v142 = svsub_f32_x(svptrue_b32(), v59, v132); + svfloat32_t v270 = svadd_f32_x(svptrue_b32(), v268, v269); + svfloat32_t v271 = svsub_f32_x(svptrue_b32(), v269, v268); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v307, v308); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v308, v307); + svfloat32_t v348 = svadd_f32_x(svptrue_b32(), v346, v347); + svfloat32_t v349 = svsub_f32_x(svptrue_b32(), v347, v346); + svfloat32_t zero483 = svdup_n_f32(0); svfloat32_t v483 = svcmla_f32_x(pred_full, zero483, v1773, v476, 90); - svfloat32_t v484; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v402), "w"(v475)); - svfloat32_t v485; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v402), "w"(v475)); - svfloat32_t zero610; - asm volatile("mov %0.s, #0" : "=w"(zero610)); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v402, v475); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v402, v475); + svfloat32_t zero610 = svdup_n_f32(0); svfloat32_t v610 = svcmla_f32_x(pred_full, zero610, v1773, v603, 90); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v529), "w"(v602)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v529), "w"(v602)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v241), "w"(v623)); - svfloat32_t v627; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v241), "w"(v623)); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v529, v602); + svfloat32_t v612 = 
svsub_f32_x(svptrue_b32(), v529, v602); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v241, v623); + svfloat32_t v627 = svadd_f32_x(svptrue_b32(), v241, v623); svint16_t v630 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), @@ -20879,35 +12945,22 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svfloat32_t v898 = svcmla_f32_x(pred_full, v889, v1649, v889, 90); svfloat32_t v1039 = svcmla_f32_x(pred_full, v1019, v1729, v448, 90); svfloat32_t v1040 = svcmla_f32_x(pred_full, v1031, v1731, v575, 90); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v60), "w"(v140)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v60), "w"(v140)); - svfloat32_t zero278; - asm volatile("mov %0.s, #0" : "=w"(zero278)); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v60, v140); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v60, v140); + svfloat32_t zero278 = svdup_n_f32(0); svfloat32_t v278 = svcmla_f32_x(pred_full, zero278, v1773, v271, 90); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v141), "w"(v270)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v141), "w"(v270)); - svfloat32_t zero317; - asm volatile("mov %0.s, #0" : "=w"(zero317)); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v141, v270); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v141, v270); + svfloat32_t zero317 = svdup_n_f32(0); svfloat32_t v317 = svcmla_f32_x(pred_full, zero317, v1773, v310, 90); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v104), "w"(v309)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v104), "w"(v309)); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v104, v309); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v104, v309); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = 
svcmla_f32_x(pred_full, zero356, v1773, v349, 90); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v403), "w"(v483)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v403), "w"(v483)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v530), "w"(v610)); - svfloat32_t v614; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v530), "w"(v610)); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v403, v483); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v403, v483); + svfloat32_t v613 = svsub_f32_x(svptrue_b32(), v530, v610); + svfloat32_t v614 = svadd_f32_x(svptrue_b32(), v530, v610); svint16_t v638 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v626, (float)(1ULL << 31ULL)))), @@ -20918,83 +12971,50 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v627, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v664; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v664) : "w"(v484), "w"(v1523)); - svfloat32_t v676; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v676) : "w"(v611), "w"(v1605)); - svfloat32_t v757; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v757) : "w"(v755), "w"(v756)); - svfloat32_t v758; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v758) : "w"(v756), "w"(v755)); - svfloat32_t v899; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v899) : "w"(v897), "w"(v898)); - svfloat32_t v900; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v900) : "w"(v898), "w"(v897)); - svfloat32_t v948; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v948) : "w"(v485), "w"(v1687)); - svfloat32_t v960; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v960) : "w"(v612), "w"(v1689)); - svfloat32_t v1041; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1041) : "w"(v1039), "w"(v1040)); - svfloat32_t v1042; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1042) : "w"(v1040), "w"(v1039)); + svfloat32_t v664 = svmul_f32_x(svptrue_b32(), v484, v1523); + 
svfloat32_t v676 = svmul_f32_x(svptrue_b32(), v611, v1605); + svfloat32_t v757 = svadd_f32_x(svptrue_b32(), v755, v756); + svfloat32_t v758 = svsub_f32_x(svptrue_b32(), v756, v755); + svfloat32_t v899 = svadd_f32_x(svptrue_b32(), v897, v898); + svfloat32_t v900 = svsub_f32_x(svptrue_b32(), v898, v897); + svfloat32_t v948 = svmul_f32_x(svptrue_b32(), v485, v1687); + svfloat32_t v960 = svmul_f32_x(svptrue_b32(), v612, v1689); + svfloat32_t v1041 = svadd_f32_x(svptrue_b32(), v1039, v1040); + svfloat32_t v1042 = svsub_f32_x(svptrue_b32(), v1040, v1039); svst1w_u64(pred_full, (unsigned *)(v1493), svreinterpret_u64_s16(v630)); svst1w_u64(pred_full, (unsigned *)(v1511), svreinterpret_u64_s16(v646)); - svfloat32_t v281; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v142), "w"(v278)); - svfloat32_t v282; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v142), "w"(v278)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v105), "w"(v317)); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v105), "w"(v317)); - svfloat32_t v357; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v357) : "w"(v143), "w"(v348)); - svfloat32_t v358; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v358) : "w"(v143), "w"(v348)); - svfloat32_t v359; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v359) : "w"(v144), "w"(v356)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v144), "w"(v356)); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v142, v278); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v142, v278); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v105, v317); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v105, v317); + svfloat32_t v357 = svadd_f32_x(svptrue_b32(), v143, v348); + svfloat32_t v358 = svsub_f32_x(svptrue_b32(), v143, v348); + svfloat32_t v359 = svsub_f32_x(svptrue_b32(), v144, v356); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v144, v356); svfloat32_t v684 = svcmla_f32_x(pred_full, v664, v1690, v484, 90); svfloat32_t v685 = 
svcmla_f32_x(pred_full, v676, v1606, v611, 90); - svfloat32_t zero765; - asm volatile("mov %0.s, #0" : "=w"(zero765)); + svfloat32_t zero765 = svdup_n_f32(0); svfloat32_t v765 = svcmla_f32_x(pred_full, zero765, v1773, v758, 90); - svfloat32_t v766; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v766) : "w"(v318), "w"(v757)); - svfloat32_t v767; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v318), "w"(v757)); - svfloat32_t v806; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v806) : "w"(v486), "w"(v1605)); - svfloat32_t v818; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v818) : "w"(v613), "w"(v1607)); - svfloat32_t zero907; - asm volatile("mov %0.s, #0" : "=w"(zero907)); + svfloat32_t v766 = svadd_f32_x(svptrue_b32(), v318, v757); + svfloat32_t v767 = svsub_f32_x(svptrue_b32(), v318, v757); + svfloat32_t v806 = svmul_f32_x(svptrue_b32(), v486, v1605); + svfloat32_t v818 = svmul_f32_x(svptrue_b32(), v613, v1607); + svfloat32_t zero907 = svdup_n_f32(0); svfloat32_t v907 = svcmla_f32_x(pred_full, zero907, v1773, v900, 90); - svfloat32_t v908; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v908) : "w"(v242), "w"(v899)); - svfloat32_t v909; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v909) : "w"(v242), "w"(v899)); + svfloat32_t v908 = svadd_f32_x(svptrue_b32(), v242, v899); + svfloat32_t v909 = svsub_f32_x(svptrue_b32(), v242, v899); svfloat32_t v968 = svcmla_f32_x(pred_full, v948, v1688, v485, 90); svfloat32_t v969 = svcmla_f32_x(pred_full, v960, v1690, v612, 90); - svfloat32_t zero1049; - asm volatile("mov %0.s, #0" : "=w"(zero1049)); + svfloat32_t zero1049 = svdup_n_f32(0); svfloat32_t v1049 = svcmla_f32_x(pred_full, zero1049, v1773, v1042, 90); - svfloat32_t v1090; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1090) : "w"(v487), "w"(v1769)); - svfloat32_t v1102; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1102) : "w"(v614), "w"(v1771)); + svfloat32_t v1090 = svmul_f32_x(svptrue_b32(), v487, v1769); + svfloat32_t v1102 = svmul_f32_x(svptrue_b32(), v614, v1771); svst1w_u64(pred_full, (unsigned *)(v1502), 
svreinterpret_u64_s16(v638)); svst1w_u64(pred_full, (unsigned *)(v1520), svreinterpret_u64_s16(v654)); - svfloat32_t v686; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v686) : "w"(v684), "w"(v685)); - svfloat32_t v687; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v685), "w"(v684)); - svfloat32_t v768; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v768) : "w"(v319), "w"(v765)); - svfloat32_t v769; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v769) : "w"(v319), "w"(v765)); + svfloat32_t v686 = svadd_f32_x(svptrue_b32(), v684, v685); + svfloat32_t v687 = svsub_f32_x(svptrue_b32(), v685, v684); + svfloat32_t v768 = svsub_f32_x(svptrue_b32(), v319, v765); + svfloat32_t v769 = svadd_f32_x(svptrue_b32(), v319, v765); svint16_t v772 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v766, (float)(1ULL << 31ULL)))), @@ -21007,10 +13027,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); svfloat32_t v826 = svcmla_f32_x(pred_full, v806, v1606, v486, 90); svfloat32_t v827 = svcmla_f32_x(pred_full, v818, v1770, v613, 90); - svfloat32_t v910; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v910) : "w"(v243), "w"(v907)); - svfloat32_t v911; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v911) : "w"(v243), "w"(v907)); + svfloat32_t v910 = svsub_f32_x(svptrue_b32(), v243, v907); + svfloat32_t v911 = svadd_f32_x(svptrue_b32(), v243, v907); svint16_t v914 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v908, (float)(1ULL << 31ULL)))), @@ -21021,27 +13039,18 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v909, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v970; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v970) : "w"(v968), "w"(v969)); - svfloat32_t v971; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v971) : "w"(v969), 
"w"(v968)); - svfloat32_t v1050; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1050) : "w"(v320), "w"(v1041)); - svfloat32_t v1051; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1051) : "w"(v320), "w"(v1041)); - svfloat32_t v1052; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1052) : "w"(v321), "w"(v1049)); - svfloat32_t v1053; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1053) : "w"(v321), "w"(v1049)); + svfloat32_t v970 = svadd_f32_x(svptrue_b32(), v968, v969); + svfloat32_t v971 = svsub_f32_x(svptrue_b32(), v969, v968); + svfloat32_t v1050 = svadd_f32_x(svptrue_b32(), v320, v1041); + svfloat32_t v1051 = svsub_f32_x(svptrue_b32(), v320, v1041); + svfloat32_t v1052 = svsub_f32_x(svptrue_b32(), v321, v1049); + svfloat32_t v1053 = svadd_f32_x(svptrue_b32(), v321, v1049); svfloat32_t v1110 = svcmla_f32_x(pred_full, v1090, v1770, v487, 90); svfloat32_t v1111 = svcmla_f32_x(pred_full, v1102, v1772, v614, 90); - svfloat32_t zero694; - asm volatile("mov %0.s, #0" : "=w"(zero694)); + svfloat32_t zero694 = svdup_n_f32(0); svfloat32_t v694 = svcmla_f32_x(pred_full, zero694, v1773, v687, 90); - svfloat32_t v695; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v695) : "w"(v279), "w"(v686)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v279), "w"(v686)); + svfloat32_t v695 = svadd_f32_x(svptrue_b32(), v279, v686); + svfloat32_t v696 = svsub_f32_x(svptrue_b32(), v279, v686); svint16_t v780 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v768, (float)(1ULL << 31ULL)))), @@ -21052,10 +13061,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v769, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v828; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v828) : "w"(v826), "w"(v827)); - svfloat32_t v829; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v829) : "w"(v827), "w"(v826)); + svfloat32_t v828 = svadd_f32_x(svptrue_b32(), v826, v827); + 
svfloat32_t v829 = svsub_f32_x(svptrue_b32(), v827, v826); svint16_t v922 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v910, (float)(1ULL << 31ULL)))), @@ -21066,13 +13073,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v911, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t zero978; - asm volatile("mov %0.s, #0" : "=w"(zero978)); + svfloat32_t zero978 = svdup_n_f32(0); svfloat32_t v978 = svcmla_f32_x(pred_full, zero978, v1773, v971, 90); - svfloat32_t v979; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v979) : "w"(v281), "w"(v970)); - svfloat32_t v980; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v980) : "w"(v281), "w"(v970)); + svfloat32_t v979 = svadd_f32_x(svptrue_b32(), v281, v970); + svfloat32_t v980 = svsub_f32_x(svptrue_b32(), v281, v970); svint16_t v1056 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, @@ -21097,18 +13101,14 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, svmul_n_f32_x(pred_full, v1053, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1112) : "w"(v1110), "w"(v1111)); - svfloat32_t v1113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1113) : "w"(v1111), "w"(v1110)); + svfloat32_t v1112 = svadd_f32_x(svptrue_b32(), v1110, v1111); + svfloat32_t v1113 = svsub_f32_x(svptrue_b32(), v1111, v1110); svst1w_u64(pred_full, (unsigned *)(v1575), svreinterpret_u64_s16(v772)); svst1w_u64(pred_full, (unsigned *)(v1593), svreinterpret_u64_s16(v788)); svst1w_u64(pred_full, (unsigned *)(v1657), svreinterpret_u64_s16(v914)); svst1w_u64(pred_full, (unsigned *)(v1675), svreinterpret_u64_s16(v930)); - svfloat32_t v697; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v697) : "w"(v280), "w"(v694)); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v698) : "w"(v280), "w"(v694)); + svfloat32_t v697 = svsub_f32_x(svptrue_b32(), v280, v694); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v280, v694); svint16_t v701 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v695, (float)(1ULL << 31ULL)))), @@ -21119,17 +13119,12 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v696, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t zero836; - asm volatile("mov %0.s, #0" : "=w"(zero836)); + svfloat32_t zero836 = svdup_n_f32(0); svfloat32_t v836 = svcmla_f32_x(pred_full, zero836, v1773, v829, 90); - svfloat32_t v837; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v837) : "w"(v357), "w"(v828)); - svfloat32_t v838; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v357), "w"(v828)); - svfloat32_t v981; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v981) : "w"(v282), "w"(v978)); - svfloat32_t v982; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v982) : "w"(v282), "w"(v978)); + svfloat32_t v837 = svadd_f32_x(svptrue_b32(), v357, v828); + svfloat32_t v838 = svsub_f32_x(svptrue_b32(), v357, v828); + svfloat32_t v981 = svsub_f32_x(svptrue_b32(), v282, v978); + svfloat32_t v982 = svadd_f32_x(svptrue_b32(), v282, v978); svint16_t v985 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v979, (float)(1ULL << 31ULL)))), @@ -21140,13 +13135,10 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v980, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t zero1120; - asm volatile("mov %0.s, #0" : "=w"(zero1120)); + svfloat32_t zero1120 = svdup_n_f32(0); svfloat32_t v1120 = svcmla_f32_x(pred_full, zero1120, v1773, v1113, 90); - svfloat32_t v1121; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1121) : "w"(v359), 
"w"(v1112)); - svfloat32_t v1122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1122) : "w"(v359), "w"(v1112)); + svfloat32_t v1121 = svadd_f32_x(svptrue_b32(), v359, v1112); + svfloat32_t v1122 = svsub_f32_x(svptrue_b32(), v359, v1112); svst1w_u64(pred_full, (unsigned *)(v1584), svreinterpret_u64_s16(v780)); svst1w_u64(pred_full, (unsigned *)(v1602), svreinterpret_u64_s16(v796)); svst1w_u64(pred_full, (unsigned *)(v1666), svreinterpret_u64_s16(v922)); @@ -21165,10 +13157,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v698, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v839; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v839) : "w"(v358), "w"(v836)); - svfloat32_t v840; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v358), "w"(v836)); + svfloat32_t v839 = svsub_f32_x(svptrue_b32(), v358, v836); + svfloat32_t v840 = svadd_f32_x(svptrue_b32(), v358, v836); svint16_t v843 = svtbl_s16( svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, svmul_n_f32_x(pred_full, v837, (float)(1ULL << 31ULL)))), @@ -21189,10 +13179,8 @@ void armral_fft_cf32_cf32_cs16_ac_n_uu32(const armral_cmplx_f32_t *restrict x, pred_full, svmul_n_f32_x(pred_full, v982, (float)(1ULL << 31ULL)))), svreinterpret_u16_u64( svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1123) : "w"(v360), "w"(v1120)); - svfloat32_t v1124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1124) : "w"(v360), "w"(v1120)); + svfloat32_t v1123 = svsub_f32_x(svptrue_b32(), v360, v1120); + svfloat32_t v1124 = svadd_f32_x(svptrue_b32(), v360, v1120); svint16_t v1127 = svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( pred_full, diff --git a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h index 85fe5a07a56be409546ac9e440f0722216c71780..5f87847de4e9a005851d3a153ead9b8b7df8e524 100644 --- 
a/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c index d656718b9e134cb8770cb29fa6bc1111972c785b..2520ecdc1d8f5da4569870f6d46246af7927c065 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c @@ -1,13 +1,15 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cf32_kernel_lookup.h" #include -#define NUM_FFT_CF32_BASE_KERNELS 33 +#define NUM_FFT_CF32_BASE_KERNELS 41 static cf32_cf32_cf32_ac_n_uu_fft_t *base_cf32_cf32_cf32_ac_n_uu_kernels[NUM_FFT_CF32_BASE_KERNELS] = { @@ -39,31 +41,84 @@ static cf32_cf32_cf32_ac_n_uu_fft_t armral_fft_cf32_cf32_cf32_ac_n_uu25, NULL, NULL, + armral_fft_cf32_cf32_cf32_ac_n_uu28, + NULL, + armral_fft_cf32_cf32_cf32_ac_n_uu30, + NULL, + armral_fft_cf32_cf32_cf32_ac_n_uu32, + NULL, + NULL, + NULL, + armral_fft_cf32_cf32_cf32_ac_n_uu36, + NULL, + NULL, + NULL, + armral_fft_cf32_cf32_cf32_ac_n_uu40, +}; + +static cf32_cf32_cf32_ac_n_uun_fft_t + *base_cf32_cf32_cf32_ac_n_uun_kernels[NUM_FFT_CF32_BASE_KERNELS] = { + NULL, + NULL, + armral_fft_cf32_cf32_cf32_ac_n_uun2, + armral_fft_cf32_cf32_cf32_ac_n_uun3, + armral_fft_cf32_cf32_cf32_ac_n_uun4, + armral_fft_cf32_cf32_cf32_ac_n_uun5, + armral_fft_cf32_cf32_cf32_ac_n_uun6, + armral_fft_cf32_cf32_cf32_ac_n_uun7, + armral_fft_cf32_cf32_cf32_ac_n_uun8, + armral_fft_cf32_cf32_cf32_ac_n_uun9, + armral_fft_cf32_cf32_cf32_ac_n_uun10, + 
armral_fft_cf32_cf32_cf32_ac_n_uun11, + armral_fft_cf32_cf32_cf32_ac_n_uun12, + armral_fft_cf32_cf32_cf32_ac_n_uun13, + armral_fft_cf32_cf32_cf32_ac_n_uun14, + armral_fft_cf32_cf32_cf32_ac_n_uun15, + armral_fft_cf32_cf32_cf32_ac_n_uun16, + armral_fft_cf32_cf32_cf32_ac_n_uun17, + armral_fft_cf32_cf32_cf32_ac_n_uun18, + armral_fft_cf32_cf32_cf32_ac_n_uun19, + armral_fft_cf32_cf32_cf32_ac_n_uun20, + armral_fft_cf32_cf32_cf32_ac_n_uun21, + armral_fft_cf32_cf32_cf32_ac_n_uun22, + NULL, + armral_fft_cf32_cf32_cf32_ac_n_uun24, + armral_fft_cf32_cf32_cf32_ac_n_uun25, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + armral_fft_cf32_cf32_cf32_ac_n_uun32, + NULL, + NULL, + NULL, + NULL, NULL, NULL, NULL, NULL, - armral_fft_cf32_cf32_cf32_ac_n_uu32, }; static cf32_cf32_cf32_ac_n_gu_fft_t *base_cf32_cf32_cf32_ac_n_gu_kernels[NUM_FFT_CF32_BASE_KERNELS] = { NULL, NULL, - armral_fft_cf32_cf32_cf32_ac_n_gu2, - armral_fft_cf32_cf32_cf32_ac_n_gu3, - armral_fft_cf32_cf32_cf32_ac_n_gu4, - armral_fft_cf32_cf32_cf32_ac_n_gu5, - armral_fft_cf32_cf32_cf32_ac_n_gu6, - armral_fft_cf32_cf32_cf32_ac_n_gu7, - armral_fft_cf32_cf32_cf32_ac_n_gu8, - armral_fft_cf32_cf32_cf32_ac_n_gu9, - armral_fft_cf32_cf32_cf32_ac_n_gu10, - armral_fft_cf32_cf32_cf32_ac_n_gu11, - armral_fft_cf32_cf32_cf32_ac_n_gu12, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, armral_fft_cf32_cf32_cf32_ac_n_gu13, armral_fft_cf32_cf32_cf32_ac_n_gu14, - armral_fft_cf32_cf32_cf32_ac_n_gu15, + NULL, armral_fft_cf32_cf32_cf32_ac_n_gu16, armral_fft_cf32_cf32_cf32_ac_n_gu17, armral_fft_cf32_cf32_cf32_ac_n_gu18, @@ -81,6 +136,14 @@ static cf32_cf32_cf32_ac_n_gu_fft_t NULL, NULL, armral_fft_cf32_cf32_cf32_ac_n_gu32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, }; static cf32_cf32_cf32_ab_t_gu_fft_t @@ -118,6 +181,14 @@ static cf32_cf32_cf32_ab_t_gu_fft_t NULL, NULL, armral_fft_cf32_cf32_cf32_ab_t_gu32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, }; static 
cf32_cf32_cf32_ab_t_gs_fft_t @@ -154,24 +225,32 @@ static cf32_cf32_cf32_ab_t_gs_fft_t NULL, NULL, NULL, - armral_fft_cf32_cf32_cf32_ab_t_gs32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, }; static cf32_cf32_cf32_ac_t_uu_fft_t *base_cf32_cf32_cf32_ac_t_uu_kernels[NUM_FFT_CF32_BASE_KERNELS] = { NULL, NULL, - armral_fft_cf32_cf32_cf32_ac_t_uu2, - armral_fft_cf32_cf32_cf32_ac_t_uu3, - armral_fft_cf32_cf32_cf32_ac_t_uu4, - armral_fft_cf32_cf32_cf32_ac_t_uu5, - armral_fft_cf32_cf32_cf32_ac_t_uu6, + NULL, + NULL, + NULL, + NULL, + NULL, armral_fft_cf32_cf32_cf32_ac_t_uu7, - armral_fft_cf32_cf32_cf32_ac_t_uu8, + NULL, armral_fft_cf32_cf32_cf32_ac_t_uu9, - armral_fft_cf32_cf32_cf32_ac_t_uu10, + NULL, armral_fft_cf32_cf32_cf32_ac_t_uu11, - armral_fft_cf32_cf32_cf32_ac_t_uu12, + NULL, armral_fft_cf32_cf32_cf32_ac_t_uu13, armral_fft_cf32_cf32_cf32_ac_t_uu14, armral_fft_cf32_cf32_cf32_ac_t_uu15, @@ -192,6 +271,14 @@ static cf32_cf32_cf32_ac_t_uu_fft_t NULL, NULL, armral_fft_cf32_cf32_cf32_ac_t_uu32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, }; cf32_cf32_cf32_ac_n_uu_fft_t * @@ -202,6 +289,14 @@ lookup_ac_uu_base_kernel_cf32_cf32(int n, armral_fft_direction_t dir) { return base_cf32_cf32_cf32_ac_n_uu_kernels[n]; } +cf32_cf32_cf32_ac_n_uun_fft_t * +lookup_ac_uun_base_kernel_cf32_cf32(int n, armral_fft_direction_t dir) { + if (n >= NUM_FFT_CF32_BASE_KERNELS) { + return NULL; + } + return base_cf32_cf32_cf32_ac_n_uun_kernels[n]; +} + cf32_cf32_cf32_ac_n_gu_fft_t * lookup_ac_gu_base_kernel_cf32_cf32(int n, armral_fft_direction_t dir) { if (n >= NUM_FFT_CF32_BASE_KERNELS) { diff --git a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h index fcef99b1fbcb91f583521224559a549e2cf0e669..7b570fb03a1439baeb0235bf11334bffc1f6dc55 100644 --- a/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cf32_kernel_lookup.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: 
Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -8,6 +10,7 @@ #include "fft_cf32_cf32_cf32_ab_t_gu.h" #include "fft_cf32_cf32_cf32_ac_n_gu.h" #include "fft_cf32_cf32_cf32_ac_n_uu.h" +#include "fft_cf32_cf32_cf32_ac_n_uun.h" #include "fft_cf32_cf32_cf32_ac_t_uu.h" #ifdef __cplusplus @@ -17,6 +20,9 @@ extern "C" { cf32_cf32_cf32_ac_n_uu_fft_t * lookup_ac_uu_base_kernel_cf32_cf32(int n, armral_fft_direction_t dir); +cf32_cf32_cf32_ac_n_uun_fft_t * +lookup_ac_uun_base_kernel_cf32_cf32(int n, armral_fft_direction_t dir); + cf32_cf32_cf32_ac_n_gu_fft_t * lookup_ac_gu_base_kernel_cf32_cf32(int n, armral_fft_direction_t dir); diff --git a/src/LowerPHY/FFT/fft_cs16.cpp b/src/LowerPHY/FFT/fft_cs16.cpp index 856eda0584ebd54581291709f8bf97015272f440..9728d7b4e4a451f23db58660448067ff4e7821b5 100644 --- a/src/LowerPHY/FFT/fft_cs16.cpp +++ b/src/LowerPHY/FFT/fft_cs16.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_execute.hpp" diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c index 2181663e18674b8c0cd5dc82fa1e8b38653b5bf5..ec622ca8522251c362056e16177f987415fcd1d2 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cs16_cf32_cf32_ac_n_uu.h" @@ -335,84 +337,51 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu7(const armral_cmplx_int16_t *restrict x, 
svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v263[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v72; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v72) : "w"(v34), "w"(v52)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v34), "w"(v52)); - svfloat32_t v84; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v52), "w"(v70)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v70), "w"(v34)); - svfloat32_t v86; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v86) : "w"(v35), "w"(v53)); - svfloat32_t v88; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v35), "w"(v53)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v53), "w"(v71)); - svfloat32_t v90; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v71), "w"(v35)); - svfloat32_t v73; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v73) : "w"(v72), "w"(v70)); - svfloat32_t v87; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v86), "w"(v71)); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v83 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v84 
= svsub_f32_x(svptrue_b32(), v52, v70); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v70, v34); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v88 = svsub_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v53, v71); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v71, v35); + svfloat32_t v73 = svadd_f32_x(svptrue_b32(), v72, v70); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v86, v71); + svfloat32_t zero129 = svdup_n_f32(0); svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, v282, v88, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); + svfloat32_t zero136 = svdup_n_f32(0); svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v283, v89, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); + svfloat32_t zero143 = svdup_n_f32(0); svfloat32_t v143 = svcmla_f32_x(pred_full, zero143, v284, v90, 90); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v73), "w"(v81)); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v73, v81); + svfloat32_t zero122 = svdup_n_f32(0); svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v281, v87, 90); svfloat32_t v144 = svmla_f32_x(pred_full, v82, v73, v277); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v122), "w"(v129)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v122), "w"(v129)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v122), "w"(v136)); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v122, v129); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v122, v129); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v122, v136); svst1_f64(pred_full, (double *)(v292), svreinterpret_f64_f32(v82)); svfloat32_t v145 = svmla_f32_x(pred_full, v144, v83, v278); svfloat32_t v147 = svmls_f32_x(pred_full, v144, v83, v278); svfloat32_t v149 = svmls_f32_x(pred_full, v144, v84, v279); - 
svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v151), "w"(v136)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v153), "w"(v143)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v155), "w"(v143)); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v151, v136); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v153, v143); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v155, v143); svfloat32_t v146 = svmla_f32_x(pred_full, v145, v84, v279); svfloat32_t v148 = svmls_f32_x(pred_full, v147, v85, v280); svfloat32_t v150 = svmla_f32_x(pred_full, v149, v85, v280); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v146), "w"(v152)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v146), "w"(v152)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v148), "w"(v154)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v148), "w"(v154)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v150), "w"(v156)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v150), "w"(v156)); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v146, v152); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v146, v152); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v148, v154); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v148, v154); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v150, v156); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v150, v156); svst1_f64(pred_full, (double *)(v301), svreinterpret_f64_f32(v158)); svst1_f64(pred_full, (double *)(v310), svreinterpret_f64_f32(v160)); svst1_f64(pred_full, (double *)(v319), svreinterpret_f64_f32(v161)); @@ -816,88 +785,51 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu9(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v332[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : 
"w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v34), "w"(v52)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v35), "w"(v53)); - svfloat32_t v104; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v34), "w"(v52)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v52), "w"(v88)); - svfloat32_t v106; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v88), "w"(v34)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v35), "w"(v53)); - svfloat32_t v108; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v53), "w"(v89)); - svfloat32_t v109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v89), "w"(v35)); - svfloat32_t zero138; - asm volatile("mov %0.s, #0" : "=w"(zero138)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v34, 
v52); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v52, v88); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v88, v34); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v35, v53); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v89, v35); + svfloat32_t zero138 = svdup_n_f32(0); svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v349, v71, 90); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v90), "w"(v88)); - svfloat32_t v103; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v102), "w"(v89)); - svfloat32_t zero160; - asm volatile("mov %0.s, #0" : "=w"(zero160)); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v90, v88); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v102, v89); + svfloat32_t zero160 = svdup_n_f32(0); svfloat32_t v160 = svcmla_f32_x(pred_full, zero160, v353, v107, 90); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); + svfloat32_t zero167 = svdup_n_f32(0); svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v354, v108, 90); - svfloat32_t zero174; - asm volatile("mov %0.s, #0" : "=w"(zero174)); + svfloat32_t zero174 = svdup_n_f32(0); svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v355, v109, 90); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v91), "w"(v70)); - svfloat32_t v119; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v91), "w"(v346)); - svfloat32_t zero126; - asm volatile("mov %0.s, #0" : "=w"(zero126)); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v91, v70); + svfloat32_t v119 = svmul_f32_x(svptrue_b32(), v91, v346); + svfloat32_t zero126 = svdup_n_f32(0); svfloat32_t v126 = svcmla_f32_x(pred_full, zero126, v349, v103, 90); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v138), "w"(v160)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v138), "w"(v167)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v138), "w"(v160)); - svfloat32_t v101; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v92), "w"(v100)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v119), "w"(v119)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v167)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v174)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v174)); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v138, v160); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v138, v167); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v138, v160); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v92, v100); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v119, v119); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v167); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v174); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v192, v174); svfloat32_t v176 = svmla_f32_x(pred_full, v175, v91, v346); svfloat32_t v180 = svmla_f32_x(pred_full, v101, v70, v348); svst1_f64(pred_full, (double *)(v363), svreinterpret_f64_f32(v101)); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v101), "w"(v176)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v180), "w"(v175)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v177), "w"(v126)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v177), "w"(v126)); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v101, v176); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v180, v175); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v177, v126); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v177, v126); svfloat32_t v182 = svmla_f32_x(pred_full, v181, v104, v350); svfloat32_t v184 = svmls_f32_x(pred_full, v181, v105, v351); svfloat32_t v186 = svmls_f32_x(pred_full, v181, v104, v350); @@ -906,18 +838,12 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu9(const armral_cmplx_int16_t *restrict x, svfloat32_t v187 = 
svmls_f32_x(pred_full, v186, v106, v352); svst1_f64(pred_full, (double *)(v390), svreinterpret_f64_f32(v179)); svst1_f64(pred_full, (double *)(v417), svreinterpret_f64_f32(v178)); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v183), "w"(v189)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v183), "w"(v189)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v185), "w"(v191)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v185), "w"(v191)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v187), "w"(v193)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v187), "w"(v193)); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v183, v189); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v183, v189); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v185, v191); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v185, v191); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v187, v193); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v187, v193); svst1_f64(pred_full, (double *)(v372), svreinterpret_f64_f32(v195)); svst1_f64(pred_full, (double *)(v381), svreinterpret_f64_f32(v196)); svst1_f64(pred_full, (double *)(v399), svreinterpret_f64_f32(v199)); @@ -1570,105 +1496,58 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v479[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v51; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v42), "w"(v50)); - svfloat32_t v68; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v68) : "w"(v59), "w"(v67)); - svfloat32_t v85; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v76), "w"(v84)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v93), "w"(v101)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v25), "w"(v33)); 
- svfloat32_t v104; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v42), "w"(v50)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v59), "w"(v67)); - svfloat32_t v106; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v76), "w"(v84)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v93), "w"(v101)); - svfloat32_t v108; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v34), "w"(v51)); - svfloat32_t v109; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v68), "w"(v102)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v104), "w"(v105)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v103), "w"(v107)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v51), "w"(v85)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v34), "w"(v85)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v51), "w"(v34)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v102), "w"(v85)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v68), "w"(v85)); - svfloat32_t v130; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v102), "w"(v68)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v51), "w"(v102)); - svfloat32_t v132; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v34), "w"(v68)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v104), "w"(v106)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v103), "w"(v106)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v103), "w"(v104)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v106), "w"(v107)); - svfloat32_t v138; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v105), "w"(v106)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v105), "w"(v107)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v104), 
"w"(v107)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v103), "w"(v105)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v85), "w"(v108)); - svfloat32_t v123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v111), "w"(v112)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v109), "w"(v108)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v111), "w"(v112)); - svfloat32_t v169; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v126), "w"(v496)); - svfloat32_t v174; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v127), "w"(v497)); - svfloat32_t v184; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v129), "w"(v499)); - svfloat32_t v189; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v130), "w"(v500)); - svfloat32_t zero211; - asm volatile("mov %0.s, #0" : "=w"(zero211)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v51 = svadd_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v34, v51); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v68, v102); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v104, v105); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v103, v107); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v51, v85); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v34, v85); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v51, v34); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v102, v85); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v68, v85); 
+ svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v102, v68); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v51, v102); + svfloat32_t v132 = svsub_f32_x(svptrue_b32(), v34, v68); + svfloat32_t v134 = svadd_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v103, v106); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v103, v104); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v106, v107); + svfloat32_t v138 = svsub_f32_x(svptrue_b32(), v105, v106); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v105, v107); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v104, v107); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v103, v105); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v85, v108); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v111, v112); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v109, v108); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v111, v112); + svfloat32_t v169 = svmul_f32_x(svptrue_b32(), v126, v496); + svfloat32_t v174 = svmul_f32_x(svptrue_b32(), v127, v497); + svfloat32_t v184 = svmul_f32_x(svptrue_b32(), v129, v499); + svfloat32_t v189 = svmul_f32_x(svptrue_b32(), v130, v500); + svfloat32_t zero211 = svdup_n_f32(0); svfloat32_t v211 = svcmla_f32_x(pred_full, zero211, v504, v134, 90); - svfloat32_t zero225; - asm volatile("mov %0.s, #0" : "=w"(zero225)); + svfloat32_t zero225 = svdup_n_f32(0); svfloat32_t v225 = svcmla_f32_x(pred_full, zero225, v506, v136, 90); - svfloat32_t zero232; - asm volatile("mov %0.s, #0" : "=w"(zero232)); + svfloat32_t zero232 = svdup_n_f32(0); svfloat32_t v232 = svcmla_f32_x(pred_full, zero232, v507, v137, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); + svfloat32_t zero246 = svdup_n_f32(0); svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v509, v139, 90); - svfloat32_t zero253; - asm volatile("mov %0.s, #0" : "=w"(zero253)); + svfloat32_t zero253 = svdup_n_f32(0); svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v510, v140, 90); - svfloat32_t 
v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v110), "w"(v109)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v123), "w"(v106)); - svfloat32_t v204; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v133), "w"(v503)); - svfloat32_t zero267; - asm volatile("mov %0.s, #0" : "=w"(zero267)); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v110, v109); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v123, v106); + svfloat32_t v204 = svmul_f32_x(svptrue_b32(), v133, v503); + svfloat32_t zero267 = svdup_n_f32(0); svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v512, v142, 90); svfloat32_t v269 = svmla_f32_x(pred_full, v169, v125, v495); svfloat32_t v270 = svmla_f32_x(pred_full, v174, v126, v496); @@ -1677,87 +1556,50 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu11(const armral_cmplx_int16_t *restrict x, svfloat32_t v273 = svmla_f32_x(pred_full, v189, v129, v499); svfloat32_t v274 = svnmls_f32_x(pred_full, v189, v128, v498); svfloat32_t v277 = svcmla_f32_x(pred_full, v225, v505, v135, 90); - svfloat32_t v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v211), "w"(v225)); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v211, v225); svfloat32_t v279 = svcmla_f32_x(pred_full, v246, v508, v138, 90); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v232), "w"(v246)); - svfloat32_t v122; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v121), "w"(v113)); - svfloat32_t zero159; - asm volatile("mov %0.s, #0" : "=w"(zero159)); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v232, v246); + svfloat32_t v122 = svadd_f32_x(svptrue_b32(), v121, v113); + svfloat32_t zero159 = svdup_n_f32(0); svfloat32_t v159 = svcmla_f32_x(pred_full, zero159, v494, v124, 90); svfloat32_t v275 = svmla_f32_x(pred_full, v204, v132, v502); svfloat32_t v276 = svmla_f32_x(pred_full, v204, v131, v501); svfloat32_t v281 = svcmla_f32_x(pred_full, v267, v511, v141, 90); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v253), "w"(v267)); - 
svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v277), "w"(v278)); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v253, v267); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v277, v278); svfloat32_t v268 = svmls_f32_x(pred_full, v122, v113, v493); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v273), "w"(v275)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v159), "w"(v279)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v281), "w"(v277)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v159), "w"(v282)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v282), "w"(v278)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v301), "w"(v279)); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v273, v275); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v159, v279); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v281, v277); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v159, v282); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v282, v278); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v301, v279); svst1_f64(pred_full, (double *)(v520), svreinterpret_f64_f32(v122)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v283), "w"(v268)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v268), "w"(v270)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v268), "w"(v274)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v268), "w"(v271)); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v268), "w"(v269)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v293), "w"(v281)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v295), "w"(v159)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v297), "w"(v280)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) 
: "w"(v299), "w"(v159)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v302), "w"(v280)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v285), "w"(v275)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v276)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v289), "w"(v276)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v272)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v303), "w"(v159)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v284), "w"(v294)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v284), "w"(v294)); - svfloat32_t v305; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v292), "w"(v304)); - svfloat32_t v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v286), "w"(v296)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v288), "w"(v298)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v290), "w"(v300)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v290), "w"(v300)); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v288), "w"(v298)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v286), "w"(v296)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v292), "w"(v304)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v268); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v268, v270); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v268, v274); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v268, v271); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v268, v269); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v281); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v159); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v280); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v299, v159); + svfloat32_t 
v303 = svadd_f32_x(svptrue_b32(), v302, v280); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v285, v275); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v287, v276); + svfloat32_t v290 = svsub_f32_x(svptrue_b32(), v289, v276); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v291, v272); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v303, v159); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v292, v304); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v286, v296); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v288, v298); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v290, v300); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v290, v300); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v288, v298); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v286, v296); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v292, v304); svst1_f64(pred_full, (double *)(v538), svreinterpret_f64_f32(v306)); svst1_f64(pred_full, (double *)(v601), svreinterpret_f64_f32(v313)); svst1_f64(pred_full, (double *)(v529), svreinterpret_f64_f32(v305)); @@ -2482,218 +2324,125 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu13(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v541[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v51; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v42), "w"(v50)); - svfloat32_t v68; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v68) : "w"(v59), "w"(v67)); - svfloat32_t v85; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v76), "w"(v84)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v93), "w"(v101)); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v110), "w"(v118)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v25), "w"(v33)); - svfloat32_t v121; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v42), "w"(v50)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v59), "w"(v67)); - svfloat32_t v123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v76), "w"(v84)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v93), "w"(v101)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v110), "w"(v118)); - svfloat32_t v126; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v51), "w"(v102)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v34), "w"(v68)); - svfloat32_t v131; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v121), "w"(v124)); - svfloat32_t v133; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v120), "w"(v122)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v51), "w"(v119)); - svfloat32_t v136; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v68), "w"(v85)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v34), "w"(v85)); - svfloat32_t v138; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v102), "w"(v119)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v121), "w"(v125)); - svfloat32_t v144; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v120), "w"(v122)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v121), "w"(v124)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v120), "w"(v123)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v124), "w"(v125)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v122), "w"(v123)); - svfloat32_t v127; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v126), "w"(v119)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v128), "w"(v85)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v131), "w"(v125)); - svfloat32_t v134; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v133), "w"(v123)); - svfloat32_t 
v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v135), "w"(v136)); - svfloat32_t v140; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v137), "w"(v138)); - svfloat32_t v141; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v135), "w"(v136)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v137), "w"(v138)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v143), "w"(v144)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v145), "w"(v146)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v147), "w"(v148)); - svfloat32_t zero237; - asm volatile("mov %0.s, #0" : "=w"(zero237)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v51 = svadd_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v85 = svadd_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v119 = svadd_f32_x(svptrue_b32(), v110, v118); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v42, v50); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v76, v84); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v110, v118); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v51, v102); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v34, v68); + svfloat32_t v131 = svadd_f32_x(svptrue_b32(), v121, v124); + svfloat32_t v133 = svadd_f32_x(svptrue_b32(), v120, v122); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v51, v119); + svfloat32_t v136 = svsub_f32_x(svptrue_b32(), v68, v85); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v34, v85); + svfloat32_t v138 = svsub_f32_x(svptrue_b32(), v102, v119); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v121, v125); + svfloat32_t v144 = svsub_f32_x(svptrue_b32(), v120, v122); + svfloat32_t 
v145 = svsub_f32_x(svptrue_b32(), v121, v124); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v120, v123); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v124, v125); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v122, v123); + svfloat32_t v127 = svadd_f32_x(svptrue_b32(), v126, v119); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v128, v85); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v131, v125); + svfloat32_t v134 = svsub_f32_x(svptrue_b32(), v133, v123); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v135, v136); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v137, v138); + svfloat32_t v141 = svadd_f32_x(svptrue_b32(), v135, v136); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v137, v138); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v143, v144); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v145, v146); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v147, v148); + svfloat32_t zero237 = svdup_n_f32(0); svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v566, v143, 90); - svfloat32_t zero244; - asm volatile("mov %0.s, #0" : "=w"(zero244)); + svfloat32_t zero244 = svdup_n_f32(0); svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v567, v144, 90); - svfloat32_t zero258; - asm volatile("mov %0.s, #0" : "=w"(zero258)); + svfloat32_t zero258 = svdup_n_f32(0); svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v569, v145, 90); - svfloat32_t zero265; - asm volatile("mov %0.s, #0" : "=w"(zero265)); + svfloat32_t zero265 = svdup_n_f32(0); svfloat32_t v265 = svcmla_f32_x(pred_full, zero265, v570, v146, 90); - svfloat32_t zero279; - asm volatile("mov %0.s, #0" : "=w"(zero279)); + svfloat32_t zero279 = svdup_n_f32(0); svfloat32_t v279 = svcmla_f32_x(pred_full, zero279, v572, v147, 90); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v127), "w"(v129)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v129), "w"(v127)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v132), "w"(v134)); - 
svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v139), "w"(v140)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v141), "w"(v142)); - svfloat32_t zero186; - asm volatile("mov %0.s, #0" : "=w"(zero186)); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v127, v129); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v129, v127); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v132, v134); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v139, v140); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v141, v142); + svfloat32_t zero186 = svdup_n_f32(0); svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v557, v132, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); + svfloat32_t zero193 = svdup_n_f32(0); svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v558, v134, 90); - svfloat32_t v205; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v139), "w"(v560)); - svfloat32_t zero251; - asm volatile("mov %0.s, #0" : "=w"(zero251)); + svfloat32_t v205 = svmul_f32_x(svptrue_b32(), v139, v560); + svfloat32_t zero251 = svdup_n_f32(0); svfloat32_t v251 = svcmla_f32_x(pred_full, zero251, v568, v162, 90); - svfloat32_t zero272; - asm volatile("mov %0.s, #0" : "=w"(zero272)); + svfloat32_t zero272 = svdup_n_f32(0); svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v571, v163, 90); - svfloat32_t zero293; - asm volatile("mov %0.s, #0" : "=w"(zero293)); + svfloat32_t zero293 = svdup_n_f32(0); svfloat32_t v293 = svcmla_f32_x(pred_full, zero293, v574, v164, 90); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v156), "w"(v130)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v130); + svfloat32_t zero200 = svdup_n_f32(0); svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v559, v159, 90); - svfloat32_t v215; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v160), "w"(v562)); + svfloat32_t v215 = svmul_f32_x(svptrue_b32(), v160, 
v562); svfloat32_t v295 = svmla_f32_x(pred_full, v205, v140, v561); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v237), "w"(v251)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v244), "w"(v251)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v258), "w"(v272)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v265), "w"(v272)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v279), "w"(v293)); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v237, v251); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v244, v251); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v258, v272); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v265, v272); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v279, v293); svfloat32_t v312 = svcmla_f32_x(pred_full, v293, v573, v148, 90); svfloat32_t v294 = svmls_f32_x(pred_full, v157, v130, v555); svfloat32_t v296 = svmls_f32_x(pred_full, v295, v158, v556); svfloat32_t v297 = svmla_f32_x(pred_full, v215, v140, v561); svfloat32_t v299 = svnmls_f32_x(pred_full, v205, v160, v562); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v186), "w"(v200)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v193), "w"(v200)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v307), "w"(v311)); - svfloat32_t v327; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v309), "w"(v311)); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v308), "w"(v312)); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v186, v200); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v193, v200); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v307, v311); + svfloat32_t v327 = svadd_f32_x(svptrue_b32(), v309, v311); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v308, v312); svst1_f64(pred_full, (double *)(v582), svreinterpret_f64_f32(v157)); svfloat32_t v298 = svmla_f32_x(pred_full, v297, v158, v556); 
svfloat32_t v300 = svmls_f32_x(pred_full, v299, v158, v556); svfloat32_t v301 = svmla_f32_x(pred_full, v294, v141, v563); svfloat32_t v303 = svmls_f32_x(pred_full, v294, v142, v564); svfloat32_t v305 = svmls_f32_x(pred_full, v294, v141, v563); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v314), "w"(v307)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v312), "w"(v313)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v325), "w"(v314)); - svfloat32_t v328; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v327), "w"(v314)); - svfloat32_t v330; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v329), "w"(v313)); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v313), "w"(v308)); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v314, v307); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v312, v313); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v325, v314); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v327, v314); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v329, v313); + svfloat32_t v331 = svadd_f32_x(svptrue_b32(), v313, v308); svfloat32_t v302 = svmla_f32_x(pred_full, v301, v142, v564); svfloat32_t v304 = svmls_f32_x(pred_full, v303, v161, v565); svfloat32_t v306 = svmla_f32_x(pred_full, v305, v161, v565); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v321), "w"(v309)); - svfloat32_t v324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v323), "w"(v310)); - svfloat32_t v332; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v331), "w"(v310)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v296), "w"(v302)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v298), "w"(v304)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v304), "w"(v298)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v300), "w"(v306)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) 
: "w"(v302), "w"(v296)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v306), "w"(v300)); - svfloat32_t v333; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v315), "w"(v322)); - svfloat32_t v334; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v316), "w"(v324)); - svfloat32_t v335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v317), "w"(v326)); - svfloat32_t v336; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v318), "w"(v328)); - svfloat32_t v337; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v319), "w"(v330)); - svfloat32_t v338; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v320), "w"(v332)); - svfloat32_t v339; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v320), "w"(v332)); - svfloat32_t v340; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v319), "w"(v330)); - svfloat32_t v341; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v318), "w"(v328)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v317), "w"(v326)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v316), "w"(v324)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v315), "w"(v322)); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v321, v309); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v323, v310); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v331, v310); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v296, v302); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v298, v304); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v304, v298); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v300, v306); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v302, v296); + svfloat32_t v320 = svsub_f32_x(svptrue_b32(), v306, v300); + svfloat32_t v333 = svsub_f32_x(svptrue_b32(), v315, v322); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v317, v326); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v318, v328); + svfloat32_t v337 = 
svadd_f32_x(svptrue_b32(), v319, v330); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v320, v332); + svfloat32_t v339 = svadd_f32_x(svptrue_b32(), v320, v332); + svfloat32_t v340 = svsub_f32_x(svptrue_b32(), v319, v330); + svfloat32_t v341 = svadd_f32_x(svptrue_b32(), v318, v328); + svfloat32_t v342 = svadd_f32_x(svptrue_b32(), v317, v326); + svfloat32_t v343 = svsub_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v344 = svadd_f32_x(svptrue_b32(), v315, v322); svst1_f64(pred_full, (double *)(v591), svreinterpret_f64_f32(v333)); svst1_f64(pred_full, (double *)(v600), svreinterpret_f64_f32(v334)); svst1_f64(pred_full, (double *)(v609), svreinterpret_f64_f32(v335)); @@ -3281,190 +3030,110 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu14(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v544[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - 
svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v52), "w"(v142)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v52), "w"(v142)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v106), "w"(v88)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v106), "w"(v88)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v70), "w"(v124)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v70), "w"(v124)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v53), "w"(v143)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v53), "w"(v143)); - svfloat32_t v235; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v107), "w"(v89)); - svfloat32_t v236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v107), "w"(v89)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v71), "w"(v125)); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v71), "w"(v125)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v144), "w"(v146)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v146), "w"(v148)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v148), "w"(v144)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v145), "w"(v147)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v145), "w"(v147)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v147), "w"(v149)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v149), "w"(v145)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v233), "w"(v235)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v233), 
"w"(v235)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v235), "w"(v237)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v237), "w"(v233)); - svfloat32_t v245; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v234), "w"(v236)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v234), "w"(v236)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v236), "w"(v238)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v238), "w"(v234)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v150), "w"(v148)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v156), "w"(v149)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v52, v142); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v52, v142); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v106, v88); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v106, v88); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v233 = 
svadd_f32_x(svptrue_b32(), v53, v143); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v53, v143); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v107, v89); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v107, v89); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v144, v146); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v146, v148); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v148, v144); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v145, v147); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v147, v149); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v149, v145); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v233, v235); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v237, v233); + svfloat32_t v245 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v236, v238); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v238, v234); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v150, v148); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v156, v149); + svfloat32_t zero199 = svdup_n_f32(0); svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v562, v158, 90); - svfloat32_t zero206; - asm volatile("mov %0.s, #0" : "=w"(zero206)); + svfloat32_t zero206 = svdup_n_f32(0); svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v563, v159, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); + svfloat32_t zero213 = svdup_n_f32(0); svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v564, v160, 90); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v239), "w"(v237)); - 
svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v245), "w"(v238)); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v239, v237); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v245, v238); + svfloat32_t zero288 = svdup_n_f32(0); svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v562, v247, 90); - svfloat32_t zero295; - asm volatile("mov %0.s, #0" : "=w"(zero295)); + svfloat32_t zero295 = svdup_n_f32(0); svfloat32_t v295 = svcmla_f32_x(pred_full, zero295, v563, v248, 90); - svfloat32_t zero302; - asm volatile("mov %0.s, #0" : "=w"(zero302)); + svfloat32_t zero302 = svdup_n_f32(0); svfloat32_t v302 = svcmla_f32_x(pred_full, zero302, v564, v249, 90); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v151), "w"(v34)); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v151, v34); + svfloat32_t zero192 = svdup_n_f32(0); svfloat32_t v192 = svcmla_f32_x(pred_full, zero192, v561, v157, 90); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v240), "w"(v35)); - svfloat32_t zero281; - asm volatile("mov %0.s, #0" : "=w"(zero281)); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v240, v35); + svfloat32_t zero281 = svdup_n_f32(0); svfloat32_t v281 = svcmla_f32_x(pred_full, zero281, v561, v246, 90); svfloat32_t v214 = svmla_f32_x(pred_full, v152, v151, v557); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v192), "w"(v199)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v192), "w"(v199)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v192), "w"(v206)); + svfloat32_t v221 = svadd_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v192, v199); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v192, v206); svfloat32_t v303 = svmla_f32_x(pred_full, v241, v240, v557); - svfloat32_t v310; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v281), "w"(v288)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v281), "w"(v288)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v281), "w"(v295)); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v281, v288); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v281, v288); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v281, v295); svst1_f64(pred_full, (double *)(v572), svreinterpret_f64_f32(v152)); svst1_f64(pred_full, (double *)(v581), svreinterpret_f64_f32(v241)); svfloat32_t v215 = svmla_f32_x(pred_full, v214, v153, v558); svfloat32_t v217 = svmls_f32_x(pred_full, v214, v153, v558); svfloat32_t v219 = svmls_f32_x(pred_full, v214, v154, v559); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v221), "w"(v206)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v223), "w"(v213)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v225), "w"(v213)); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v221, v206); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v223, v213); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v225, v213); svfloat32_t v304 = svmla_f32_x(pred_full, v303, v242, v558); svfloat32_t v306 = svmls_f32_x(pred_full, v303, v242, v558); svfloat32_t v308 = svmls_f32_x(pred_full, v303, v243, v559); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v310), "w"(v295)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v302)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v314), "w"(v302)); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v310, v295); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v312, v302); + svfloat32_t v315 = svadd_f32_x(svptrue_b32(), v314, v302); svfloat32_t v216 = svmla_f32_x(pred_full, v215, v154, v559); svfloat32_t v218 = svmls_f32_x(pred_full, v217, v155, v560); svfloat32_t v220 = svmla_f32_x(pred_full, v219, v155, 
v560); svfloat32_t v305 = svmla_f32_x(pred_full, v304, v243, v559); svfloat32_t v307 = svmls_f32_x(pred_full, v306, v244, v560); svfloat32_t v309 = svmla_f32_x(pred_full, v308, v244, v560); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v216), "w"(v222)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v216), "w"(v222)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v218), "w"(v224)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v218), "w"(v224)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v220), "w"(v226)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v220), "w"(v226)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v305), "w"(v311)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v305), "w"(v311)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v307), "w"(v313)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v307), "w"(v313)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v309), "w"(v315)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v309), "w"(v315)); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v218, v224); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v218, v224); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v220, v226); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v220, v226); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v305, v311); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v305, v311); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v307, v313); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v307, v313); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v309, v315); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v309, v315); svst1_f64(pred_full, 
(double *)(v590), svreinterpret_f64_f32(v228)); svst1_f64(pred_full, (double *)(v599), svreinterpret_f64_f32(v317)); svst1_f64(pred_full, (double *)(v608), svreinterpret_f64_f32(v230)); @@ -4151,116 +3820,66 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v565[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v60)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v52), "w"(v60)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v106), "w"(v114)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v114)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t v71; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v70)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v88), "w"(v97)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v124)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v142), "w"(v151)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v61), "w"(v142)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v61), "w"(v142)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v115), "w"(v88)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v209) : "w"(v115), "w"(v88)); - svfloat32_t v259; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v62), "w"(v143)); - svfloat32_t v260; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v62), "w"(v143)); - svfloat32_t v261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v116), "w"(v89)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v116), "w"(v89)); - svfloat32_t v153; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v71), "w"(v152)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v71), "w"(v152)); - svfloat32_t v155; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v125), "w"(v98)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v125), "w"(v98)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v206), "w"(v208)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v206), "w"(v208)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v207), "w"(v209)); - svfloat32_t zero235; - asm volatile("mov %0.s, #0" : "=w"(zero235)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v206 = 
svadd_f32_x(svptrue_b32(), v61, v142); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v61, v142); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v115, v88); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v115, v88); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v62, v143); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v62, v143); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v116, v89); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v116, v89); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v71, v152); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v71, v152); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v125, v98); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v125, v98); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v207, v209); + svfloat32_t zero235 = svdup_n_f32(0); svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v577, v207, 90); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v259), "w"(v261)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v259), "w"(v261)); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v260), "w"(v262)); - svfloat32_t v302; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v262), "w"(v585)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v153), "w"(v155)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v153), "w"(v155)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v259, v261); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v259, v261); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v260, v262); + svfloat32_t v302 = svmul_f32_x(svptrue_b32(), v262, v585); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v153, v155); + svfloat32_t 
v158 = svsub_f32_x(svptrue_b32(), v153, v155); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v154, v156); + svfloat32_t zero182 = svdup_n_f32(0); svfloat32_t v182 = svcmla_f32_x(pred_full, zero182, v571, v154, 90); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v210), "w"(v34)); - svfloat32_t v223; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v210), "w"(v575)); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v210, v34); + svfloat32_t v223 = svmul_f32_x(svptrue_b32(), v210, v575); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v578, v212, 90); - svfloat32_t v266; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v263), "w"(v35)); - svfloat32_t zero287; - asm volatile("mov %0.s, #0" : "=w"(zero287)); + svfloat32_t v266 = svadd_f32_x(svptrue_b32(), v263, v35); + svfloat32_t zero287 = svdup_n_f32(0); svfloat32_t v287 = svcmla_f32_x(pred_full, zero287, v582, v264, 90); - svfloat32_t v297; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v265), "w"(v584)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v157), "w"(v44)); - svfloat32_t zero189; - asm volatile("mov %0.s, #0" : "=w"(zero189)); + svfloat32_t v297 = svmul_f32_x(svptrue_b32(), v265, v584); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v157, v44); + svfloat32_t zero189 = svdup_n_f32(0); svfloat32_t v189 = svcmla_f32_x(pred_full, zero189, v572, v159, 90); - svfloat32_t v253; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v235), "w"(v242)); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v235, v242); svfloat32_t v254 = svcmla_f32_x(pred_full, v242, v579, v209, 90); - svfloat32_t zero273; - asm volatile("mov %0.s, #0" : "=w"(zero273)); + svfloat32_t zero273 = svdup_n_f32(0); svfloat32_t v273 = svcmla_f32_x(pred_full, zero273, v580, v266, 90); svfloat32_t v306 = svnmls_f32_x(pred_full, v297, v260, v583); svfloat32_t v307 = svmla_f32_x(pred_full, 
v302, v265, v584); svfloat32_t v197 = svmla_f32_x(pred_full, v160, v157, v569); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v182), "w"(v189)); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v182, v189); svfloat32_t v201 = svcmla_f32_x(pred_full, v189, v573, v156, 90); svfloat32_t v250 = svmla_f32_x(pred_full, v223, v213, v574); svfloat32_t v303 = svcmla_f32_x(pred_full, v273, v581, v263, 90); @@ -4270,68 +3889,40 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu15(const armral_cmplx_int16_t *restrict x, svfloat32_t v199 = svmls_f32_x(pred_full, v197, v158, v570); svfloat32_t v251 = svmla_f32_x(pred_full, v250, v211, v576); svfloat32_t v252 = svmls_f32_x(pred_full, v250, v211, v576); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v303), "w"(v287)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v303), "w"(v287)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v273)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v312), "w"(v273)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v198), "w"(v200)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v198), "w"(v200)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v199), "w"(v201)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v199), "w"(v201)); - svfloat32_t v255; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v251), "w"(v253)); - svfloat32_t v256; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v251), "w"(v253)); - svfloat32_t v257; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v252), "w"(v254)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v252), "w"(v254)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v304), "w"(v306)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v304), "w"(v306)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v310) : "w"(v305), "w"(v307)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v305), "w"(v307)); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v303, v287); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v303, v287); + svfloat32_t v313 = svadd_f32_x(svptrue_b32(), v312, v273); + svfloat32_t v314 = svsub_f32_x(svptrue_b32(), v312, v273); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v198, v200); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v199, v201); + svfloat32_t v255 = svadd_f32_x(svptrue_b32(), v251, v253); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v251, v253); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v252, v254); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v252, v254); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v304, v306); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v304, v306); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v305, v307); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v305, v307); svst1_f64(pred_full, (double *)(v602), svreinterpret_f64_f32(v314)); svst1_f64(pred_full, (double *)(v611), svreinterpret_f64_f32(v313)); - svfloat32_t v336; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v203), "w"(v256)); - svfloat32_t v360; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v360) : "w"(v205), "w"(v258)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v204), "w"(v257)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v202), "w"(v255)); + svfloat32_t v336 = svadd_f32_x(svptrue_b32(), v203, v256); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v205, v258); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v204, v257); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v202, v255); svst1_f64(pred_full, (double *)(v620), svreinterpret_f64_f32(v203)); svst1_f64(pred_full, (double *)(v647), svreinterpret_f64_f32(v205)); svst1_f64(pred_full, 
(double *)(v674), svreinterpret_f64_f32(v204)); svst1_f64(pred_full, (double *)(v701), svreinterpret_f64_f32(v202)); - svfloat32_t v337; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v336), "w"(v309)); - svfloat32_t v338; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v336), "w"(v309)); - svfloat32_t v361; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v360), "w"(v311)); - svfloat32_t v362; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v362) : "w"(v360), "w"(v311)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v384), "w"(v310)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v384), "w"(v310)); - svfloat32_t v409; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v408), "w"(v308)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v408), "w"(v308)); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v336, v309); + svfloat32_t v338 = svsub_f32_x(svptrue_b32(), v336, v309); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v360, v311); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), v360, v311); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v384, v310); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v384, v310); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v308); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v408, v308); svst1_f64(pred_full, (double *)(v629), svreinterpret_f64_f32(v338)); svst1_f64(pred_full, (double *)(v638), svreinterpret_f64_f32(v337)); svst1_f64(pred_full, (double *)(v656), svreinterpret_f64_f32(v362)); @@ -4941,192 +4532,109 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu16(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v590[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v34), "w"(v52)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v34), "w"(v52)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v70), "w"(v88)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v70), "w"(v88)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v106), "w"(v124)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v106), "w"(v124)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v142), "w"(v160)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v142), "w"(v160)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v71), "w"(v89)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v71), "w"(v89)); - svfloat32_t v180; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v107), "w"(v161)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v107), "w"(v161)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v125), "w"(v143)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v125), "w"(v143)); - svfloat32_t zero243; - asm volatile("mov %0.s, #0" : "=w"(zero243)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v70, v88); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v142, v160); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v142, v160); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v71, v89); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v71, v89); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), 
v107, v161); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v125, v143); + svfloat32_t zero243 = svdup_n_f32(0); svfloat32_t v243 = svcmla_f32_x(pred_full, zero243, v602, v53, 90); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v162), "w"(v164)); - svfloat32_t v171; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v162), "w"(v164)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v166), "w"(v168)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v166), "w"(v168)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v167), "w"(v169)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v167), "w"(v169)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v180), "w"(v182)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v181), "w"(v183)); - svfloat32_t zero219; - asm volatile("mov %0.s, #0" : "=w"(zero219)); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v166, v168); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v166, v168); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v167, v169); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v167, v169); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v181, v183); + svfloat32_t zero219 = svdup_n_f32(0); svfloat32_t v219 = svcmla_f32_x(pred_full, zero219, v602, v165, 90); - svfloat32_t zero250; - asm volatile("mov %0.s, #0" : "=w"(zero250)); + svfloat32_t zero250 = svdup_n_f32(0); svfloat32_t v250 = svcmla_f32_x(pred_full, zero250, v603, v178, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); + svfloat32_t zero276 = svdup_n_f32(0); svfloat32_t v276 = 
svcmla_f32_x(pred_full, zero276, v607, v182, 90); - svfloat32_t v286; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v181), "w"(v609)); - svfloat32_t v291; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v183), "w"(v610)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v170), "w"(v172)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v170), "w"(v172)); - svfloat32_t zero207; - asm volatile("mov %0.s, #0" : "=w"(zero207)); + svfloat32_t v286 = svmul_f32_x(svptrue_b32(), v181, v609); + svfloat32_t v291 = svmul_f32_x(svptrue_b32(), v183, v610); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v170, v172); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v170, v172); + svfloat32_t zero207 = svdup_n_f32(0); svfloat32_t v207 = svcmla_f32_x(pred_full, zero207, v602, v173, 90); - svfloat32_t zero226; - asm volatile("mov %0.s, #0" : "=w"(zero226)); + svfloat32_t zero226 = svdup_n_f32(0); svfloat32_t v226 = svcmla_f32_x(pred_full, zero226, v603, v176, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v605, v184, 90); - svfloat32_t v281; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v185), "w"(v608)); + svfloat32_t v281 = svmul_f32_x(svptrue_b32(), v185, v608); svfloat32_t v302 = svmla_f32_x(pred_full, v35, v179, v604); svfloat32_t v303 = svmls_f32_x(pred_full, v35, v179, v604); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v243), "w"(v250)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v243), "w"(v250)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v171), "w"(v207)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v171), "w"(v207)); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v243, v250); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v243, v250); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v171, v207); + 
svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v171, v207); svfloat32_t v294 = svmla_f32_x(pred_full, v163, v177, v604); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v219), "w"(v226)); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v219, v226); svfloat32_t v296 = svmls_f32_x(pred_full, v163, v177, v604); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v226), "w"(v219)); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v226, v219); svfloat32_t v306 = svcmla_f32_x(pred_full, v262, v606, v180, 90); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v262), "w"(v276)); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v262, v276); svfloat32_t v308 = svnmls_f32_x(pred_full, v281, v181, v609); svfloat32_t v309 = svnmls_f32_x(pred_full, v281, v183, v610); svfloat32_t v310 = svnmls_f32_x(pred_full, v286, v185, v608); svfloat32_t v311 = svnmls_f32_x(pred_full, v291, v185, v608); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v303), "w"(v305)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v303), "w"(v305)); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v303, v305); svst1_f64(pred_full, (double *)(v618), svreinterpret_f64_f32(v174)); svst1_f64(pred_full, (double *)(v690), svreinterpret_f64_f32(v175)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v294), "w"(v295)); - svfloat32_t v299; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v296), "w"(v297)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v296), "w"(v297)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v294), "w"(v295)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v302), "w"(v308)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v302), "w"(v308)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v302), "w"(v310)); - 
svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v302), "w"(v310)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v303), "w"(v311)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v311)); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v306), "w"(v304)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v306), "w"(v304)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v307), "w"(v309)); - svfloat32_t v325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v307), "w"(v309)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v307), "w"(v305)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v307), "w"(v305)); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v296, v297); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v294, v295); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v302, v308); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v302, v310); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v302, v310); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v303, v311); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v306, v304); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v326 = svadd_f32_x(svptrue_b32(), v307, v305); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v307, v305); svst1_f64(pred_full, (double *)(v654), svreinterpret_f64_f32(v293)); svst1_f64(pred_full, (double *)(v726), svreinterpret_f64_f32(v292)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v312), "w"(v322)); - 
svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v313), "w"(v323)); - svfloat32_t v330; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v314), "w"(v323)); - svfloat32_t v331; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v315), "w"(v322)); - svfloat32_t v332; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v316), "w"(v324)); - svfloat32_t v333; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v317), "w"(v325)); - svfloat32_t v334; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v318), "w"(v327)); - svfloat32_t v335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v319), "w"(v326)); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v312, v322); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v313, v323); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v314, v323); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v315, v322); + svfloat32_t v332 = svadd_f32_x(svptrue_b32(), v316, v324); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v317, v325); + svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v318, v327); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v319, v326); svst1_f64(pred_full, (double *)(v636), svreinterpret_f64_f32(v301)); svst1_f64(pred_full, (double *)(v672), svreinterpret_f64_f32(v300)); svst1_f64(pred_full, (double *)(v708), svreinterpret_f64_f32(v299)); @@ -6245,167 +5753,91 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v793[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) 
: "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v34), "w"(v106)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v52), "w"(v124)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v70), "w"(v142)); - svfloat32_t v165; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v88), "w"(v160)); - svfloat32_t v168; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v34), "w"(v106)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v52), "w"(v124)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v70), "w"(v142)); - svfloat32_t v171; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v88), "w"(v160)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v35), "w"(v71)); - svfloat32_t v183; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v53), "w"(v89)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v35), "w"(v71)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v161), "w"(v125)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : 
"w"(v107), "w"(v143)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v125), "w"(v161)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v107), "w"(v143)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v53), "w"(v89)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v35), "w"(v107)); - svfloat32_t v203; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v89), "w"(v161)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v162), "w"(v164)); - svfloat32_t v167; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v163), "w"(v165)); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v162), "w"(v164)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v163), "w"(v165)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v169), "w"(v171)); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v168), "w"(v170)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v170), "w"(v171)); - svfloat32_t v180; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v168), "w"(v169)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v182), "w"(v183)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v186), "w"(v187)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v182), "w"(v183)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v186), "w"(v187)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v184), "w"(v185)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v188), "w"(v189)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v184), "w"(v185)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v188), "w"(v189)); - svfloat32_t v240; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v170), "w"(v809)); - svfloat32_t zero407; - asm volatile("mov 
%0.s, #0" : "=w"(zero407)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v34, v106); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v70, v142); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v88, v160); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v34, v106); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v52, v124); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v70, v142); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v88, v160); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v35, v71); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v35, v71); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v161, v125); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v107, v143); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v125, v161); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v107, v143); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v53, v89); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v35, v107); + 
svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v89, v161); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v167 = svadd_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v169, v171); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v168, v170); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v170, v171); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v186, v187); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v186, v187); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v188, v189); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v188, v189); + svfloat32_t v240 = svmul_f32_x(svptrue_b32(), v170, v809); + svfloat32_t zero407 = svdup_n_f32(0); svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v836, v203, 90); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v166), "w"(v167)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v166), "w"(v167)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v177), "w"(v176)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v172), "w"(v173)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v190), "w"(v191)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v193), "w"(v194)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v196), "w"(v197)); - svfloat32_t v201; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v199), "w"(v200)); - svfloat32_t v204; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v197), "w"(v191)); - 
svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v190), "w"(v196)); - svfloat32_t v250; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v172), "w"(v811)); - svfloat32_t v255; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v173), "w"(v812)); - svfloat32_t v285; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v179), "w"(v818)); - svfloat32_t v290; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v180), "w"(v819)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v204), "w"(v35)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v207), "w"(v89)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v219), "w"(v174)); - svfloat32_t v280; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v178), "w"(v817)); - svfloat32_t zero316; - asm volatile("mov %0.s, #0" : "=w"(zero316)); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v177, v176); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v172, v173); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v190, v191); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v193, v194); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v196, v197); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v199, v200); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v197, v191); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v190, v196); + svfloat32_t v250 = svmul_f32_x(svptrue_b32(), v172, v811); + svfloat32_t v255 = svmul_f32_x(svptrue_b32(), v173, v812); + svfloat32_t v285 = svmul_f32_x(svptrue_b32(), v179, v818); + svfloat32_t v290 = svmul_f32_x(svptrue_b32(), v180, v819); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v204, v35); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v207, v89); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v219, v174); + svfloat32_t v280 = svmul_f32_x(svptrue_b32(), v178, v817); + svfloat32_t zero316 = svdup_n_f32(0); 
svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v823, v192, 90); - svfloat32_t zero337; - asm volatile("mov %0.s, #0" : "=w"(zero337)); + svfloat32_t zero337 = svdup_n_f32(0); svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v826, v195, 90); - svfloat32_t zero358; - asm volatile("mov %0.s, #0" : "=w"(zero358)); + svfloat32_t zero358 = svdup_n_f32(0); svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v829, v198, 90); - svfloat32_t zero379; - asm volatile("mov %0.s, #0" : "=w"(zero379)); + svfloat32_t zero379 = svdup_n_f32(0); svfloat32_t v379 = svcmla_f32_x(pred_full, zero379, v832, v201, 90); svfloat32_t v445 = svmla_f32_x(pred_full, v285, v171, v810); svfloat32_t v446 = svnmls_f32_x(pred_full, v240, v179, v818); svfloat32_t v447 = svmla_f32_x(pred_full, v290, v169, v808); svfloat32_t v448 = svnmls_f32_x(pred_full, v290, v168, v807); - svfloat32_t v206; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v205), "w"(v203)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v107)); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v205, v203); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v208, v107); svfloat32_t v443 = svmla_f32_x(pred_full, v280, v176, v815); svfloat32_t v444 = svnmls_f32_x(pred_full, v280, v177, v816); svfloat32_t v449 = svnmls_f32_x(pred_full, v255, v181, v820); @@ -6420,160 +5852,91 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu17(const armral_cmplx_int16_t *restrict x, svfloat32_t v476 = svcmla_f32_x(pred_full, v379, v830, v199, 90); svfloat32_t v477 = svcmla_f32_x(pred_full, v379, v831, v200, 90); svst1_f64(pred_full, (double *)(v849), svreinterpret_f64_f32(v220)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v209), "w"(v161)); - svfloat32_t zero428; - asm volatile("mov %0.s, #0" : "=w"(zero428)); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v161); + svfloat32_t zero428 = svdup_n_f32(0); svfloat32_t v428 = svcmla_f32_x(pred_full, zero428, v839, v206, 90); svfloat32_t v452 = 
svmla_f32_x(pred_full, v451, v175, v814); svfloat32_t v453 = svmls_f32_x(pred_full, v451, v175, v814); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v443), "w"(v445)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v444), "w"(v446)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v443), "w"(v447)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v444), "w"(v448)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v470), "w"(v472)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v470), "w"(v472)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v471), "w"(v473)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v471), "w"(v473)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v474), "w"(v476)); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v476), "w"(v474)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v475), "w"(v477)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v477), "w"(v475)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v206), "w"(v210)); - svfloat32_t zero435; - asm volatile("mov %0.s, #0" : "=w"(zero435)); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v444, v446); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v443, v447); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v444, v448); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v474, v476); + svfloat32_t v486 = svsub_f32_x(svptrue_b32(), v476, v474); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v475, v477); + 
svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v477, v475); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v206, v210); + svfloat32_t zero435 = svdup_n_f32(0); svfloat32_t v435 = svcmla_f32_x(pred_full, zero435, v840, v210, 90); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v449), "w"(v452)); - svfloat32_t v457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v450), "w"(v453)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v452), "w"(v449)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v453), "w"(v450)); - svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v483), "w"(v487)); - svfloat32_t v500; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v482), "w"(v488)); - svfloat32_t v502; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v481), "w"(v485)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v488), "w"(v482)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v481), "w"(v485)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v486), "w"(v484)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v487), "w"(v483)); - svfloat32_t v515; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v484), "w"(v486)); - svfloat32_t v462; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v454), "w"(v455)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v456), "w"(v457)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v458), "w"(v459)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v460), "w"(v461)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v455), "w"(v454)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v457), "w"(v456)); - svfloat32_t v468; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v459), "w"(v458)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v461), "w"(v460)); - 
svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v428), "w"(v435)); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v449, v452); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v450, v453); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v452, v449); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v453, v450); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v483, v487); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v482, v488); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v481, v485); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v488, v482); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v481, v485); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v486, v484); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v487, v483); + svfloat32_t v515 = svadd_f32_x(svptrue_b32(), v484, v486); + svfloat32_t v462 = svadd_f32_x(svptrue_b32(), v454, v455); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v456, v457); + svfloat32_t v464 = svadd_f32_x(svptrue_b32(), v458, v459); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v460, v461); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v455, v454); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v457, v456); + svfloat32_t v468 = svsub_f32_x(svptrue_b32(), v459, v458); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v461, v460); + svfloat32_t v489 = svsub_f32_x(svptrue_b32(), v428, v435); svfloat32_t v478 = svcmla_f32_x(pred_full, v435, v841, v211, 90); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v489), "w"(v489)); - svfloat32_t v516; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v515), "w"(v489)); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v489, v489); + svfloat32_t v516 = svsub_f32_x(svptrue_b32(), v515, v489); svfloat32_t v479 = svcmla_f32_x(pred_full, v478, v833, v202, 90); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v407), "w"(v491)); - svfloat32_t v495; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v478), "w"(v478)); - svfloat32_t v513; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v491)); - svfloat32_t v556; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v469), "w"(v516)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v469), "w"(v516)); + svfloat32_t v492 = svsub_f32_x(svptrue_b32(), v407, v491); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v478, v478); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v491); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v469, v516); + svfloat32_t v564 = svsub_f32_x(svptrue_b32(), v469, v516); svfloat32_t v480 = svcmla_f32_x(pred_full, v479, v834, v35, 90); svfloat32_t v490 = svcmla_f32_x(pred_full, v479, v835, v107, 90); svfloat32_t v493 = svcmla_f32_x(pred_full, v492, v837, v89, 90); svfloat32_t v494 = svcmla_f32_x(pred_full, v492, v838, v161, 90); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v495)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v489), "w"(v495)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v495)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v513), "w"(v495)); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v495, v495); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v489, v495); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v495); + svfloat32_t v514 = svadd_f32_x(svptrue_b32(), v513, v495); svst1_f64(pred_full, (double *)(v894), svreinterpret_f64_f32(v556)); svst1_f64(pred_full, (double *)(v903), svreinterpret_f64_f32(v564)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v490)); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v493)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v504), "w"(v497)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v506), "w"(v480)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v509), "w"(v494)); - svfloat32_t 
v540; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v464), "w"(v503)); - svfloat32_t v548; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v464), "w"(v503)); - svfloat32_t v636; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v636) : "w"(v468), "w"(v514)); - svfloat32_t v644; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v644) : "w"(v468), "w"(v514)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v507), "w"(v489)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v510), "w"(v496)); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v462), "w"(v499)); - svfloat32_t v532; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v532) : "w"(v462), "w"(v499)); - svfloat32_t v588; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v465), "w"(v505)); - svfloat32_t v596; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v465), "w"(v505)); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v463), "w"(v501)); - svfloat32_t v612; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v463), "w"(v501)); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v490); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v493); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v504, v497); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v480); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v509, v494); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v464, v503); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v464, v503); + svfloat32_t v636 = svadd_f32_x(svptrue_b32(), v468, v514); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v468, v514); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v507, v489); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v510, v496); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v462, v499); + svfloat32_t v532 = svsub_f32_x(svptrue_b32(), v462, v499); + svfloat32_t v588 = svadd_f32_x(svptrue_b32(), v465, v505); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v465, v505); + svfloat32_t v604 = 
svadd_f32_x(svptrue_b32(), v463, v501); + svfloat32_t v612 = svsub_f32_x(svptrue_b32(), v463, v501); svst1_f64(pred_full, (double *)(v876), svreinterpret_f64_f32(v540)); svst1_f64(pred_full, (double *)(v885), svreinterpret_f64_f32(v548)); svst1_f64(pred_full, (double *)(v984), svreinterpret_f64_f32(v636)); svst1_f64(pred_full, (double *)(v993), svreinterpret_f64_f32(v644)); - svfloat32_t v572; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v572) : "w"(v466), "w"(v508)); - svfloat32_t v580; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v580) : "w"(v466), "w"(v508)); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v467), "w"(v511)); - svfloat32_t v628; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v628) : "w"(v467), "w"(v511)); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v466, v508); + svfloat32_t v580 = svsub_f32_x(svptrue_b32(), v466, v508); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v467, v511); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v467, v511); svst1_f64(pred_full, (double *)(v858), svreinterpret_f64_f32(v524)); svst1_f64(pred_full, (double *)(v867), svreinterpret_f64_f32(v532)); svst1_f64(pred_full, (double *)(v930), svreinterpret_f64_f32(v588)); @@ -7290,209 +6653,117 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu18(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v686[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v169), "w"(v177)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v177)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v52), "w"(v178)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v52), "w"(v178)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v160), "w"(v70)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v160), "w"(v70)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v88), "w"(v142)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v88), "w"(v142)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v106), "w"(v124)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v106), "w"(v124)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v53), "w"(v179)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v53), "w"(v179)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v161), "w"(v71)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v161), "w"(v71)); - 
svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v89), "w"(v143)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v89), "w"(v143)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v107), "w"(v125)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v107), "w"(v125)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v180), "w"(v182)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v181), "w"(v183)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v180), "w"(v182)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v182), "w"(v186)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v186), "w"(v180)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v181), "w"(v183)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v183), "w"(v187)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v187), "w"(v181)); - svfloat32_t zero228; - asm volatile("mov %0.s, #0" : "=w"(zero228)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), 
v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v52, v178); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v52, v178); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v160, v70); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v160, v70); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v53, v179); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v53, v179); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v161, v71); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v161, v71); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v107, v125); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v107, v125); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v181, v183); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v182, v186); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v186, v180); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v181, v183); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v183, v187); + svfloat32_t v199 = svsub_f32_x(svptrue_b32(), v187, v181); + svfloat32_t zero228 = svdup_n_f32(0); svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v704, v185, 90); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v290), "w"(v292)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v291), "w"(v293)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : 
"w"(v290), "w"(v292)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v292), "w"(v296)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v296), "w"(v290)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v291), "w"(v293)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v293), "w"(v297)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v297), "w"(v291)); - svfloat32_t zero338; - asm volatile("mov %0.s, #0" : "=w"(zero338)); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v290, v292); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v292, v296); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v296, v290); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v291, v293); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v293, v297); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v297, v291); + svfloat32_t zero338 = svdup_n_f32(0); svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v704, v295, 90); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v186)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v187)); - svfloat32_t zero250; - asm volatile("mov %0.s, #0" : "=w"(zero250)); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v186); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v192, v187); + svfloat32_t zero250 = svdup_n_f32(0); svfloat32_t v250 = svcmla_f32_x(pred_full, zero250, v708, v197, 90); - svfloat32_t zero257; - asm volatile("mov %0.s, #0" : "=w"(zero257)); + svfloat32_t zero257 = svdup_n_f32(0); svfloat32_t v257 = svcmla_f32_x(pred_full, zero257, v709, v198, 90); - svfloat32_t zero264; - asm volatile("mov %0.s, #0" : "=w"(zero264)); + svfloat32_t zero264 = svdup_n_f32(0); svfloat32_t v264 = svcmla_f32_x(pred_full, zero264, v710, v199, 90); - svfloat32_t v299; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v298), "w"(v296)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v302), "w"(v297)); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v298, v296); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v297); + svfloat32_t zero360 = svdup_n_f32(0); svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v708, v307, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); + svfloat32_t zero367 = svdup_n_f32(0); svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v709, v308, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); + svfloat32_t zero374 = svdup_n_f32(0); svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v710, v309, 90); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v189), "w"(v184)); - svfloat32_t v209; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v189), "w"(v701)); - svfloat32_t zero216; - asm volatile("mov %0.s, #0" : "=w"(zero216)); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v189, v184); + svfloat32_t v209 = svmul_f32_x(svptrue_b32(), v189, v701); + svfloat32_t zero216 = svdup_n_f32(0); svfloat32_t v216 = svcmla_f32_x(pred_full, zero216, v704, v193, 90); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v228), "w"(v250)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v228), "w"(v257)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v228), "w"(v250)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v299), "w"(v294)); - svfloat32_t v319; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v299), "w"(v701)); - svfloat32_t zero326; - asm volatile("mov %0.s, #0" : "=w"(zero326)); + svfloat32_t v278 = svadd_f32_x(svptrue_b32(), v228, v250); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v228, v257); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v228, v250); + 
svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v299, v294); + svfloat32_t v319 = svmul_f32_x(svptrue_b32(), v299, v701); + svfloat32_t zero326 = svdup_n_f32(0); svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v704, v303, 90); - svfloat32_t v388; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v338), "w"(v360)); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v338), "w"(v367)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v338), "w"(v360)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v34)); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v209), "w"(v209)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v278), "w"(v257)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v264)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v264)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v300), "w"(v35)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v319), "w"(v319)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v388), "w"(v367)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v390), "w"(v374)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v392), "w"(v374)); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v338, v360); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v338, v367); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v338, v360); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v34); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v209, v209); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v278, v257); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v264); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v282, v264); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v300, v35); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v319, v319); + svfloat32_t v389 = 
svadd_f32_x(svptrue_b32(), v388, v367); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v390, v374); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v392, v374); svfloat32_t v266 = svmla_f32_x(pred_full, v265, v189, v701); svfloat32_t v270 = svmla_f32_x(pred_full, v191, v184, v703); svfloat32_t v376 = svmla_f32_x(pred_full, v375, v299, v701); svfloat32_t v380 = svmla_f32_x(pred_full, v301, v294, v703); svst1_f64(pred_full, (double *)(v718), svreinterpret_f64_f32(v191)); svst1_f64(pred_full, (double *)(v727), svreinterpret_f64_f32(v301)); - svfloat32_t v267; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v191), "w"(v266)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v271) : "w"(v270), "w"(v265)); - svfloat32_t v377; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v301), "w"(v376)); - svfloat32_t v381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v380), "w"(v375)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v267), "w"(v216)); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v267), "w"(v216)); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v191, v266); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v270, v265); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v301, v376); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v380, v375); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v267, v216); + svfloat32_t v269 = svsub_f32_x(svptrue_b32(), v267, v216); svfloat32_t v272 = svmla_f32_x(pred_full, v271, v194, v705); svfloat32_t v274 = svmls_f32_x(pred_full, v271, v195, v706); svfloat32_t v276 = svmls_f32_x(pred_full, v271, v194, v705); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v377), "w"(v326)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v377), "w"(v326)); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v377, v326); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v377, v326); svfloat32_t v382 = svmla_f32_x(pred_full, v381, v304, v705); svfloat32_t v384 = 
svmls_f32_x(pred_full, v381, v305, v706); svfloat32_t v386 = svmls_f32_x(pred_full, v381, v304, v705); @@ -7506,30 +6777,18 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu18(const armral_cmplx_int16_t *restrict x, svst1_f64(pred_full, (double *)(v781), svreinterpret_f64_f32(v379)); svst1_f64(pred_full, (double *)(v826), svreinterpret_f64_f32(v268)); svst1_f64(pred_full, (double *)(v835), svreinterpret_f64_f32(v378)); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v273), "w"(v279)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v273), "w"(v279)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v275), "w"(v281)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v275), "w"(v281)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v277), "w"(v283)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v277), "w"(v283)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v383), "w"(v389)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v383), "w"(v389)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v385), "w"(v391)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v385), "w"(v391)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v387), "w"(v393)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v387), "w"(v393)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v396 = 
svadd_f32_x(svptrue_b32(), v385, v391); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v385, v391); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v387, v393); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v387, v393); svst1_f64(pred_full, (double *)(v736), svreinterpret_f64_f32(v285)); svst1_f64(pred_full, (double *)(v745), svreinterpret_f64_f32(v395)); svst1_f64(pred_full, (double *)(v754), svreinterpret_f64_f32(v286)); @@ -8749,403 +8008,227 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu19(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v881[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v51), "w"(v43)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v87), "w"(v79)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v123), "w"(v115)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : 
"w"(v159), "w"(v151)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v169), "w"(v177)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v177)); - svfloat32_t v180; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v34), "w"(v142)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v52), "w"(v160)); - svfloat32_t v182; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v70), "w"(v178)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v88), "w"(v142)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v106), "w"(v160)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v124), "w"(v178)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v34), "w"(v88)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v52), "w"(v106)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v70), "w"(v124)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v35), "w"(v143)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v53), "w"(v161)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v71), "w"(v179)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v89), "w"(v143)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v107), "w"(v161)); - svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v125), "w"(v179)); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v35), "w"(v89)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v53), "w"(v107)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v71), "w"(v125)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v186), "w"(v142)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v160)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v191) : "w"(v190), "w"(v178)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v180), "w"(v182)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v183), "w"(v185)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v180), "w"(v183)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v182), "w"(v185)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v227), "w"(v143)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v229), "w"(v161)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v231), "w"(v179)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v221), "w"(v223)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v224), "w"(v226)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v221), "w"(v224)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v223), "w"(v226)); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v51, v43); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v87, v79); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v123, v115); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = 
svsub_f32_x(svptrue_b32(), v159, v151); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v52, v160); + svfloat32_t v182 = svsub_f32_x(svptrue_b32(), v70, v178); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v88, v142); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v124, v178); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v34, v88); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v52, v106); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v70, v124); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v53, v161); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v89, v143); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v125, v179); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v35, v89); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v53, v107); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v71, v125); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v186, v142); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v160); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v178); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v180, v182); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v180, v183); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v182, v185); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v227, v143); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v229, v161); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v231, v179); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v243 = 
svsub_f32_x(svptrue_b32(), v221, v224); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v223, v226); + svfloat32_t zero408 = svdup_n_f32(0); svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v921, v224, 90); - svfloat32_t zero429; - asm volatile("mov %0.s, #0" : "=w"(zero429)); + svfloat32_t zero429 = svdup_n_f32(0); svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v924, v226, 90); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v187), "w"(v189)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v193), "w"(v184)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v192), "w"(v181)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v193), "w"(v184)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v192), "w"(v181)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v180), "w"(v212)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v211), "w"(v185)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v187), "w"(v191)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v189), "w"(v191)); - svfloat32_t v235; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v228), "w"(v230)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v234), "w"(v225)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v233), "w"(v222)); - svfloat32_t v240; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v234), "w"(v225)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v233), "w"(v222)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v221), "w"(v244)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v243), "w"(v226)); - svfloat32_t v250; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v228), "w"(v232)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v230), "w"(v232)); - svfloat32_t v195; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v191)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v206), "w"(v205)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v209), "w"(v208)); - svfloat32_t v214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v213), "w"(v184)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v215), "w"(v181)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v218), "w"(v219)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v235), "w"(v232)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v238), "w"(v237)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v241), "w"(v240)); - svfloat32_t v246; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v245), "w"(v225)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v247), "w"(v222)); - svfloat32_t v252; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v250), "w"(v251)); - svfloat32_t v272; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v206), "w"(v897)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v209), "w"(v900)); - svfloat32_t zero366; - asm volatile("mov %0.s, #0" : "=w"(zero366)); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v187, v189); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v193, v184); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v192, v181); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v193, v184); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v192, v181); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v180, v212); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v211, v185); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v187, v191); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v189, v191); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v228, v230); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v234, v225); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), 
v233, v222); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v234, v225); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v233, v222); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v221, v244); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v243, v226); + svfloat32_t v250 = svsub_f32_x(svptrue_b32(), v228, v232); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v194, v191); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v206, v205); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v208); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v213, v184); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v215, v181); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v218, v219); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v235, v232); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v238, v237); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v241, v240); + svfloat32_t v246 = svsub_f32_x(svptrue_b32(), v245, v225); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v247, v222); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v250, v251); + svfloat32_t v272 = svmul_f32_x(svptrue_b32(), v206, v897); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v209, v900); + svfloat32_t zero366 = svdup_n_f32(0); svfloat32_t v366 = svcmla_f32_x(pred_full, zero366, v915, v237, 90); - svfloat32_t zero387; - asm volatile("mov %0.s, #0" : "=w"(zero387)); + svfloat32_t zero387 = svdup_n_f32(0); svfloat32_t v387 = svcmla_f32_x(pred_full, zero387, v918, v240, 90); - svfloat32_t zero471; - asm volatile("mov %0.s, #0" : "=w"(zero471)); + svfloat32_t zero471 = svdup_n_f32(0); svfloat32_t v471 = svcmla_f32_x(pred_full, zero471, v930, v250, 90); - svfloat32_t zero478; - asm volatile("mov %0.s, #0" : "=w"(zero478)); + svfloat32_t zero478 = svdup_n_f32(0); svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v931, v251, 90); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v203), "w"(v195)); - svfloat32_t v217; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v214), "w"(v216)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v246), "w"(v248)); - svfloat32_t v277; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v207), "w"(v898)); - svfloat32_t v292; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v210), "w"(v901)); - svfloat32_t v352; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v220), "w"(v913)); - svfloat32_t zero359; - asm volatile("mov %0.s, #0" : "=w"(zero359)); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v203, v195); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v246, v248); + svfloat32_t v277 = svmul_f32_x(svptrue_b32(), v207, v898); + svfloat32_t v292 = svmul_f32_x(svptrue_b32(), v210, v901); + svfloat32_t v352 = svmul_f32_x(svptrue_b32(), v220, v913); + svfloat32_t zero359 = svdup_n_f32(0); svfloat32_t v359 = svcmla_f32_x(pred_full, zero359, v914, v236, 90); - svfloat32_t zero485; - asm volatile("mov %0.s, #0" : "=w"(zero485)); + svfloat32_t zero485 = svdup_n_f32(0); svfloat32_t v485 = svcmla_f32_x(pred_full, zero485, v932, v252, 90); svfloat32_t v486 = svmla_f32_x(pred_full, v272, v205, v896); svfloat32_t v487 = svmla_f32_x(pred_full, v287, v208, v899); svfloat32_t v517 = svcmla_f32_x(pred_full, v366, v916, v238, 90); svfloat32_t v518 = svcmla_f32_x(pred_full, v387, v919, v241, 90); - svfloat32_t v337; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v217), "w"(v910)); - svfloat32_t zero464; - asm volatile("mov %0.s, #0" : "=w"(zero464)); + svfloat32_t v337 = svmul_f32_x(svptrue_b32(), v217, v910); + svfloat32_t zero464 = svdup_n_f32(0); svfloat32_t v464 = svcmla_f32_x(pred_full, zero464, v929, v249, 90); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v486), "w"(v487)); + svfloat32_t v489 = svadd_f32_x(svptrue_b32(), v486, v487); svfloat32_t v490 = svmla_f32_x(pred_full, v277, v205, v896); svfloat32_t v491 = svmla_f32_x(pred_full, v292, v208, v899); - 
svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v486), "w"(v487)); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v486, v487); svfloat32_t v510 = svnmls_f32_x(pred_full, v352, v218, v911); svfloat32_t v511 = svnmls_f32_x(pred_full, v352, v219, v912); svfloat32_t v512 = svmla_f32_x(pred_full, v204, v195, v895); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v517), "w"(v518)); + svfloat32_t v520 = svadd_f32_x(svptrue_b32(), v517, v518); svfloat32_t v521 = svcmla_f32_x(pred_full, v366, v917, v239, 90); svfloat32_t v522 = svcmla_f32_x(pred_full, v387, v920, v242, 90); - svfloat32_t v539; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v517), "w"(v518)); - svfloat32_t v541; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v541) : "w"(v471), "w"(v485)); - svfloat32_t v542; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v542) : "w"(v478), "w"(v485)); + svfloat32_t v539 = svsub_f32_x(svptrue_b32(), v517, v518); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v471, v485); + svfloat32_t v542 = svsub_f32_x(svptrue_b32(), v478, v485); svst1_f64(pred_full, (double *)(v940), svreinterpret_f64_f32(v204)); svfloat32_t v488 = svmla_f32_x(pred_full, v337, v216, v909); svfloat32_t v492 = svmla_f32_x(pred_full, v337, v214, v908); svfloat32_t v493 = svnmls_f32_x(pred_full, v489, v183, v902); - svfloat32_t v494; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v494) : "w"(v490), "w"(v491)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v490), "w"(v491)); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v490, v491); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v490, v491); svfloat32_t v505 = svmla_f32_x(pred_full, v489, v182, v907); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v510)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v512), "w"(v510)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v512), "w"(v511)); + svfloat32_t v513 = svadd_f32_x(svptrue_b32(), v512, v510); + 
svfloat32_t v514 = svsub_f32_x(svptrue_b32(), v512, v510); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v512, v511); svfloat32_t v519 = svcmla_f32_x(pred_full, v464, v928, v248, 90); svfloat32_t v523 = svcmla_f32_x(pred_full, v464, v927, v246, 90); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v408), "w"(v520)); - svfloat32_t v525; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v521), "w"(v522)); - svfloat32_t v531; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v521), "w"(v522)); + svfloat32_t v524 = svsub_f32_x(svptrue_b32(), v408, v520); + svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v521, v522); + svfloat32_t v531 = svsub_f32_x(svptrue_b32(), v521, v522); svfloat32_t v536 = svcmla_f32_x(pred_full, v520, v926, v223, 90); - svfloat32_t v543; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v543) : "w"(v359), "w"(v541)); - svfloat32_t v544; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v544) : "w"(v359), "w"(v541)); - svfloat32_t v546; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v359), "w"(v542)); + svfloat32_t v543 = svadd_f32_x(svptrue_b32(), v359, v541); + svfloat32_t v544 = svsub_f32_x(svptrue_b32(), v359, v541); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v359, v542); svfloat32_t v495 = svnmls_f32_x(pred_full, v492, v185, v905); svfloat32_t v496 = svmla_f32_x(pred_full, v488, v211, v903); svfloat32_t v498 = svmla_f32_x(pred_full, v494, v212, v906); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v488)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v493), "w"(v494)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v508), "w"(v492)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v514), "w"(v511)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v429), "w"(v523)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v488); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v509 = 
svadd_f32_x(svptrue_b32(), v508, v492); + svfloat32_t v515 = svsub_f32_x(svptrue_b32(), v514, v511); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v429, v523); svfloat32_t v527 = svcmla_f32_x(pred_full, v519, v922, v243, 90); svfloat32_t v529 = svcmla_f32_x(pred_full, v525, v925, v244, 90); - svfloat32_t v532; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v532) : "w"(v531), "w"(v519)); - svfloat32_t v533; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v533) : "w"(v524), "w"(v525)); - svfloat32_t v540; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v539), "w"(v523)); - svfloat32_t v545; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v544), "w"(v542)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v496), "w"(v493)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v495)); + svfloat32_t v532 = svadd_f32_x(svptrue_b32(), v531, v519); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v524, v525); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v539, v523); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v544, v542); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v493); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v495); svfloat32_t v503 = svmla_f32_x(pred_full, v502, v180, v904); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v495)); - svfloat32_t v528; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v527), "w"(v524)); - svfloat32_t v530; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v529), "w"(v526)); + svfloat32_t v506 = svadd_f32_x(svptrue_b32(), v505, v495); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v527, v524); + svfloat32_t v530 = svadd_f32_x(svptrue_b32(), v529, v526); svfloat32_t v534 = svcmla_f32_x(pred_full, v533, v923, v221, 90); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v536), "w"(v526)); - svfloat32_t v551; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v509), "w"(v501)); - svfloat32_t v555; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v555) : "w"(v516), "w"(v509)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v501), "w"(v516)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v540), "w"(v532)); - svfloat32_t v567; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v540), "w"(v546)); - svfloat32_t v570; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v532), "w"(v546)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v492)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v506), "w"(v488)); - svfloat32_t v535; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v534), "w"(v523)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v537), "w"(v519)); - svfloat32_t v552; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v551), "w"(v516)); - svfloat32_t v556; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v497), "w"(v513)); - svfloat32_t v557; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v499), "w"(v515)); - svfloat32_t v564; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v563), "w"(v546)); - svfloat32_t v568; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v528), "w"(v543)); - svfloat32_t v569; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v530), "w"(v545)); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v558), "w"(v570)); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v558), "w"(v570)); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v555), "w"(v567)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v555), "w"(v567)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v504), "w"(v497)); - svfloat32_t v549; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v507), "w"(v499)); - svfloat32_t v553; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v513), "w"(v504)); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v515), "w"(v507)); - svfloat32_t v559; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v535), "w"(v528)); - svfloat32_t v561; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v538), "w"(v530)); - svfloat32_t v565; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v543), "w"(v535)); - svfloat32_t v566; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v545), "w"(v538)); - svfloat32_t v626; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v557), "w"(v569)); - svfloat32_t v634; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v557), "w"(v569)); - svfloat32_t v642; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v552), "w"(v564)); - svfloat32_t v650; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v650) : "w"(v552), "w"(v564)); - svfloat32_t v690; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v690) : "w"(v556), "w"(v568)); - svfloat32_t v698; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v698) : "w"(v556), "w"(v568)); + svfloat32_t v537 = svadd_f32_x(svptrue_b32(), v536, v526); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v509, v501); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v516, v509); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v501, v516); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v540, v532); + svfloat32_t v567 = svsub_f32_x(svptrue_b32(), v540, v546); + svfloat32_t v570 = svadd_f32_x(svptrue_b32(), v532, v546); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v503, v492); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v488); + svfloat32_t v535 = svadd_f32_x(svptrue_b32(), v534, v523); + svfloat32_t v538 = svadd_f32_x(svptrue_b32(), v537, v519); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v551, v516); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v497, v513); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v499, v515); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v563, v546); + svfloat32_t v568 = svadd_f32_x(svptrue_b32(), v528, v543); + svfloat32_t v569 = svadd_f32_x(svptrue_b32(), v530, v545); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v558, v570); + svfloat32_t v602 = 
svadd_f32_x(svptrue_b32(), v558, v570); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v555, v567); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v555, v567); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v504, v497); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v507, v499); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v513, v504); + svfloat32_t v554 = svsub_f32_x(svptrue_b32(), v515, v507); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v535, v528); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v538, v530); + svfloat32_t v565 = svsub_f32_x(svptrue_b32(), v543, v535); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v545, v538); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v557, v569); + svfloat32_t v634 = svsub_f32_x(svptrue_b32(), v557, v569); + svfloat32_t v642 = svadd_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v650 = svsub_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v690 = svsub_f32_x(svptrue_b32(), v556, v568); + svfloat32_t v698 = svadd_f32_x(svptrue_b32(), v556, v568); svst1_f64(pred_full, (double *)(v967), svreinterpret_f64_f32(v594)); svst1_f64(pred_full, (double *)(v976), svreinterpret_f64_f32(v602)); svst1_f64(pred_full, (double *)(v985), svreinterpret_f64_f32(v610)); svst1_f64(pred_full, (double *)(v994), svreinterpret_f64_f32(v618)); - svfloat32_t v548; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v547), "w"(v513)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v549), "w"(v515)); - svfloat32_t v560; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v559), "w"(v543)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v561), "w"(v545)); - svfloat32_t v658; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v554), "w"(v566)); - svfloat32_t v666; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v666) : "w"(v554), "w"(v566)); - svfloat32_t v674; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v674) : "w"(v553), "w"(v565)); - svfloat32_t v682; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v682) : "w"(v553), "w"(v565)); + 
svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v547, v513); + svfloat32_t v550 = svadd_f32_x(svptrue_b32(), v549, v515); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v559, v543); + svfloat32_t v562 = svadd_f32_x(svptrue_b32(), v561, v545); + svfloat32_t v658 = svadd_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v666 = svsub_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v674 = svadd_f32_x(svptrue_b32(), v553, v565); + svfloat32_t v682 = svsub_f32_x(svptrue_b32(), v553, v565); svst1_f64(pred_full, (double *)(v1003), svreinterpret_f64_f32(v626)); svst1_f64(pred_full, (double *)(v1012), svreinterpret_f64_f32(v634)); svst1_f64(pred_full, (double *)(v1021), svreinterpret_f64_f32(v642)); svst1_f64(pred_full, (double *)(v1030), svreinterpret_f64_f32(v650)); svst1_f64(pred_full, (double *)(v1075), svreinterpret_f64_f32(v690)); svst1_f64(pred_full, (double *)(v1084), svreinterpret_f64_f32(v698)); - svfloat32_t v578; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v578) : "w"(v548), "w"(v560)); - svfloat32_t v586; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v548), "w"(v560)); - svfloat32_t v706; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v706) : "w"(v550), "w"(v562)); - svfloat32_t v714; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v550), "w"(v562)); + svfloat32_t v578 = svadd_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v586 = svsub_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v706 = svadd_f32_x(svptrue_b32(), v550, v562); + svfloat32_t v714 = svsub_f32_x(svptrue_b32(), v550, v562); svst1_f64(pred_full, (double *)(v1039), svreinterpret_f64_f32(v658)); svst1_f64(pred_full, (double *)(v1048), svreinterpret_f64_f32(v666)); svst1_f64(pred_full, (double *)(v1057), svreinterpret_f64_f32(v674)); @@ -9911,239 +8994,136 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu20(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v748[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), 
"w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v72; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v72) : "w"(v63), "w"(v71)); - svfloat32_t v73; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v73) : "w"(v63), "w"(v71)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v81), "w"(v89)); - svfloat32_t v91; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v81), "w"(v89)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v101), "w"(v109)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v101), "w"(v109)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v119), "w"(v127)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v119), "w"(v127)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v139), "w"(v147)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v139), "w"(v147)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v157), "w"(v165)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v157), "w"(v165)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v177), "w"(v185)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v177), "w"(v185)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v195), "w"(v203)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v195), "w"(v203)); - svfloat32_t v54; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v54) : "w"(v34), "w"(v52)); - svfloat32_t v55; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v55) : "w"(v34), "w"(v52)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v72), "w"(v90)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v72), "w"(v90)); - 
svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v110), "w"(v128)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v110), "w"(v128)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v148), "w"(v166)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v148), "w"(v166)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v186), "w"(v204)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v186), "w"(v204)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v73), "w"(v187)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v73), "w"(v187)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v149), "w"(v111)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v149), "w"(v111)); - svfloat32_t v367; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v91), "w"(v205)); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v91), "w"(v205)); - svfloat32_t v369; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v167), "w"(v129)); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v167), "w"(v129)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v92), "w"(v206)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v92), "w"(v206)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v168), "w"(v130)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v168), "w"(v130)); - svfloat32_t v261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v93), "w"(v207)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v93), "w"(v207)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v169), "w"(v131)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v169), "w"(v131)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : 
"w"(v314), "w"(v316)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v314), "w"(v316)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v315), "w"(v317)); - svfloat32_t zero343; - asm volatile("mov %0.s, #0" : "=w"(zero343)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v72 = svadd_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v73 = svsub_f32_x(svptrue_b32(), v63, v71); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v81, v89); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v101, v109); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v119, v127); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v139, v147); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v157, v165); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v177, v185); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v195, v203); + svfloat32_t v54 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v55 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v72, v90); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v110, v128); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v148, v166); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v186, 
v204); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v186, v204); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v73, v187); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v73, v187); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v149, v111); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v149, v111); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v91, v205); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v91, v205); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v167, v129); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v167, v129); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v92, v206); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v92, v206); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v168, v130); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v168, v130); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v93, v207); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v93, v207); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v169, v131); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v169, v131); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v315, v317); + svfloat32_t zero343 = svdup_n_f32(0); svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v766, v315, 90); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v367), "w"(v369)); - svfloat32_t v372; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v367), "w"(v369)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v368), "w"(v370)); - svfloat32_t v410; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v370), "w"(v774)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v208), "w"(v210)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v208), "w"(v210)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v209), "w"(v211)); - svfloat32_t zero237; - asm volatile("mov %0.s, #0" : 
"=w"(zero237)); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v372 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v368, v370); + svfloat32_t v410 = svmul_f32_x(svptrue_b32(), v370, v774); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v209, v211); + svfloat32_t zero237 = svdup_n_f32(0); svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v766, v209, 90); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v261), "w"(v263)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v261), "w"(v263)); - svfloat32_t v267; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v262), "w"(v264)); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t v265 = svadd_f32_x(svptrue_b32(), v261, v263); + svfloat32_t v266 = svsub_f32_x(svptrue_b32(), v261, v263); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v262, v264); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v766, v262, 90); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v318), "w"(v35)); - svfloat32_t zero350; - asm volatile("mov %0.s, #0" : "=w"(zero350)); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v318, v35); + svfloat32_t zero350 = svdup_n_f32(0); svfloat32_t v350 = svcmla_f32_x(pred_full, zero350, v767, v320, 90); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v371), "w"(v53)); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v371, v53); + svfloat32_t zero395 = svdup_n_f32(0); svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v771, v372, 90); - svfloat32_t v405; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v373), "w"(v773)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v212), 
"w"(v54)); - svfloat32_t zero244; - asm volatile("mov %0.s, #0" : "=w"(zero244)); + svfloat32_t v405 = svmul_f32_x(svptrue_b32(), v373, v773); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v212, v54); + svfloat32_t zero244 = svdup_n_f32(0); svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v767, v214, 90); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v265), "w"(v55)); - svfloat32_t zero297; - asm volatile("mov %0.s, #0" : "=w"(zero297)); + svfloat32_t v268 = svadd_f32_x(svptrue_b32(), v265, v55); + svfloat32_t zero297 = svdup_n_f32(0); svfloat32_t v297 = svcmla_f32_x(pred_full, zero297, v767, v267, 90); svfloat32_t v358 = svmla_f32_x(pred_full, v321, v318, v764); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v343), "w"(v350)); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v343, v350); svfloat32_t v362 = svcmla_f32_x(pred_full, v350, v768, v317, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); + svfloat32_t zero381 = svdup_n_f32(0); svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v769, v374, 90); svfloat32_t v414 = svnmls_f32_x(pred_full, v405, v368, v772); svfloat32_t v415 = svmla_f32_x(pred_full, v410, v373, v773); svfloat32_t v252 = svmla_f32_x(pred_full, v215, v212, v764); - svfloat32_t v255; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v237), "w"(v244)); + svfloat32_t v255 = svsub_f32_x(svptrue_b32(), v237, v244); svfloat32_t v256 = svcmla_f32_x(pred_full, v244, v768, v211, 90); svfloat32_t v305 = svmla_f32_x(pred_full, v268, v265, v764); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v290), "w"(v297)); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v290, v297); svfloat32_t v309 = svcmla_f32_x(pred_full, v297, v768, v264, 90); svfloat32_t v359 = svmla_f32_x(pred_full, v358, v319, v765); svfloat32_t v360 = svmls_f32_x(pred_full, v358, v319, v765); svfloat32_t v411 = svcmla_f32_x(pred_full, v381, v770, v371, 90); - svfloat32_t v420; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v420) : "w"(v321), "w"(v381)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v321), "w"(v381)); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v321, v381); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v321, v381); svst1_f64(pred_full, (double *)(v782), svreinterpret_f64_f32(v215)); svst1_f64(pred_full, (double *)(v800), svreinterpret_f64_f32(v268)); svfloat32_t v253 = svmla_f32_x(pred_full, v252, v213, v765); svfloat32_t v254 = svmls_f32_x(pred_full, v252, v213, v765); svfloat32_t v306 = svmla_f32_x(pred_full, v305, v266, v765); svfloat32_t v307 = svmls_f32_x(pred_full, v305, v266, v765); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v359), "w"(v361)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v359), "w"(v361)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v360), "w"(v362)); - svfloat32_t v366; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v360), "w"(v362)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v411), "w"(v395)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v411), "w"(v395)); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v359, v361); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v360, v362); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v411, v395); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v411, v395); svst1_f64(pred_full, (double *)(v791), svreinterpret_f64_f32(v421)); svst1_f64(pred_full, (double *)(v809), svreinterpret_f64_f32(v420)); - svfloat32_t v257; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v253), "w"(v255)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v253), "w"(v255)); - svfloat32_t v259; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v254), "w"(v256)); - svfloat32_t v260; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v260) : "w"(v254), "w"(v256)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v306), "w"(v308)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v306), "w"(v308)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v307), "w"(v309)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v412), "w"(v414)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v412), "w"(v414)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v413), "w"(v415)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v413), "w"(v415)); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v364), "w"(v417)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v364), "w"(v417)); - svfloat32_t v480; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v366), "w"(v419)); - svfloat32_t v481; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v366), "w"(v419)); - svfloat32_t v510; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v365), "w"(v418)); - svfloat32_t v511; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v365), "w"(v418)); - svfloat32_t v540; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v363), "w"(v416)); - svfloat32_t v541; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v541) : "w"(v363), "w"(v416)); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v253, v255); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v253, v255); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v254, v256); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v412, v414); + 
svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v412, v414); + svfloat32_t v418 = svadd_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v413, v415); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v364, v417); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v364, v417); + svfloat32_t v480 = svadd_f32_x(svptrue_b32(), v366, v419); + svfloat32_t v481 = svsub_f32_x(svptrue_b32(), v366, v419); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v365, v418); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v365, v418); + svfloat32_t v540 = svadd_f32_x(svptrue_b32(), v363, v416); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v363, v416); svst1_f64(pred_full, (double *)(v818), svreinterpret_f64_f32(v258)); svst1_f64(pred_full, (double *)(v836), svreinterpret_f64_f32(v311)); svst1_f64(pred_full, (double *)(v854), svreinterpret_f64_f32(v260)); @@ -11183,215 +10163,119 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v831[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v60)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v52), "w"(v60)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v106), "w"(v114)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v114)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v169) : "w"(v160), "w"(v168)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v160), "w"(v168)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v187), "w"(v195)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v187), "w"(v195)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t v71; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v70)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v88), "w"(v97)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v124)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v142), "w"(v151)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v178)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v196), "w"(v205)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v61), "w"(v196)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v61), "w"(v196)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v142), "w"(v115)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v142), "w"(v115)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v88), "w"(v169)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v88), "w"(v169)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v62), "w"(v197)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v62), "w"(v197)); - svfloat32_t v387; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v143), "w"(v116)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v143), "w"(v116)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v89), "w"(v170)); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v89), "w"(v170)); - svfloat32_t v207; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v207) : "w"(v71), "w"(v206)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v71), "w"(v206)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v152), "w"(v125)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v152), "w"(v125)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v98), "w"(v179)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v98), "w"(v179)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v296), "w"(v298)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v296), "w"(v298)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v298), "w"(v300)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v300), "w"(v296)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v297), "w"(v299)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v297), "w"(v299)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v299), "w"(v301)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v301), "w"(v297)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v385), "w"(v387)); - svfloat32_t v394; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v385), "w"(v387)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v387), "w"(v389)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v389), "w"(v385)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v386), "w"(v388)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v386), "w"(v388)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v388), "w"(v390)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v390), "w"(v386)); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v207), "w"(v209)); - svfloat32_t v216; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v207), "w"(v209)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v209), "w"(v211)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v211), "w"(v207)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v208), "w"(v210)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v208), "w"(v210)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v210), "w"(v212)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v212), "w"(v208)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v302), "w"(v300)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v308), "w"(v301)); - svfloat32_t zero351; - asm volatile("mov %0.s, #0" : "=w"(zero351)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v179 = 
svadd_f32_x(svptrue_b32(), v169, v178); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v196, v205); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v61, v196); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v61, v196); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v142, v115); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v142, v115); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v88, v169); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v88, v169); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v62, v197); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v62, v197); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v143, v116); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v143, v116); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v89, v170); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v89, v170); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v71, v206); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v71, v206); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v152, v125); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v152, v125); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v98, v179); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v98, v179); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v296, v298); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v300, v296); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v297, v299); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v297, v299); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v312 = svsub_f32_x(svptrue_b32(), v301, v297); + svfloat32_t v391 = svadd_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v385, v387); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v387, v389); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v389, v385); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v399 
= svsub_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v388, v390); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v390, v386); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v216 = svsub_f32_x(svptrue_b32(), v207, v209); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v209, v211); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v211, v207); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v208, v210); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v212, v208); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v302, v300); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v301); + svfloat32_t zero351 = svdup_n_f32(0); svfloat32_t v351 = svcmla_f32_x(pred_full, zero351, v849, v310, 90); - svfloat32_t zero358; - asm volatile("mov %0.s, #0" : "=w"(zero358)); + svfloat32_t zero358 = svdup_n_f32(0); svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v850, v311, 90); - svfloat32_t zero365; - asm volatile("mov %0.s, #0" : "=w"(zero365)); + svfloat32_t zero365 = svdup_n_f32(0); svfloat32_t v365 = svcmla_f32_x(pred_full, zero365, v851, v312, 90); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v389)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v397), "w"(v390)); - svfloat32_t zero422; - asm volatile("mov %0.s, #0" : "=w"(zero422)); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v391, v389); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v397, v390); + svfloat32_t zero422 = svdup_n_f32(0); svfloat32_t v422 = svcmla_f32_x(pred_full, zero422, v854, v394, 90); - svfloat32_t zero429; - asm volatile("mov %0.s, #0" : "=w"(zero429)); + svfloat32_t zero429 = svdup_n_f32(0); svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v855, v395, 90); - svfloat32_t zero436; - asm volatile("mov %0.s, #0" : "=w"(zero436)); + svfloat32_t zero436 = 
svdup_n_f32(0); svfloat32_t v436 = svcmla_f32_x(pred_full, zero436, v856, v396, 90); - svfloat32_t v446; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v399), "w"(v858)); - svfloat32_t v451; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v400), "w"(v859)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v213), "w"(v211)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v219), "w"(v212)); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); + svfloat32_t v446 = svmul_f32_x(svptrue_b32(), v399, v858); + svfloat32_t v451 = svmul_f32_x(svptrue_b32(), v400, v859); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v213, v211); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v219, v212); + svfloat32_t zero262 = svdup_n_f32(0); svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v840, v221, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v841, v222, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); + svfloat32_t zero276 = svdup_n_f32(0); svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v842, v223, 90); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v303), "w"(v34)); - svfloat32_t v322; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v303), "w"(v844)); - svfloat32_t zero344; - asm volatile("mov %0.s, #0" : "=w"(zero344)); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v303, v34); + svfloat32_t v322 = svmul_f32_x(svptrue_b32(), v303, v844); + svfloat32_t zero344 = svdup_n_f32(0); svfloat32_t v344 = svcmla_f32_x(pred_full, zero344, v848, v309, 90); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v392), "w"(v35)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v214), "w"(v44)); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v392, 
v35); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v214, v44); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v839, v220, 90); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v344), "w"(v351)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v344), "w"(v351)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v344), "w"(v358)); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v344, v351); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v344, v351); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v344, v358); + svfloat32_t zero408 = svdup_n_f32(0); svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v852, v393, 90); svfloat32_t v464 = svmla_f32_x(pred_full, v446, v398, v857); svfloat32_t v466 = svnmls_f32_x(pred_full, v446, v398, v857); svfloat32_t v468 = svnmls_f32_x(pred_full, v451, v398, v857); svfloat32_t v277 = svmla_f32_x(pred_full, v215, v214, v835); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v255), "w"(v262)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v255), "w"(v262)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v255), "w"(v269)); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v255, v262); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v255, v269); svfloat32_t v366 = svmla_f32_x(pred_full, v322, v304, v843); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v373), "w"(v358)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v375), "w"(v365)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v377), "w"(v365)); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v373, v358); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v375, v365); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v377, 
v365); svfloat32_t v457 = svcmla_f32_x(pred_full, v408, v853, v392, 90); svfloat32_t v465 = svmla_f32_x(pred_full, v464, v400, v859); svfloat32_t v467 = svmls_f32_x(pred_full, v466, v401, v860); @@ -11401,117 +10285,70 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu21(const armral_cmplx_int16_t *restrict x, svfloat32_t v278 = svmla_f32_x(pred_full, v277, v216, v836); svfloat32_t v280 = svmls_f32_x(pred_full, v277, v216, v836); svfloat32_t v282 = svmls_f32_x(pred_full, v277, v217, v837); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v269)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v286), "w"(v276)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), "w"(v276)); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v269); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v286, v276); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v288, v276); svfloat32_t v367 = svmla_f32_x(pred_full, v366, v305, v845); svfloat32_t v369 = svmls_f32_x(pred_full, v366, v305, v845); svfloat32_t v371 = svmls_f32_x(pred_full, v366, v306, v846); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v457), "w"(v422)); - svfloat32_t v460; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v457), "w"(v422)); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v457), "w"(v429)); - svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v476), "w"(v408)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v476), "w"(v408)); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v457, v422); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v457, v422); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v457, v429); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v476, v408); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v476, v408); svfloat32_t v279 = svmla_f32_x(pred_full, v278, v217, v837); svfloat32_t v281 = svmls_f32_x(pred_full, v280, v218, v838); svfloat32_t v283 
= svmla_f32_x(pred_full, v282, v218, v838); svfloat32_t v368 = svmla_f32_x(pred_full, v367, v306, v846); svfloat32_t v370 = svmls_f32_x(pred_full, v369, v307, v847); svfloat32_t v372 = svmla_f32_x(pred_full, v371, v307, v847); - svfloat32_t v459; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v458), "w"(v429)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v460), "w"(v436)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v462), "w"(v436)); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v458, v429); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v460, v436); + svfloat32_t v463 = svadd_f32_x(svptrue_b32(), v462, v436); svst1_f64(pred_full, (double *)(v877), svreinterpret_f64_f32(v478)); svst1_f64(pred_full, (double *)(v886), svreinterpret_f64_f32(v477)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v279), "w"(v285)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v279), "w"(v285)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v281), "w"(v287)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v281), "w"(v287)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v283), "w"(v289)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v283), "w"(v289)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v368), "w"(v374)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v368), "w"(v374)); - svfloat32_t v381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v370), "w"(v376)); - svfloat32_t v382; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v370), "w"(v376)); - svfloat32_t v383; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v372), "w"(v378)); - svfloat32_t v384; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v372), "w"(v378)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v459), "w"(v465)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v471) : "w"(v459), "w"(v465)); - svfloat32_t v472; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v461), "w"(v467)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v461), "w"(v467)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v463), "w"(v469)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v463), "w"(v469)); - svfloat32_t v500; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v291), "w"(v380)); - svfloat32_t v524; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v293), "w"(v382)); - svfloat32_t v548; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v294), "w"(v383)); - svfloat32_t v572; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v572) : "w"(v295), "w"(v384)); - svfloat32_t v596; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v292), "w"(v381)); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v290), "w"(v379)); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v281, v287); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v281, v287); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v283, v289); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v370, v376); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v370, v376); + svfloat32_t v383 = svadd_f32_x(svptrue_b32(), v372, v378); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v372, v378); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v459, v465); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v459, v465); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v461, v467); + svfloat32_t v473 = svsub_f32_x(svptrue_b32(), v461, v467); + svfloat32_t v474 = svadd_f32_x(svptrue_b32(), v463, v469); + svfloat32_t v475 = 
svsub_f32_x(svptrue_b32(), v463, v469); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v291, v380); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v293, v382); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v294, v383); + svfloat32_t v572 = svadd_f32_x(svptrue_b32(), v295, v384); + svfloat32_t v596 = svadd_f32_x(svptrue_b32(), v292, v381); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v290, v379); svst1_f64(pred_full, (double *)(v895), svreinterpret_f64_f32(v291)); svst1_f64(pred_full, (double *)(v922), svreinterpret_f64_f32(v293)); svst1_f64(pred_full, (double *)(v949), svreinterpret_f64_f32(v294)); svst1_f64(pred_full, (double *)(v976), svreinterpret_f64_f32(v295)); svst1_f64(pred_full, (double *)(v1003), svreinterpret_f64_f32(v292)); svst1_f64(pred_full, (double *)(v1030), svreinterpret_f64_f32(v290)); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v471)); - svfloat32_t v502; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v500), "w"(v471)); - svfloat32_t v525; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v524), "w"(v473)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v524), "w"(v473)); - svfloat32_t v549; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v548), "w"(v474)); - svfloat32_t v550; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v548), "w"(v474)); - svfloat32_t v573; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v573) : "w"(v572), "w"(v475)); - svfloat32_t v574; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v572), "w"(v475)); - svfloat32_t v597; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v596), "w"(v472)); - svfloat32_t v598; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v596), "w"(v472)); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v620), "w"(v470)); - svfloat32_t v622; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v620), "w"(v470)); + svfloat32_t v501 = svadd_f32_x(svptrue_b32(), v500, v471); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v500, v471); 
+ svfloat32_t v525 = svadd_f32_x(svptrue_b32(), v524, v473); + svfloat32_t v526 = svsub_f32_x(svptrue_b32(), v524, v473); + svfloat32_t v549 = svadd_f32_x(svptrue_b32(), v548, v474); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v548, v474); + svfloat32_t v573 = svadd_f32_x(svptrue_b32(), v572, v475); + svfloat32_t v574 = svsub_f32_x(svptrue_b32(), v572, v475); + svfloat32_t v597 = svadd_f32_x(svptrue_b32(), v596, v472); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v596, v472); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v620, v470); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v620, v470); svst1_f64(pred_full, (double *)(v904), svreinterpret_f64_f32(v502)); svst1_f64(pred_full, (double *)(v913), svreinterpret_f64_f32(v501)); svst1_f64(pred_full, (double *)(v931), svreinterpret_f64_f32(v526)); @@ -12629,240 +11466,128 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v984[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v169), "w"(v177)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v177)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v187), "w"(v195)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v187), "w"(v195)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v205), "w"(v213)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v205), "w"(v213)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v52), "w"(v214)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v70), "w"(v196)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v88), "w"(v178)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v106), "w"(v160)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v124), "w"(v142)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v52), "w"(v214)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v70), "w"(v196)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v88), "w"(v178)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v106), "w"(v160)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v124), "w"(v142)); - svfloat32_t v425; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v53), "w"(v215)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v71), "w"(v197)); - svfloat32_t 
v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v89), "w"(v179)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v107), "w"(v161)); - svfloat32_t v429; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v125), "w"(v143)); - svfloat32_t v430; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v53), "w"(v215)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v71), "w"(v197)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v89), "w"(v179)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v107), "w"(v161)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) : "w"(v125), "w"(v143)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v216), "w"(v217)); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v218), "w"(v220)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v222), "w"(v223)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v221), "w"(v225)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v217), "w"(v219)); - svfloat32_t v236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v216), "w"(v219)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v217), "w"(v216)); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v220), "w"(v219)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v218), "w"(v219)); - svfloat32_t v240; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v220), "w"(v218)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v217), "w"(v220)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v216), "w"(v218)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v222), "w"(v224)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v221), "w"(v224)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v221), 
"w"(v222)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v224), "w"(v225)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v223), "w"(v224)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v223), "w"(v225)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v222), "w"(v225)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v221), "w"(v223)); - svfloat32_t v435; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v425), "w"(v426)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v427), "w"(v429)); - svfloat32_t v438; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v438) : "w"(v431), "w"(v432)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v430), "w"(v434)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v426), "w"(v428)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v425), "w"(v428)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v426), "w"(v425)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v429), "w"(v428)); - svfloat32_t v448; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v427), "w"(v428)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v429), "w"(v427)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v426), "w"(v429)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v425), "w"(v427)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v431), "w"(v433)); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v430), "w"(v433)); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v430), "w"(v431)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v433), "w"(v434)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v432), "w"(v433)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v458) : "w"(v432), "w"(v434)); - svfloat32_t v459; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v431), "w"(v434)); - svfloat32_t v460; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v430), "w"(v432)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v219), "w"(v226)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v229), "w"(v230)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v227), "w"(v226)); - svfloat32_t v252; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v229), "w"(v230)); - svfloat32_t v279; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v236), "w"(v1012)); - svfloat32_t v284; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v237), "w"(v1013)); - svfloat32_t v294; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v239), "w"(v1015)); - svfloat32_t v299; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v240), "w"(v1016)); - svfloat32_t zero321; - asm volatile("mov %0.s, #0" : "=w"(zero321)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v71 = svsub_f32_x(svptrue_b32(), v61, v69); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v151, v159); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v169, 
v177); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v169, v177); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v205, v213); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v52, v214); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v70, v196); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v88, v178); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v220 = svadd_f32_x(svptrue_b32(), v124, v142); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v52, v214); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v70, v196); + svfloat32_t v223 = svsub_f32_x(svptrue_b32(), v88, v178); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v106, v160); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v124, v142); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v53, v215); + svfloat32_t v426 = svadd_f32_x(svptrue_b32(), v71, v197); + svfloat32_t v427 = svadd_f32_x(svptrue_b32(), v89, v179); + svfloat32_t v428 = svadd_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v429 = svadd_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v53, v215); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v71, v197); + svfloat32_t v432 = svsub_f32_x(svptrue_b32(), v89, v179); + svfloat32_t v433 = svsub_f32_x(svptrue_b32(), v107, v161); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v125, v143); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v216, v217); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v222, v223); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v221, v225); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v217, v219); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v216, v219); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v217, v216); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), 
v220, v219); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v218, v219); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v220, v218); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v217, v220); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v216, v218); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v222, v224); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v221, v224); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v221, v222); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v224, v225); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v223, v224); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v223, v225); + svfloat32_t v250 = svadd_f32_x(svptrue_b32(), v222, v225); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v221, v223); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v425, v426); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v431, v432); + svfloat32_t v439 = svadd_f32_x(svptrue_b32(), v430, v434); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v425, v428); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v426, v425); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v429, v428); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v427, v428); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v429, v427); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v426, v429); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v425, v427); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v431, v433); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v430, v433); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v430, v431); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v433, v434); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v432, v433); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v432, v434); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v431, v434); + svfloat32_t v460 = svsub_f32_x(svptrue_b32(), v430, v432); + svfloat32_t v228 = 
svadd_f32_x(svptrue_b32(), v219, v226); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v229, v230); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v227, v226); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v229, v230); + svfloat32_t v279 = svmul_f32_x(svptrue_b32(), v236, v1012); + svfloat32_t v284 = svmul_f32_x(svptrue_b32(), v237, v1013); + svfloat32_t v294 = svmul_f32_x(svptrue_b32(), v239, v1015); + svfloat32_t v299 = svmul_f32_x(svptrue_b32(), v240, v1016); + svfloat32_t zero321 = svdup_n_f32(0); svfloat32_t v321 = svcmla_f32_x(pred_full, zero321, v1020, v244, 90); - svfloat32_t zero335; - asm volatile("mov %0.s, #0" : "=w"(zero335)); + svfloat32_t zero335 = svdup_n_f32(0); svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, v1022, v246, 90); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); + svfloat32_t zero342 = svdup_n_f32(0); svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v1023, v247, 90); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); + svfloat32_t zero356 = svdup_n_f32(0); svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v1025, v249, 90); - svfloat32_t zero363; - asm volatile("mov %0.s, #0" : "=w"(zero363)); + svfloat32_t zero363 = svdup_n_f32(0); svfloat32_t v363 = svcmla_f32_x(pred_full, zero363, v1026, v250, 90); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v428), "w"(v435)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v438), "w"(v439)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v436), "w"(v435)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v438), "w"(v439)); - svfloat32_t v488; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v445), "w"(v1012)); - svfloat32_t v493; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v446), "w"(v1013)); - svfloat32_t v503; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v448), "w"(v1015)); - svfloat32_t v508; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v449), 
"w"(v1016)); - svfloat32_t zero530; - asm volatile("mov %0.s, #0" : "=w"(zero530)); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v428, v435); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v438, v439); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v436, v435); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v438, v439); + svfloat32_t v488 = svmul_f32_x(svptrue_b32(), v445, v1012); + svfloat32_t v493 = svmul_f32_x(svptrue_b32(), v446, v1013); + svfloat32_t v503 = svmul_f32_x(svptrue_b32(), v448, v1015); + svfloat32_t v508 = svmul_f32_x(svptrue_b32(), v449, v1016); + svfloat32_t zero530 = svdup_n_f32(0); svfloat32_t v530 = svcmla_f32_x(pred_full, zero530, v1020, v453, 90); - svfloat32_t zero544; - asm volatile("mov %0.s, #0" : "=w"(zero544)); + svfloat32_t zero544 = svdup_n_f32(0); svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1022, v455, 90); - svfloat32_t zero551; - asm volatile("mov %0.s, #0" : "=w"(zero551)); + svfloat32_t zero551 = svdup_n_f32(0); svfloat32_t v551 = svcmla_f32_x(pred_full, zero551, v1023, v456, 90); - svfloat32_t zero565; - asm volatile("mov %0.s, #0" : "=w"(zero565)); + svfloat32_t zero565 = svdup_n_f32(0); svfloat32_t v565 = svcmla_f32_x(pred_full, zero565, v1025, v458, 90); - svfloat32_t zero572; - asm volatile("mov %0.s, #0" : "=w"(zero572)); + svfloat32_t zero572 = svdup_n_f32(0); svfloat32_t v572 = svcmla_f32_x(pred_full, zero572, v1026, v459, 90); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v228), "w"(v227)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v233), "w"(v224)); - svfloat32_t v314; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v243), "w"(v1019)); - svfloat32_t zero377; - asm volatile("mov %0.s, #0" : "=w"(zero377)); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v228, v227); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v233, v224); + svfloat32_t v314 = svmul_f32_x(svptrue_b32(), v243, v1019); + svfloat32_t zero377 = svdup_n_f32(0); svfloat32_t v377 = 
svcmla_f32_x(pred_full, zero377, v1028, v252, 90); svfloat32_t v379 = svmla_f32_x(pred_full, v279, v235, v1011); svfloat32_t v380 = svmla_f32_x(pred_full, v284, v236, v1012); @@ -12871,19 +11596,13 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, svfloat32_t v383 = svmla_f32_x(pred_full, v299, v239, v1015); svfloat32_t v384 = svnmls_f32_x(pred_full, v299, v238, v1014); svfloat32_t v387 = svcmla_f32_x(pred_full, v335, v1021, v245, 90); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v321), "w"(v335)); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v321, v335); svfloat32_t v389 = svcmla_f32_x(pred_full, v356, v1024, v248, 90); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v342), "w"(v356)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v437), "w"(v436)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v442), "w"(v433)); - svfloat32_t v523; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v452), "w"(v1019)); - svfloat32_t zero586; - asm volatile("mov %0.s, #0" : "=w"(zero586)); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v342, v356); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v437, v436); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v442, v433); + svfloat32_t v523 = svmul_f32_x(svptrue_b32(), v452, v1019); + svfloat32_t zero586 = svdup_n_f32(0); svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1028, v461, 90); svfloat32_t v588 = svmla_f32_x(pred_full, v488, v444, v1011); svfloat32_t v589 = svmla_f32_x(pred_full, v493, v445, v1012); @@ -12892,163 +11611,91 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu22(const armral_cmplx_int16_t *restrict x, svfloat32_t v592 = svmla_f32_x(pred_full, v508, v448, v1015); svfloat32_t v593 = svnmls_f32_x(pred_full, v508, v447, v1014); svfloat32_t v596 = svcmla_f32_x(pred_full, v544, v1021, v454, 90); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v530), "w"(v544)); + svfloat32_t v597 
= svsub_f32_x(svptrue_b32(), v530, v544); svfloat32_t v598 = svcmla_f32_x(pred_full, v565, v1024, v457, 90); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v551), "w"(v565)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v34), "w"(v231)); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); + svfloat32_t v599 = svsub_f32_x(svptrue_b32(), v551, v565); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v34, v231); + svfloat32_t zero269 = svdup_n_f32(0); svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v1010, v234, 90); svfloat32_t v385 = svmla_f32_x(pred_full, v314, v242, v1018); svfloat32_t v386 = svmla_f32_x(pred_full, v314, v241, v1017); svfloat32_t v391 = svcmla_f32_x(pred_full, v377, v1027, v251, 90); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v363), "w"(v377)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v387), "w"(v388)); - svfloat32_t v441; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v35), "w"(v440)); - svfloat32_t zero478; - asm volatile("mov %0.s, #0" : "=w"(zero478)); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v363, v377); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v387, v388); + svfloat32_t v441 = svadd_f32_x(svptrue_b32(), v35, v440); + svfloat32_t zero478 = svdup_n_f32(0); svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v1010, v443, 90); svfloat32_t v594 = svmla_f32_x(pred_full, v523, v451, v1018); svfloat32_t v595 = svmla_f32_x(pred_full, v523, v450, v1017); svfloat32_t v600 = svcmla_f32_x(pred_full, v586, v1027, v460, 90); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v572), "w"(v586)); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v596), "w"(v597)); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v572, v586); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v596, v597); svfloat32_t v378 = svmls_f32_x(pred_full, v232, v231, v1009); - svfloat32_t v393; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v393) : "w"(v383), "w"(v385)); - svfloat32_t v403; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v269), "w"(v389)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v391), "w"(v387)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v269), "w"(v392)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v392), "w"(v388)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v411), "w"(v389)); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v383, v385); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v269, v389); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v391, v387); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v269, v392); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v392, v388); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v411, v389); svfloat32_t v587 = svmls_f32_x(pred_full, v441, v440, v1009); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v592), "w"(v594)); - svfloat32_t v612; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v478), "w"(v598)); - svfloat32_t v614; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v600), "w"(v596)); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v478), "w"(v601)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v601), "w"(v597)); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v620), "w"(v598)); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v592, v594); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v478, v598); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v600, v596); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v478, v601); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v601, v597); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v620, v598); svst1_f64(pred_full, (double *)(v1036), svreinterpret_f64_f32(v232)); svst1_f64(pred_full, (double *)(v1045), svreinterpret_f64_f32(v441)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v394) : "w"(v393), "w"(v378)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v378), "w"(v380)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v378), "w"(v384)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v378), "w"(v381)); - svfloat32_t v401; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v378), "w"(v379)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v403), "w"(v391)); - svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v405), "w"(v269)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v407), "w"(v390)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v409), "w"(v269)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v390)); - svfloat32_t v603; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v602), "w"(v587)); - svfloat32_t v604; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v587), "w"(v589)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v587), "w"(v593)); - svfloat32_t v608; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v587), "w"(v590)); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v587), "w"(v588)); - svfloat32_t v613; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v612), "w"(v600)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v614), "w"(v478)); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v616), "w"(v599)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v618), "w"(v478)); - svfloat32_t v622; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v621), "w"(v599)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v395), "w"(v385)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v397), "w"(v386)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v399), "w"(v386)); - svfloat32_t v402; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v401), "w"(v382)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v269)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v394), "w"(v404)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v394), "w"(v404)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v604), "w"(v594)); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v606), "w"(v595)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v595)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v610), "w"(v591)); - svfloat32_t v623; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v622), "w"(v478)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v603), "w"(v613)); - svfloat32_t v632; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v632) : "w"(v603), "w"(v613)); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v402), "w"(v414)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v396), "w"(v406)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v398), "w"(v408)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v400), "w"(v410)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v400), "w"(v410)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v398), "w"(v408)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v396), "w"(v406)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v402), "w"(v414)); - svfloat32_t v624; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v611), "w"(v623)); - svfloat32_t v626; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v605), "w"(v615)); - svfloat32_t v627; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v607), "w"(v617)); - svfloat32_t v628; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v628) : "w"(v609), 
"w"(v619)); - svfloat32_t v629; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v629) : "w"(v609), "w"(v619)); - svfloat32_t v630; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v630) : "w"(v607), "w"(v617)); - svfloat32_t v631; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v605), "w"(v615)); - svfloat32_t v633; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v633) : "w"(v611), "w"(v623)); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v378); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v378, v384); + svfloat32_t v399 = svsub_f32_x(svptrue_b32(), v378, v381); + svfloat32_t v401 = svadd_f32_x(svptrue_b32(), v378, v379); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v403, v391); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v405, v269); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v390); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v409, v269); + svfloat32_t v413 = svadd_f32_x(svptrue_b32(), v412, v390); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v602, v587); + svfloat32_t v604 = svsub_f32_x(svptrue_b32(), v587, v589); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v587, v593); + svfloat32_t v608 = svsub_f32_x(svptrue_b32(), v587, v590); + svfloat32_t v610 = svadd_f32_x(svptrue_b32(), v587, v588); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v612, v600); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v614, v478); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v616, v599); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v618, v478); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v621, v599); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v395, v385); + svfloat32_t v398 = svadd_f32_x(svptrue_b32(), v397, v386); + svfloat32_t v400 = svsub_f32_x(svptrue_b32(), v399, v386); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v401, v382); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v413, v269); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v394, v404); + svfloat32_t v423 = svsub_f32_x(svptrue_b32(), 
v394, v404); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v604, v594); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v606, v595); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v608, v595); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v610, v591); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v622, v478); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v632 = svsub_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v402, v414); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v398, v408); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v400, v410); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v400, v410); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v398, v408); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v424 = svsub_f32_x(svptrue_b32(), v402, v414); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v611, v623); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v607, v617); + svfloat32_t v628 = svadd_f32_x(svptrue_b32(), v609, v619); + svfloat32_t v629 = svsub_f32_x(svptrue_b32(), v609, v619); + svfloat32_t v630 = svadd_f32_x(svptrue_b32(), v607, v617); + svfloat32_t v631 = svsub_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v633 = svsub_f32_x(svptrue_b32(), v611, v623); svst1_f64(pred_full, (double *)(v1072), svreinterpret_f64_f32(v423)); svst1_f64(pred_full, (double *)(v1081), svreinterpret_f64_f32(v632)); svst1_f64(pred_full, (double *)(v1198), svreinterpret_f64_f32(v416)); @@ -13927,271 +12574,152 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v860[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), 
"w"(v33)); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v60)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v52), "w"(v60)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v106), "w"(v114)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v114)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v160), "w"(v168)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v160), "w"(v168)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v187), "w"(v195)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v187), "w"(v195)); - svfloat32_t v223; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v214), "w"(v222)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v214), "w"(v222)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t v71; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v70)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v88), "w"(v97)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v124)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v142), "w"(v151)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v178)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v196), "w"(v205)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v223), "w"(v232)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v34), 
"w"(v142)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v34), "w"(v142)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v88), "w"(v196)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v88), "w"(v196)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v61), "w"(v169)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v61), "w"(v169)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v115), "w"(v223)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v115), "w"(v223)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v35), "w"(v143)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v35), "w"(v143)); - svfloat32_t v380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v89), "w"(v197)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v89), "w"(v197)); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v62), "w"(v170)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v62), "w"(v170)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v116), "w"(v224)); - svfloat32_t v385; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v116), "w"(v224)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v44), "w"(v152)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v44), "w"(v152)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v98), "w"(v206)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v98), "w"(v206)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v71), "w"(v179)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v71), "w"(v179)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v125), "w"(v233)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : 
"w"(v125), "w"(v233)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v306), "w"(v308)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v306), "w"(v308)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v310), "w"(v312)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v310), "w"(v312)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v311), "w"(v313)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v311), "w"(v313)); - svfloat32_t zero355; - asm volatile("mov %0.s, #0" : "=w"(zero355)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v52, v60); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v79, v87); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v133, v141); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v187, v195); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v214, v222); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v214, v222); + svfloat32_t v44 = svadd_f32_x(svptrue_b32(), v34, v43); + svfloat32_t v71 = svadd_f32_x(svptrue_b32(), v61, v70); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v88, v97); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v115, v124); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v142, v151); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v169, v178); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v196, v205); + 
svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v223, v232); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v34, v142); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v88, v196); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v88, v196); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v61, v169); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v61, v169); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v115, v223); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v115, v223); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v35, v143); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v89, v197); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v89, v197); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v62, v170); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v62, v170); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v116, v224); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v116, v224); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v44, v152); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v44, v152); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v98, v206); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v98, v206); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v71, v179); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v125, v233); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v125, v233); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v311, v313); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v311, v313); + svfloat32_t zero355 = svdup_n_f32(0); svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v876, v309, 90); - svfloat32_t v386; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v378), "w"(v380)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v378), "w"(v380)); - svfloat32_t v388; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v382), "w"(v384)); - svfloat32_t v389; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v382), "w"(v384)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v383), "w"(v385)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v383), "w"(v385)); - svfloat32_t zero426; - asm volatile("mov %0.s, #0" : "=w"(zero426)); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v383, v385); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v383, v385); + svfloat32_t zero426 = svdup_n_f32(0); svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v883, v379, 90); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v234), "w"(v236)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v234), "w"(v236)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v238), "w"(v240)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v238), "w"(v240)); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v239), "w"(v241)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v239), "w"(v241)); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v238, v240); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v238, v240); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v239, v241); + svfloat32_t v249 = 
svsub_f32_x(svptrue_b32(), v239, v241); + svfloat32_t zero283 = svdup_n_f32(0); svfloat32_t v283 = svcmla_f32_x(pred_full, zero283, v868, v237, 90); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v314), "w"(v316)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v314), "w"(v316)); - svfloat32_t zero343; - asm volatile("mov %0.s, #0" : "=w"(zero343)); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v314, v316); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v314, v316); + svfloat32_t zero343 = svdup_n_f32(0); svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v876, v317, 90); - svfloat32_t zero362; - asm volatile("mov %0.s, #0" : "=w"(zero362)); + svfloat32_t zero362 = svdup_n_f32(0); svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v877, v320, 90); - svfloat32_t v367; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v321), "w"(v878)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v386), "w"(v388)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v386), "w"(v388)); - svfloat32_t zero414; - asm volatile("mov %0.s, #0" : "=w"(zero414)); + svfloat32_t v367 = svmul_f32_x(svptrue_b32(), v321, v878); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v386, v388); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v386, v388); + svfloat32_t zero414 = svdup_n_f32(0); svfloat32_t v414 = svcmla_f32_x(pred_full, zero414, v883, v387, 90); - svfloat32_t v436; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v392), "w"(v885)); - svfloat32_t zero443; - asm volatile("mov %0.s, #0" : "=w"(zero443)); + svfloat32_t v436 = svmul_f32_x(svptrue_b32(), v392, v885); + svfloat32_t zero443 = svdup_n_f32(0); svfloat32_t v443 = svcmla_f32_x(pred_full, zero443, v886, v393, 90); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v242), "w"(v244)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v242), "w"(v244)); - svfloat32_t zero271; - asm volatile("mov %0.s, #0" : 
"=w"(zero271)); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v242, v244); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v242, v244); + svfloat32_t zero271 = svdup_n_f32(0); svfloat32_t v271 = svcmla_f32_x(pred_full, zero271, v868, v245, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); + svfloat32_t zero290 = svdup_n_f32(0); svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v869, v248, 90); svfloat32_t v368 = svmla_f32_x(pred_full, v343, v315, v875); svfloat32_t v369 = svnmls_f32_x(pred_full, v343, v315, v875); svfloat32_t v370 = svmla_f32_x(pred_full, v367, v307, v875); svfloat32_t v371 = svnmls_f32_x(pred_full, v367, v307, v875); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v355), "w"(v362)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v355), "w"(v362)); - svfloat32_t zero400; - asm volatile("mov %0.s, #0" : "=w"(zero400)); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v355, v362); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v355, v362); + svfloat32_t zero400 = svdup_n_f32(0); svfloat32_t v400 = svcmla_f32_x(pred_full, zero400, v883, v390, 90); - svfloat32_t zero407; - asm volatile("mov %0.s, #0" : "=w"(zero407)); + svfloat32_t zero407 = svdup_n_f32(0); svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v883, v391, 90); svfloat32_t v444 = svmla_f32_x(pred_full, v414, v389, v884); svfloat32_t v445 = svmls_f32_x(pred_full, v414, v389, v884); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v426), "w"(v443)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v426), "w"(v443)); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v426, v443); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v426, v443); svfloat32_t v448 = svmla_f32_x(pred_full, v436, v381, v884); svfloat32_t v449 = svnmls_f32_x(pred_full, v436, v381, v884); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v243), "w"(v271)); - svfloat32_t v297; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v297) : "w"(v243), "w"(v271)); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v243, v271); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v243, v271); svfloat32_t v298 = svmla_f32_x(pred_full, v235, v249, v870); svfloat32_t v299 = svmls_f32_x(pred_full, v235, v249, v870); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v283), "w"(v290)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v283), "w"(v290)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v370), "w"(v372)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v370), "w"(v372)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v371), "w"(v373)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v371), "w"(v373)); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v446), "w"(v448)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v446), "w"(v448)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v447), "w"(v449)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v447), "w"(v449)); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v283, v290); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v370, v372); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v371, v373); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v446, v448); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v447, v449); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v447, v449); svfloat32_t v454 = svmla_f32_x(pred_full, v246, v318, v875); svfloat32_t v550 = svmla_f32_x(pred_full, v247, v319, v875); svst1_f64(pred_full, (double *)(v894), svreinterpret_f64_f32(v246)); svst1_f64(pred_full, 
(double *)(v1002), svreinterpret_f64_f32(v247)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v298), "w"(v300)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v298), "w"(v300)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v299), "w"(v301)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v299), "w"(v301)); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v454), "w"(v400)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v454), "w"(v400)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v297), "w"(v369)); - svfloat32_t v551; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v550), "w"(v407)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v550), "w"(v407)); - svfloat32_t v598; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v296), "w"(v368)); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v298, v300); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v454, v400); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v454, v400); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v297, v369); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v550, v407); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v550, v407); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v296, v368); svst1_f64(pred_full, (double *)(v948), svreinterpret_f64_f32(v297)); svst1_f64(pred_full, (double *)(v1056), svreinterpret_f64_f32(v296)); - svfloat32_t v478; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v303), "w"(v375)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v445)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v502), "w"(v445)); - svfloat32_t v526; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v526) : "w"(v304), "w"(v376)); - svfloat32_t v574; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v574) : "w"(v305), "w"(v377)); - svfloat32_t v599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v598), "w"(v444)); - svfloat32_t v600; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v600) : "w"(v598), "w"(v444)); - svfloat32_t v622; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v302), "w"(v374)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v303, v375); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v445); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v502, v445); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v304, v376); + svfloat32_t v574 = svadd_f32_x(svptrue_b32(), v305, v377); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v444); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v598, v444); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v302, v374); svst1_f64(pred_full, (double *)(v903), svreinterpret_f64_f32(v456)); svst1_f64(pred_full, (double *)(v912), svreinterpret_f64_f32(v455)); svst1_f64(pred_full, (double *)(v921), svreinterpret_f64_f32(v303)); @@ -14200,22 +12728,14 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu24(const armral_cmplx_int16_t *restrict x, svst1_f64(pred_full, (double *)(v1020), svreinterpret_f64_f32(v551)); svst1_f64(pred_full, (double *)(v1029), svreinterpret_f64_f32(v305)); svst1_f64(pred_full, (double *)(v1083), svreinterpret_f64_f32(v302)); - svfloat32_t v479; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v479) : "w"(v478), "w"(v451)); - svfloat32_t v480; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v480) : "w"(v478), "w"(v451)); - svfloat32_t v527; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v527) : "w"(v526), "w"(v452)); - svfloat32_t v528; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v526), "w"(v452)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v574), "w"(v453)); - svfloat32_t v576; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v574), "w"(v453)); - svfloat32_t v623; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v623) : 
"w"(v622), "w"(v450)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v622), "w"(v450)); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v478, v451); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v478, v451); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v526, v452); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v526, v452); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v574, v453); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v574, v453); + svfloat32_t v623 = svadd_f32_x(svptrue_b32(), v622, v450); + svfloat32_t v624 = svsub_f32_x(svptrue_b32(), v622, v450); svst1_f64(pred_full, (double *)(v957), svreinterpret_f64_f32(v504)); svst1_f64(pred_full, (double *)(v966), svreinterpret_f64_f32(v503)); svst1_f64(pred_full, (double *)(v1065), svreinterpret_f64_f32(v600)); @@ -14245,7 +12765,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, int64_t v12 = howmany - 1; int64_t v1726 = howmany / 2; for (int j = 0; j < v12; j += 2) { - float v941 = 0.0000000000000000e+00F; float v1055 = 9.6858316112863108e-01F; float v1059 = -2.4868988716485479e-01F; float v1060 = 2.4868988716485479e-01F; @@ -14279,7 +12798,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float v1714 = 2.0000000000000000e+00F; const int32_t *v3205 = &v5[istride]; float32x2_t *v3431 = &v6[ostride]; - float v944 = dir * v941; float32x2_t v1056 = (float32x2_t){v1055, v1055}; float32x2_t v1061 = (float32x2_t){v1059, v1060}; float32x2_t v1224 = (float32x2_t){v1223, v1223}; @@ -14304,9 +12822,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v1715 = (float32x2_t){v1714, v1714}; const int32_t *v3160 = &v5[0]; float32x2_t *v3386 = &v6[0]; - int16x4_t v3616 = vld1_s16((const int16_t *)v3205); - float32x4_t v201 = vcvtq_n_f32_s32(vmovl_s16(v3616), 15); - float32x2_t v942 = (float32x2_t){v941, v944}; + int16x4_t v3664 = vld1_s16((const int16_t *)v3205); + float32x4_t v201 
= vcvtq_n_f32_s32(vmovl_s16(v3664), 15); float32x4_t v1057 = vcombine_f32(v1056, v1056); float32x2_t v1063 = vmul_f32(v1687, v1061); float32x4_t v1225 = vcombine_f32(v1224, v1224); @@ -14375,9 +12892,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t *v3584 = &v6[ostride * 14]; float32x2_t *v3593 = &v6[ostride * 19]; float32x2_t *v3602 = &v6[ostride * 24]; - int16x4_t v3606 = vld1_s16((const int16_t *)v3160); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v3606), 15); - float32x4_t v946 = vcombine_f32(v942, v942); + int16x4_t v3654 = vld1_s16((const int16_t *)v3160); + float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v3654), 15); float32x4_t v1065 = vcombine_f32(v1063, v1063); float32x4_t v1233 = vcombine_f32(v1231, v1231); float32x4_t v1401 = vcombine_f32(v1399, v1399); @@ -14388,132 +12904,72 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1597 = vcombine_f32(v1595, v1595); float32x4_t v1618 = vcombine_f32(v1616, v1616); float32x4_t v1690 = vcombine_f32(v1688, v1688); - int16x4_t v3608 = vld1_s16((const int16_t *)v3169); - int16x4_t v3610 = vld1_s16((const int16_t *)v3178); - int16x4_t v3612 = vld1_s16((const int16_t *)v3187); - int16x4_t v3614 = vld1_s16((const int16_t *)v3196); - int16x4_t v3618 = vld1_s16((const int16_t *)v3214); - int16x4_t v3620 = vld1_s16((const int16_t *)v3223); - int16x4_t v3622 = vld1_s16((const int16_t *)v3232); - int16x4_t v3624 = vld1_s16((const int16_t *)v3241); - int16x4_t v3626 = vld1_s16((const int16_t *)v3250); - int16x4_t v3628 = vld1_s16((const int16_t *)v3259); - int16x4_t v3630 = vld1_s16((const int16_t *)v3268); - int16x4_t v3632 = vld1_s16((const int16_t *)v3277); - int16x4_t v3634 = vld1_s16((const int16_t *)v3286); - int16x4_t v3636 = vld1_s16((const int16_t *)v3295); - int16x4_t v3638 = vld1_s16((const int16_t *)v3304); - int16x4_t v3640 = vld1_s16((const int16_t *)v3313); - int16x4_t v3642 = vld1_s16((const int16_t *)v3322); - 
int16x4_t v3644 = vld1_s16((const int16_t *)v3331); - int16x4_t v3646 = vld1_s16((const int16_t *)v3340); - int16x4_t v3648 = vld1_s16((const int16_t *)v3349); - int16x4_t v3650 = vld1_s16((const int16_t *)v3358); - int16x4_t v3652 = vld1_s16((const int16_t *)v3367); - int16x4_t v3654 = vld1_s16((const int16_t *)v3376); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v3608), 15); - float32x4_t v44 = vcvtq_n_f32_s32(vmovl_s16(v3610), 15); - float32x4_t v52 = vcvtq_n_f32_s32(vmovl_s16(v3612), 15); - float32x4_t v60 = vcvtq_n_f32_s32(vmovl_s16(v3614), 15); - float32x4_t v209 = vcvtq_n_f32_s32(vmovl_s16(v3618), 15); - float32x4_t v217 = vcvtq_n_f32_s32(vmovl_s16(v3620), 15); - float32x4_t v225 = vcvtq_n_f32_s32(vmovl_s16(v3622), 15); - float32x4_t v233 = vcvtq_n_f32_s32(vmovl_s16(v3624), 15); - float32x4_t v374 = vcvtq_n_f32_s32(vmovl_s16(v3626), 15); - float32x4_t v382 = vcvtq_n_f32_s32(vmovl_s16(v3628), 15); - float32x4_t v390 = vcvtq_n_f32_s32(vmovl_s16(v3630), 15); - float32x4_t v398 = vcvtq_n_f32_s32(vmovl_s16(v3632), 15); - float32x4_t v406 = vcvtq_n_f32_s32(vmovl_s16(v3634), 15); - float32x4_t v547 = vcvtq_n_f32_s32(vmovl_s16(v3636), 15); - float32x4_t v555 = vcvtq_n_f32_s32(vmovl_s16(v3638), 15); - float32x4_t v563 = vcvtq_n_f32_s32(vmovl_s16(v3640), 15); - float32x4_t v571 = vcvtq_n_f32_s32(vmovl_s16(v3642), 15); - float32x4_t v579 = vcvtq_n_f32_s32(vmovl_s16(v3644), 15); - float32x4_t v720 = vcvtq_n_f32_s32(vmovl_s16(v3646), 15); - float32x4_t v728 = vcvtq_n_f32_s32(vmovl_s16(v3648), 15); - float32x4_t v736 = vcvtq_n_f32_s32(vmovl_s16(v3650), 15); - float32x4_t v744 = vcvtq_n_f32_s32(vmovl_s16(v3652), 15); - float32x4_t v752 = vcvtq_n_f32_s32(vmovl_s16(v3654), 15); - float32x4_t v71 = vrev64q_f32(v36); - float32x4_t v85 = vrev64q_f32(v44); - float32x4_t v99 = vrev64q_f32(v60); - float32x4_t v120 = vrev64q_f32(v52); - float32x4_t v244 = vrev64q_f32(v209); - float32x4_t v258 = vrev64q_f32(v217); - float32x4_t v272 = vrev64q_f32(v233); - float32x4_t v293 = 
vrev64q_f32(v225); - float32x4_t v417 = vrev64q_f32(v382); - float32x4_t v431 = vrev64q_f32(v390); - float32x4_t v445 = vrev64q_f32(v406); - float32x4_t v466 = vrev64q_f32(v398); - float32x4_t v590 = vrev64q_f32(v555); - float32x4_t v604 = vrev64q_f32(v563); - float32x4_t v618 = vrev64q_f32(v579); - float32x4_t v639 = vrev64q_f32(v571); - float32x4_t v763 = vrev64q_f32(v728); - float32x4_t v777 = vrev64q_f32(v736); - float32x4_t v791 = vrev64q_f32(v752); - float32x4_t v812 = vrev64q_f32(v744); - float32x4_t v73 = vmulq_f32(v71, v946); - float32x4_t v87 = vmulq_f32(v85, v946); - float32x4_t v101 = vmulq_f32(v99, v946); - float32x4_t v122 = vmulq_f32(v120, v946); - float32x4_t v246 = vmulq_f32(v244, v946); - float32x4_t v260 = vmulq_f32(v258, v946); - float32x4_t v274 = vmulq_f32(v272, v946); - float32x4_t v295 = vmulq_f32(v293, v946); - float32x4_t v419 = vmulq_f32(v417, v946); - float32x4_t v433 = vmulq_f32(v431, v946); - float32x4_t v447 = vmulq_f32(v445, v946); - float32x4_t v468 = vmulq_f32(v466, v946); - float32x4_t v592 = vmulq_f32(v590, v946); - float32x4_t v606 = vmulq_f32(v604, v946); - float32x4_t v620 = vmulq_f32(v618, v946); - float32x4_t v641 = vmulq_f32(v639, v946); - float32x4_t v765 = vmulq_f32(v763, v946); - float32x4_t v779 = vmulq_f32(v777, v946); - float32x4_t v793 = vmulq_f32(v791, v946); - float32x4_t v814 = vmulq_f32(v812, v946); - float32x4_t v74 = vaddq_f32(v73, v36); - float32x4_t v88 = vaddq_f32(v87, v44); - float32x4_t v102 = vaddq_f32(v101, v60); - float32x4_t v123 = vaddq_f32(v122, v52); - float32x4_t v247 = vaddq_f32(v246, v209); - float32x4_t v261 = vaddq_f32(v260, v217); - float32x4_t v275 = vaddq_f32(v274, v233); - float32x4_t v296 = vaddq_f32(v295, v225); - float32x4_t v420 = vaddq_f32(v419, v382); - float32x4_t v434 = vaddq_f32(v433, v390); - float32x4_t v448 = vaddq_f32(v447, v406); - float32x4_t v469 = vaddq_f32(v468, v398); - float32x4_t v593 = vaddq_f32(v592, v555); - float32x4_t v607 = vaddq_f32(v606, v563); - float32x4_t 
v621 = vaddq_f32(v620, v579); - float32x4_t v642 = vaddq_f32(v641, v571); - float32x4_t v766 = vaddq_f32(v765, v728); - float32x4_t v780 = vaddq_f32(v779, v736); - float32x4_t v794 = vaddq_f32(v793, v752); - float32x4_t v815 = vaddq_f32(v814, v744); - float32x4_t v103 = vsubq_f32(v74, v102); - float32x4_t v108 = vmulq_f32(v74, v1716); - float32x4_t v124 = vsubq_f32(v88, v123); - float32x4_t v129 = vmulq_f32(v88, v1716); - float32x4_t v276 = vsubq_f32(v247, v275); - float32x4_t v281 = vmulq_f32(v247, v1716); - float32x4_t v297 = vsubq_f32(v261, v296); - float32x4_t v302 = vmulq_f32(v261, v1716); - float32x4_t v449 = vsubq_f32(v420, v448); - float32x4_t v454 = vmulq_f32(v420, v1716); - float32x4_t v470 = vsubq_f32(v434, v469); - float32x4_t v475 = vmulq_f32(v434, v1716); - float32x4_t v622 = vsubq_f32(v593, v621); - float32x4_t v627 = vmulq_f32(v593, v1716); - float32x4_t v643 = vsubq_f32(v607, v642); - float32x4_t v648 = vmulq_f32(v607, v1716); - float32x4_t v795 = vsubq_f32(v766, v794); - float32x4_t v800 = vmulq_f32(v766, v1716); - float32x4_t v816 = vsubq_f32(v780, v815); - float32x4_t v821 = vmulq_f32(v780, v1716); + int16x4_t v3656 = vld1_s16((const int16_t *)v3169); + int16x4_t v3658 = vld1_s16((const int16_t *)v3178); + int16x4_t v3660 = vld1_s16((const int16_t *)v3187); + int16x4_t v3662 = vld1_s16((const int16_t *)v3196); + int16x4_t v3666 = vld1_s16((const int16_t *)v3214); + int16x4_t v3668 = vld1_s16((const int16_t *)v3223); + int16x4_t v3670 = vld1_s16((const int16_t *)v3232); + int16x4_t v3672 = vld1_s16((const int16_t *)v3241); + int16x4_t v3674 = vld1_s16((const int16_t *)v3250); + int16x4_t v3676 = vld1_s16((const int16_t *)v3259); + int16x4_t v3678 = vld1_s16((const int16_t *)v3268); + int16x4_t v3680 = vld1_s16((const int16_t *)v3277); + int16x4_t v3682 = vld1_s16((const int16_t *)v3286); + int16x4_t v3684 = vld1_s16((const int16_t *)v3295); + int16x4_t v3686 = vld1_s16((const int16_t *)v3304); + int16x4_t v3688 = vld1_s16((const int16_t *)v3313); 
+ int16x4_t v3690 = vld1_s16((const int16_t *)v3322); + int16x4_t v3692 = vld1_s16((const int16_t *)v3331); + int16x4_t v3694 = vld1_s16((const int16_t *)v3340); + int16x4_t v3696 = vld1_s16((const int16_t *)v3349); + int16x4_t v3698 = vld1_s16((const int16_t *)v3358); + int16x4_t v3700 = vld1_s16((const int16_t *)v3367); + int16x4_t v3702 = vld1_s16((const int16_t *)v3376); + float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v3656), 15); + float32x4_t v44 = vcvtq_n_f32_s32(vmovl_s16(v3658), 15); + float32x4_t v52 = vcvtq_n_f32_s32(vmovl_s16(v3660), 15); + float32x4_t v60 = vcvtq_n_f32_s32(vmovl_s16(v3662), 15); + float32x4_t v209 = vcvtq_n_f32_s32(vmovl_s16(v3666), 15); + float32x4_t v217 = vcvtq_n_f32_s32(vmovl_s16(v3668), 15); + float32x4_t v225 = vcvtq_n_f32_s32(vmovl_s16(v3670), 15); + float32x4_t v233 = vcvtq_n_f32_s32(vmovl_s16(v3672), 15); + float32x4_t v374 = vcvtq_n_f32_s32(vmovl_s16(v3674), 15); + float32x4_t v382 = vcvtq_n_f32_s32(vmovl_s16(v3676), 15); + float32x4_t v390 = vcvtq_n_f32_s32(vmovl_s16(v3678), 15); + float32x4_t v398 = vcvtq_n_f32_s32(vmovl_s16(v3680), 15); + float32x4_t v406 = vcvtq_n_f32_s32(vmovl_s16(v3682), 15); + float32x4_t v547 = vcvtq_n_f32_s32(vmovl_s16(v3684), 15); + float32x4_t v555 = vcvtq_n_f32_s32(vmovl_s16(v3686), 15); + float32x4_t v563 = vcvtq_n_f32_s32(vmovl_s16(v3688), 15); + float32x4_t v571 = vcvtq_n_f32_s32(vmovl_s16(v3690), 15); + float32x4_t v579 = vcvtq_n_f32_s32(vmovl_s16(v3692), 15); + float32x4_t v720 = vcvtq_n_f32_s32(vmovl_s16(v3694), 15); + float32x4_t v728 = vcvtq_n_f32_s32(vmovl_s16(v3696), 15); + float32x4_t v736 = vcvtq_n_f32_s32(vmovl_s16(v3698), 15); + float32x4_t v744 = vcvtq_n_f32_s32(vmovl_s16(v3700), 15); + float32x4_t v752 = vcvtq_n_f32_s32(vmovl_s16(v3702), 15); + float32x4_t v103 = vsubq_f32(v36, v60); + float32x4_t v108 = vmulq_f32(v36, v1716); + float32x4_t v124 = vsubq_f32(v44, v52); + float32x4_t v129 = vmulq_f32(v44, v1716); + float32x4_t v276 = vsubq_f32(v209, v233); + float32x4_t v281 = 
vmulq_f32(v209, v1716); + float32x4_t v297 = vsubq_f32(v217, v225); + float32x4_t v302 = vmulq_f32(v217, v1716); + float32x4_t v449 = vsubq_f32(v382, v406); + float32x4_t v454 = vmulq_f32(v382, v1716); + float32x4_t v470 = vsubq_f32(v390, v398); + float32x4_t v475 = vmulq_f32(v390, v1716); + float32x4_t v622 = vsubq_f32(v555, v579); + float32x4_t v627 = vmulq_f32(v555, v1716); + float32x4_t v643 = vsubq_f32(v563, v571); + float32x4_t v648 = vmulq_f32(v563, v1716); + float32x4_t v795 = vsubq_f32(v728, v752); + float32x4_t v800 = vmulq_f32(v728, v1716); + float32x4_t v816 = vsubq_f32(v736, v744); + float32x4_t v821 = vmulq_f32(v736, v1716); float32x4_t v109 = vsubq_f32(v108, v103); float32x4_t v130 = vsubq_f32(v129, v124); float32x4_t v143 = vmulq_f32(v124, v1657); @@ -14594,10 +13050,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v830 = vsubq_f32(v720, v829); float32x4_t v863 = vmulq_f32(v861, v1690); float32x4_t v872 = vmulq_f32(v870, v1690); - float32x4_t v896 = vrev64q_f32(v336); - float32x4_t v910 = vrev64q_f32(v509); - float32x4_t v924 = vrev64q_f32(v855); - float32x4_t v945 = vrev64q_f32(v682); + float32x4_t v928 = vsubq_f32(v336, v855); + float32x4_t v933 = vmulq_f32(v336, v1716); + float32x4_t v949 = vsubq_f32(v509, v682); + float32x4_t v954 = vmulq_f32(v509, v1716); float32x4_t v150 = vsubq_f32(v138, v149); float32x4_t v155 = vmulq_f32(v138, v1716); float32x4_t v323 = vsubq_f32(v311, v322); @@ -14608,10 +13064,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v674 = vmulq_f32(v657, v1716); float32x4_t v842 = vsubq_f32(v830, v841); float32x4_t v847 = vmulq_f32(v830, v1716); - float32x4_t v898 = vmulq_f32(v896, v946); - float32x4_t v912 = vmulq_f32(v910, v946); - float32x4_t v926 = vmulq_f32(v924, v946); - float32x4_t v947 = vmulq_f32(v945, v946); + float32x4_t v934 = vsubq_f32(v933, v928); + float32x4_t v955 = vsubq_f32(v954, v949); + float32x4_t v968 = 
vmulq_f32(v949, v1657); + float32x4_t v986 = vmulq_f32(v928, v1657); float32x4_t v156 = vsubq_f32(v155, v150); float32x4_t v181 = vsubq_f32(v150, v180); float32x4_t v186 = vmulq_f32(v150, v1716); @@ -14627,10 +13083,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v848 = vsubq_f32(v847, v842); float32x4_t v873 = vsubq_f32(v842, v872); float32x4_t v878 = vmulq_f32(v842, v1716); - float32x4_t v899 = vaddq_f32(v898, v336); - float32x4_t v913 = vaddq_f32(v912, v509); - float32x4_t v927 = vaddq_f32(v926, v855); - float32x4_t v948 = vaddq_f32(v947, v682); + float32x4_t v956 = vaddq_f32(v934, v955); + float32x4_t v957 = vsubq_f32(v934, v955); + float32x4_t v969 = vaddq_f32(v928, v968); + float32x4_t v987 = vsubq_f32(v986, v949); float32x4_t v172 = vsubq_f32(v156, v171); float32x4_t v187 = vsubq_f32(v186, v181); float32x4_t v192 = vmulq_f32(v156, v1716); @@ -14646,10 +13102,11 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v864 = vsubq_f32(v848, v863); float32x4_t v879 = vsubq_f32(v878, v873); float32x4_t v884 = vmulq_f32(v848, v1716); - float32x4_t v928 = vsubq_f32(v899, v927); - float32x4_t v933 = vmulq_f32(v899, v1716); - float32x4_t v949 = vsubq_f32(v913, v948); - float32x4_t v954 = vmulq_f32(v913, v1716); + float32x4_t v962 = vmulq_f32(v956, v1633); + float32x4_t v974 = vmulq_f32(v957, v1645); + float32x4_t v988 = vaddq_f32(v163, v956); + float32x4_t v1001 = vrev64q_f32(v969); + float32x4_t v1017 = vrev64q_f32(v987); float32x4_t v1232 = vrev64q_f32(v354); float32x4_t v1246 = vrev64q_f32(v527); float32x4_t v1260 = vrev64q_f32(v873); @@ -14659,10 +13116,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v539 = vsubq_f32(v538, v518); float32x4_t v712 = vsubq_f32(v711, v691); float32x4_t v885 = vsubq_f32(v884, v864); - float32x4_t v934 = vsubq_f32(v933, v928); - float32x4_t v955 = vsubq_f32(v954, v949); - float32x4_t v968 = 
vmulq_f32(v949, v1657); - float32x4_t v986 = vmulq_f32(v928, v1657); + float32x4_t v963 = vsubq_f32(v163, v962); + float32x4_t v1003 = vmulq_f32(v1001, v1690); + float32x4_t v1019 = vmulq_f32(v1017, v1690); float32x4_t v1064 = vrev64q_f32(v345); float32x4_t v1078 = vrev64q_f32(v518); float32x4_t v1092 = vrev64q_f32(v864); @@ -14675,10 +13131,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1414 = vrev64q_f32(v533); float32x4_t v1428 = vrev64q_f32(v879); float32x4_t v1449 = vrev64q_f32(v706); - float32x4_t v956 = vaddq_f32(v934, v955); - float32x4_t v957 = vsubq_f32(v934, v955); - float32x4_t v969 = vaddq_f32(v928, v968); - float32x4_t v987 = vsubq_f32(v986, v949); + vst1q_f32((float32_t *)v3386, v988); + float32x4_t v975 = vsubq_f32(v963, v974); + float32x4_t v980 = vmulq_f32(v963, v1716); float32x4_t v1066 = vmulq_f32(v1064, v1065); float32x4_t v1080 = vmulq_f32(v1078, v1233); float32x4_t v1094 = vmulq_f32(v1092, v1569); @@ -14695,11 +13150,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1582 = vrev64q_f32(v539); float32x4_t v1596 = vrev64q_f32(v885); float32x4_t v1617 = vrev64q_f32(v712); - float32x4_t v962 = vmulq_f32(v956, v1633); - float32x4_t v974 = vmulq_f32(v957, v1645); - float32x4_t v988 = vaddq_f32(v163, v956); - float32x4_t v1001 = vrev64q_f32(v969); - float32x4_t v1017 = vrev64q_f32(v987); + float32x4_t v981 = vsubq_f32(v980, v975); + float32x4_t v1020 = vsubq_f32(v975, v1019); + float32x4_t v1032 = vmulq_f32(v975, v1716); float32x4_t v1067 = vfmaq_f32(v1066, v345, v1057); float32x4_t v1081 = vfmaq_f32(v1080, v518, v1225); float32x4_t v1095 = vfmaq_f32(v1094, v864, v1561); @@ -14716,9 +13169,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1584 = vmulq_f32(v1582, v1583); float32x4_t v1598 = vmulq_f32(v1596, v1597); float32x4_t v1619 = vmulq_f32(v1617, v1618); - float32x4_t v963 = vsubq_f32(v163, v962); - 
float32x4_t v1003 = vmulq_f32(v1001, v1690); - float32x4_t v1019 = vmulq_f32(v1017, v1690); + float32x4_t v1004 = vsubq_f32(v981, v1003); + float32x4_t v1033 = vsubq_f32(v1032, v1020); + float32x4_t v1045 = vmulq_f32(v981, v1716); float32x4_t v1096 = vsubq_f32(v1067, v1095); float32x4_t v1101 = vmulq_f32(v1067, v1716); float32x4_t v1117 = vsubq_f32(v1081, v1116); @@ -14735,9 +13188,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1585 = vfmaq_f32(v1584, v539, v1575); float32x4_t v1599 = vfmaq_f32(v1598, v885, v1589); float32x4_t v1620 = vfmaq_f32(v1619, v712, v1610); - vst1q_f32((float32_t *)v3386, v988); - float32x4_t v975 = vsubq_f32(v963, v974); - float32x4_t v980 = vmulq_f32(v963, v1716); + vst1q_f32((float32_t *)v3404, v1020); + float32x4_t v1046 = vsubq_f32(v1045, v1004); float32x4_t v1102 = vsubq_f32(v1101, v1096); float32x4_t v1123 = vsubq_f32(v1122, v1117); float32x4_t v1136 = vmulq_f32(v1117, v1657); @@ -14754,9 +13206,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1605 = vmulq_f32(v1571, v1716); float32x4_t v1621 = vsubq_f32(v1585, v1620); float32x4_t v1626 = vmulq_f32(v1585, v1716); - float32x4_t v981 = vsubq_f32(v980, v975); - float32x4_t v1020 = vsubq_f32(v975, v1019); - float32x4_t v1032 = vmulq_f32(v975, v1716); + vst1q_f32((float32_t *)v3395, v1004); + vst1q_f32((float32_t *)v3413, v1033); float32x4_t v1124 = vaddq_f32(v1102, v1123); float32x4_t v1125 = vsubq_f32(v1102, v1123); float32x4_t v1137 = vaddq_f32(v1096, v1136); @@ -14774,9 +13225,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1627 = vsubq_f32(v1626, v1621); float32x4_t v1640 = vmulq_f32(v1621, v1657); float32x4_t v1658 = vmulq_f32(v1600, v1657); - float32x4_t v1004 = vsubq_f32(v981, v1003); - float32x4_t v1033 = vsubq_f32(v1032, v1020); - float32x4_t v1045 = vmulq_f32(v981, v1716); + vst1q_f32((float32_t *)v3422, v1046); float32x4_t 
v1130 = vmulq_f32(v1124, v1633); float32x4_t v1142 = vmulq_f32(v1125, v1645); float32x4_t v1156 = vaddq_f32(v172, v1124); @@ -14794,9 +13243,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1629 = vsubq_f32(v1606, v1627); float32x4_t v1641 = vaddq_f32(v1600, v1640); float32x4_t v1659 = vsubq_f32(v1658, v1621); - vst1q_f32((float32_t *)v3404, v1020); vst1q_f32((float32_t *)v3476, v1324); - float32x4_t v1046 = vsubq_f32(v1045, v1004); float32x4_t v1131 = vsubq_f32(v172, v1130); float32x4_t v1171 = vmulq_f32(v1169, v1690); float32x4_t v1187 = vmulq_f32(v1185, v1690); @@ -14810,8 +13257,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1660 = vaddq_f32(v193, v1628); float32x4_t v1673 = vrev64q_f32(v1641); float32x4_t v1689 = vrev64q_f32(v1659); - vst1q_f32((float32_t *)v3395, v1004); - vst1q_f32((float32_t *)v3413, v1033); vst1q_f32((float32_t *)v3431, v1156); vst1q_f32((float32_t *)v3521, v1492); float32x4_t v1143 = vsubq_f32(v1131, v1142); @@ -14824,7 +13269,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x4_t v1635 = vsubq_f32(v193, v1634); float32x4_t v1675 = vmulq_f32(v1673, v1690); float32x4_t v1691 = vmulq_f32(v1689, v1690); - vst1q_f32((float32_t *)v3422, v1046); vst1q_f32((float32_t *)v3566, v1660); float32x4_t v1149 = vsubq_f32(v1148, v1143); float32x4_t v1188 = vsubq_f32(v1143, v1187); @@ -14874,7 +13318,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, } for (int j = v1726 * 2; j < howmany; j += 1) { int16x4_t v1882 = vld1s_s16(&v5[istride]); - float v2501 = 0.0000000000000000e+00F; float v2594 = 9.6858316112863108e-01F; float v2597 = -2.4868988716485479e-01F; float v2598 = 2.4868988716485479e-01F; @@ -14908,7 +13351,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float v3140 = 2.0000000000000000e+00F; int16x4_t v1738 = vld1s_s16(&v5[0]); 
float32x2_t v1883 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1882)), 15); - float v2504 = dir * v2501; float32x2_t v2595 = (float32x2_t){v2594, v2594}; float32x2_t v2599 = (float32x2_t){v2597, v2598}; float32x2_t v2734 = (float32x2_t){v2733, v2733}; @@ -14955,7 +13397,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, int16x4_t v2326 = vld1s_s16(&v5[istride * 14]); int16x4_t v2332 = vld1s_s16(&v5[istride * 19]); int16x4_t v2338 = vld1s_s16(&v5[istride * 24]); - float32x2_t v2502 = (float32x2_t){v2501, v2504}; float32x2_t v2601 = vmul_f32(v3119, v2599); float32x2_t v2740 = vmul_f32(v3119, v2738); float32x2_t v2879 = vmul_f32(v3119, v2877); @@ -14989,86 +13430,26 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v2327 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2326)), 15); float32x2_t v2333 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2332)), 15); float32x2_t v2339 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2338)), 15); - float32x2_t v1773 = vrev64_f32(v1745); - float32x2_t v1785 = vrev64_f32(v1751); - float32x2_t v1797 = vrev64_f32(v1763); - float32x2_t v1815 = vrev64_f32(v1757); - float32x2_t v1917 = vrev64_f32(v1889); - float32x2_t v1929 = vrev64_f32(v1895); - float32x2_t v1941 = vrev64_f32(v1907); - float32x2_t v1959 = vrev64_f32(v1901); - float32x2_t v2061 = vrev64_f32(v2033); - float32x2_t v2073 = vrev64_f32(v2039); - float32x2_t v2085 = vrev64_f32(v2051); - float32x2_t v2103 = vrev64_f32(v2045); - float32x2_t v2205 = vrev64_f32(v2177); - float32x2_t v2217 = vrev64_f32(v2183); - float32x2_t v2229 = vrev64_f32(v2195); - float32x2_t v2247 = vrev64_f32(v2189); - float32x2_t v2349 = vrev64_f32(v2321); - float32x2_t v2361 = vrev64_f32(v2327); - float32x2_t v2373 = vrev64_f32(v2339); - float32x2_t v2391 = vrev64_f32(v2333); - float32x2_t v1774 = vmul_f32(v1773, v2502); - float32x2_t v1786 = vmul_f32(v1785, v2502); - float32x2_t v1798 = vmul_f32(v1797, v2502); - float32x2_t v1816 = vmul_f32(v1815, 
v2502); - float32x2_t v1918 = vmul_f32(v1917, v2502); - float32x2_t v1930 = vmul_f32(v1929, v2502); - float32x2_t v1942 = vmul_f32(v1941, v2502); - float32x2_t v1960 = vmul_f32(v1959, v2502); - float32x2_t v2062 = vmul_f32(v2061, v2502); - float32x2_t v2074 = vmul_f32(v2073, v2502); - float32x2_t v2086 = vmul_f32(v2085, v2502); - float32x2_t v2104 = vmul_f32(v2103, v2502); - float32x2_t v2206 = vmul_f32(v2205, v2502); - float32x2_t v2218 = vmul_f32(v2217, v2502); - float32x2_t v2230 = vmul_f32(v2229, v2502); - float32x2_t v2248 = vmul_f32(v2247, v2502); - float32x2_t v2350 = vmul_f32(v2349, v2502); - float32x2_t v2362 = vmul_f32(v2361, v2502); - float32x2_t v2374 = vmul_f32(v2373, v2502); - float32x2_t v2392 = vmul_f32(v2391, v2502); - float32x2_t v1775 = vadd_f32(v1774, v1745); - float32x2_t v1787 = vadd_f32(v1786, v1751); - float32x2_t v1799 = vadd_f32(v1798, v1763); - float32x2_t v1817 = vadd_f32(v1816, v1757); - float32x2_t v1919 = vadd_f32(v1918, v1889); - float32x2_t v1931 = vadd_f32(v1930, v1895); - float32x2_t v1943 = vadd_f32(v1942, v1907); - float32x2_t v1961 = vadd_f32(v1960, v1901); - float32x2_t v2063 = vadd_f32(v2062, v2033); - float32x2_t v2075 = vadd_f32(v2074, v2039); - float32x2_t v2087 = vadd_f32(v2086, v2051); - float32x2_t v2105 = vadd_f32(v2104, v2045); - float32x2_t v2207 = vadd_f32(v2206, v2177); - float32x2_t v2219 = vadd_f32(v2218, v2183); - float32x2_t v2231 = vadd_f32(v2230, v2195); - float32x2_t v2249 = vadd_f32(v2248, v2189); - float32x2_t v2351 = vadd_f32(v2350, v2321); - float32x2_t v2363 = vadd_f32(v2362, v2327); - float32x2_t v2375 = vadd_f32(v2374, v2339); - float32x2_t v2393 = vadd_f32(v2392, v2333); - float32x2_t v1800 = vsub_f32(v1775, v1799); - float32x2_t v1804 = vmul_f32(v1775, v3141); - float32x2_t v1818 = vsub_f32(v1787, v1817); - float32x2_t v1822 = vmul_f32(v1787, v3141); - float32x2_t v1944 = vsub_f32(v1919, v1943); - float32x2_t v1948 = vmul_f32(v1919, v3141); - float32x2_t v1962 = vsub_f32(v1931, v1961); - float32x2_t 
v1966 = vmul_f32(v1931, v3141); - float32x2_t v2088 = vsub_f32(v2063, v2087); - float32x2_t v2092 = vmul_f32(v2063, v3141); - float32x2_t v2106 = vsub_f32(v2075, v2105); - float32x2_t v2110 = vmul_f32(v2075, v3141); - float32x2_t v2232 = vsub_f32(v2207, v2231); - float32x2_t v2236 = vmul_f32(v2207, v3141); - float32x2_t v2250 = vsub_f32(v2219, v2249); - float32x2_t v2254 = vmul_f32(v2219, v3141); - float32x2_t v2376 = vsub_f32(v2351, v2375); - float32x2_t v2380 = vmul_f32(v2351, v3141); - float32x2_t v2394 = vsub_f32(v2363, v2393); - float32x2_t v2398 = vmul_f32(v2363, v3141); + float32x2_t v1800 = vsub_f32(v1745, v1763); + float32x2_t v1804 = vmul_f32(v1745, v3141); + float32x2_t v1818 = vsub_f32(v1751, v1757); + float32x2_t v1822 = vmul_f32(v1751, v3141); + float32x2_t v1944 = vsub_f32(v1889, v1907); + float32x2_t v1948 = vmul_f32(v1889, v3141); + float32x2_t v1962 = vsub_f32(v1895, v1901); + float32x2_t v1966 = vmul_f32(v1895, v3141); + float32x2_t v2088 = vsub_f32(v2033, v2051); + float32x2_t v2092 = vmul_f32(v2033, v3141); + float32x2_t v2106 = vsub_f32(v2039, v2045); + float32x2_t v2110 = vmul_f32(v2039, v3141); + float32x2_t v2232 = vsub_f32(v2177, v2195); + float32x2_t v2236 = vmul_f32(v2177, v3141); + float32x2_t v2250 = vsub_f32(v2183, v2189); + float32x2_t v2254 = vmul_f32(v2183, v3141); + float32x2_t v2376 = vsub_f32(v2321, v2339); + float32x2_t v2380 = vmul_f32(v2321, v3141); + float32x2_t v2394 = vsub_f32(v2327, v2333); + float32x2_t v2398 = vmul_f32(v2327, v3141); float32x2_t v1805 = vsub_f32(v1804, v1800); float32x2_t v1823 = vsub_f32(v1822, v1818); float32x2_t v1834 = vmul_f32(v1818, v3094); @@ -15149,10 +13530,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v2406 = vsub_f32(v2315, v2405); float32x2_t v2434 = vmul_f32(v2433, v3120); float32x2_t v2442 = vmul_f32(v2441, v3120); - float32x2_t v2463 = vrev64_f32(v1995); - float32x2_t v2475 = vrev64_f32(v2139); - float32x2_t v2487 = vrev64_f32(v2427); - 
float32x2_t v2505 = vrev64_f32(v2283); + float32x2_t v2490 = vsub_f32(v1995, v2427); + float32x2_t v2494 = vmul_f32(v1995, v3141); + float32x2_t v2508 = vsub_f32(v2139, v2283); + float32x2_t v2512 = vmul_f32(v2139, v3141); float32x2_t v1840 = vsub_f32(v1830, v1839); float32x2_t v1844 = vmul_f32(v1830, v3141); float32x2_t v1984 = vsub_f32(v1974, v1983); @@ -15163,10 +13544,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v2276 = vmul_f32(v2262, v3141); float32x2_t v2416 = vsub_f32(v2406, v2415); float32x2_t v2420 = vmul_f32(v2406, v3141); - float32x2_t v2464 = vmul_f32(v2463, v2502); - float32x2_t v2476 = vmul_f32(v2475, v2502); - float32x2_t v2488 = vmul_f32(v2487, v2502); - float32x2_t v2506 = vmul_f32(v2505, v2502); + float32x2_t v2495 = vsub_f32(v2494, v2490); + float32x2_t v2513 = vsub_f32(v2512, v2508); + float32x2_t v2524 = vmul_f32(v2508, v3094); + float32x2_t v2539 = vmul_f32(v2490, v3094); float32x2_t v1845 = vsub_f32(v1844, v1840); float32x2_t v1867 = vsub_f32(v1840, v1866); float32x2_t v1871 = vmul_f32(v1840, v3141); @@ -15182,10 +13563,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v2421 = vsub_f32(v2420, v2416); float32x2_t v2443 = vsub_f32(v2416, v2442); float32x2_t v2447 = vmul_f32(v2416, v3141); - float32x2_t v2465 = vadd_f32(v2464, v1995); - float32x2_t v2477 = vadd_f32(v2476, v2139); - float32x2_t v2489 = vadd_f32(v2488, v2427); - float32x2_t v2507 = vadd_f32(v2506, v2283); + float32x2_t v2514 = vadd_f32(v2495, v2513); + float32x2_t v2515 = vsub_f32(v2495, v2513); + float32x2_t v2525 = vadd_f32(v2490, v2524); + float32x2_t v2540 = vsub_f32(v2539, v2508); float32x2_t v1859 = vsub_f32(v1845, v1858); float32x2_t v1872 = vsub_f32(v1871, v1867); float32x2_t v1876 = vmul_f32(v1845, v3141); @@ -15201,10 +13582,11 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v2435 = vsub_f32(v2421, v2434); float32x2_t 
v2448 = vsub_f32(v2447, v2443); float32x2_t v2452 = vmul_f32(v2421, v3141); - float32x2_t v2490 = vsub_f32(v2465, v2489); - float32x2_t v2494 = vmul_f32(v2465, v3141); - float32x2_t v2508 = vsub_f32(v2477, v2507); - float32x2_t v2512 = vmul_f32(v2477, v3141); + float32x2_t v2519 = vmul_f32(v2514, v3074); + float32x2_t v2529 = vmul_f32(v2515, v3084); + float32x2_t v2541 = vadd_f32(v1851, v2514); + float32x2_t v2552 = vrev64_f32(v2525); + float32x2_t v2565 = vrev64_f32(v2540); float32x2_t v2741 = vrev64_f32(v2011); float32x2_t v2753 = vrev64_f32(v2155); float32x2_t v2765 = vrev64_f32(v2443); @@ -15214,10 +13596,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v2165 = vsub_f32(v2164, v2147); float32x2_t v2309 = vsub_f32(v2308, v2291); float32x2_t v2453 = vsub_f32(v2452, v2435); - float32x2_t v2495 = vsub_f32(v2494, v2490); - float32x2_t v2513 = vsub_f32(v2512, v2508); - float32x2_t v2524 = vmul_f32(v2508, v3094); - float32x2_t v2539 = vmul_f32(v2490, v3094); + float32x2_t v2520 = vsub_f32(v1851, v2519); + v6[0] = v2541; + float32x2_t v2553 = vmul_f32(v2552, v3120); + float32x2_t v2566 = vmul_f32(v2565, v3120); float32x2_t v2602 = vrev64_f32(v2003); float32x2_t v2614 = vrev64_f32(v2147); float32x2_t v2626 = vrev64_f32(v2435); @@ -15230,10 +13612,8 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v2892 = vrev64_f32(v2160); float32x2_t v2904 = vrev64_f32(v2448); float32x2_t v2922 = vrev64_f32(v2304); - float32x2_t v2514 = vadd_f32(v2495, v2513); - float32x2_t v2515 = vsub_f32(v2495, v2513); - float32x2_t v2525 = vadd_f32(v2490, v2524); - float32x2_t v2540 = vsub_f32(v2539, v2508); + float32x2_t v2530 = vsub_f32(v2520, v2529); + float32x2_t v2534 = vmul_f32(v2520, v3141); float32x2_t v2603 = vmul_f32(v2602, v2601); float32x2_t v2615 = vmul_f32(v2614, v2740); float32x2_t v2627 = vmul_f32(v2626, v3018); @@ -15250,11 +13630,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const 
armral_cmplx_int16_t *restrict x, float32x2_t v3031 = vrev64_f32(v2165); float32x2_t v3043 = vrev64_f32(v2453); float32x2_t v3061 = vrev64_f32(v2309); - float32x2_t v2519 = vmul_f32(v2514, v3074); - float32x2_t v2529 = vmul_f32(v2515, v3084); - float32x2_t v2541 = vadd_f32(v1851, v2514); - float32x2_t v2552 = vrev64_f32(v2525); - float32x2_t v2565 = vrev64_f32(v2540); + float32x2_t v2535 = vsub_f32(v2534, v2530); + float32x2_t v2567 = vsub_f32(v2530, v2566); + float32x2_t v2576 = vmul_f32(v2530, v3141); float32x2_t v2604 = vfma_f32(v2603, v2003, v2595); float32x2_t v2616 = vfma_f32(v2615, v2147, v2734); float32x2_t v2628 = vfma_f32(v2627, v2435, v3012); @@ -15271,10 +13649,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v3032 = vmul_f32(v3031, v3030); float32x2_t v3044 = vmul_f32(v3043, v3042); float32x2_t v3062 = vmul_f32(v3061, v3060); - float32x2_t v2520 = vsub_f32(v1851, v2519); - v6[0] = v2541; - float32x2_t v2553 = vmul_f32(v2552, v3120); - float32x2_t v2566 = vmul_f32(v2565, v3120); + float32x2_t v2554 = vsub_f32(v2535, v2553); + v6[ostride * 10] = v2567; + float32x2_t v2577 = vsub_f32(v2576, v2567); + float32x2_t v2586 = vmul_f32(v2535, v3141); float32x2_t v2629 = vsub_f32(v2604, v2628); float32x2_t v2633 = vmul_f32(v2604, v3141); float32x2_t v2647 = vsub_f32(v2616, v2646); @@ -15291,8 +13669,9 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v3033 = vfma_f32(v3032, v2165, v3024); float32x2_t v3045 = vfma_f32(v3044, v2453, v3036); float32x2_t v3063 = vfma_f32(v3062, v2309, v3054); - float32x2_t v2530 = vsub_f32(v2520, v2529); - float32x2_t v2534 = vmul_f32(v2520, v3141); + v6[ostride * 5] = v2554; + v6[ostride * 15] = v2577; + float32x2_t v2587 = vsub_f32(v2586, v2554); float32x2_t v2634 = vsub_f32(v2633, v2629); float32x2_t v2652 = vsub_f32(v2651, v2647); float32x2_t v2663 = vmul_f32(v2647, v3094); @@ -15309,9 +13688,7 @@ void 
armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v3050 = vmul_f32(v3021, v3141); float32x2_t v3064 = vsub_f32(v3033, v3063); float32x2_t v3068 = vmul_f32(v3033, v3141); - float32x2_t v2535 = vsub_f32(v2534, v2530); - float32x2_t v2567 = vsub_f32(v2530, v2566); - float32x2_t v2576 = vmul_f32(v2530, v3141); + v6[ostride * 20] = v2587; float32x2_t v2653 = vadd_f32(v2634, v2652); float32x2_t v2654 = vsub_f32(v2634, v2652); float32x2_t v2664 = vadd_f32(v2629, v2663); @@ -15329,10 +13706,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v3069 = vsub_f32(v3068, v3064); float32x2_t v3080 = vmul_f32(v3064, v3094); float32x2_t v3095 = vmul_f32(v3046, v3094); - float32x2_t v2554 = vsub_f32(v2535, v2553); - v6[ostride * 10] = v2567; - float32x2_t v2577 = vsub_f32(v2576, v2567); - float32x2_t v2586 = vmul_f32(v2535, v3141); float32x2_t v2658 = vmul_f32(v2653, v3074); float32x2_t v2668 = vmul_f32(v2654, v3084); float32x2_t v2680 = vadd_f32(v1859, v2653); @@ -15351,9 +13724,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v3071 = vsub_f32(v3051, v3069); float32x2_t v3081 = vadd_f32(v3046, v3080); float32x2_t v3096 = vsub_f32(v3095, v3064); - v6[ostride * 5] = v2554; - v6[ostride * 15] = v2577; - float32x2_t v2587 = vsub_f32(v2586, v2554); float32x2_t v2659 = vsub_f32(v1859, v2658); v6[ostride] = v2680; float32x2_t v2692 = vmul_f32(v2691, v3120); @@ -15369,7 +13739,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float32x2_t v3097 = vadd_f32(v1877, v3070); float32x2_t v3108 = vrev64_f32(v3081); float32x2_t v3121 = vrev64_f32(v3096); - v6[ostride * 20] = v2587; float32x2_t v2669 = vsub_f32(v2659, v2668); float32x2_t v2673 = vmul_f32(v2659, v3141); float32x2_t v2813 = vsub_f32(v2812, v2808); @@ -15466,7 +13835,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, float v1570 = 
2.5000000000000000e-01F; float v1582 = 5.5901699437494745e-01F; float v1594 = 6.1803398874989490e-01F; - float v1622 = 0.0000000000000000e+00F; float v1623 = -9.5105651629515353e-01F; float v1651 = 2.0000000000000000e+00F; const int32_t *v1734 = &v5[v0]; @@ -15494,7 +13862,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, int64_t v703 = v0 * 14; int64_t v711 = v0 * 19; int64_t v719 = v0 * 24; - float v908 = v4 * v1622; int64_t v968 = v2 * 5; int64_t v983 = v2 * 10; int64_t v996 = v2 * 15; @@ -15529,6 +13896,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, int64_t v1644 = v2 * 19; int64_t v1657 = v2 * 24; const int32_t *v1670 = &v5[0]; + svfloat32_t v1992 = svdup_n_f32(0); float32x2_t *v2006 = &v6[0]; svfloat32_t v2049 = svdup_n_f32(v1016); svfloat32_t v2113 = svdup_n_f32(v1178); @@ -15570,7 +13938,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, const int32_t *v1944 = &v5[v703]; const int32_t *v1953 = &v5[v711]; const int32_t *v1962 = &v5[v719]; - svfloat32_t v1992 = svdup_n_f32(v908); float32x2_t *v2016 = &v6[v968]; float32x2_t *v2026 = &v6[v983]; float32x2_t *v2036 = &v6[v996]; @@ -15744,26 +14111,16 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, svfloat32_t v751 = svcmla_f32_x(pred_full, v709, v1992, v709, 90); svfloat32_t v764 = svcmla_f32_x(pred_full, v725, v1992, v725, 90); svfloat32_t v784 = svcmla_f32_x(pred_full, v717, v1992, v717, 90); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v70), "w"(v96)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v83), "w"(v116)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v237), "w"(v263)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v250), "w"(v283)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v404), "w"(v430)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v451) : "w"(v417), "w"(v450)); - svfloat32_t v598; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v571), "w"(v597)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v584), "w"(v617)); - svfloat32_t v765; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v738), "w"(v764)); - svfloat32_t v785; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v751), "w"(v784)); + svfloat32_t v97 = svsub_f32_x(svptrue_b32(), v70, v96); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v83, v116); + svfloat32_t v264 = svsub_f32_x(svptrue_b32(), v237, v263); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v250, v283); + svfloat32_t v431 = svsub_f32_x(svptrue_b32(), v404, v430); + svfloat32_t v451 = svsub_f32_x(svptrue_b32(), v417, v450); + svfloat32_t v598 = svsub_f32_x(svptrue_b32(), v571, v597); + svfloat32_t v618 = svsub_f32_x(svptrue_b32(), v584, v617); + svfloat32_t v765 = svsub_f32_x(svptrue_b32(), v738, v764); + svfloat32_t v785 = svsub_f32_x(svptrue_b32(), v751, v784); svfloat32_t v103 = svnmls_f32_x(pred_full, v97, v70, v2295); svfloat32_t v123 = svnmls_f32_x(pred_full, v117, v83, v2295); svfloat32_t v270 = svnmls_f32_x(pred_full, v264, v237, v2295); @@ -15774,75 +14131,50 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, svfloat32_t v624 = svnmls_f32_x(pred_full, v618, v584, v2295); svfloat32_t v771 = svnmls_f32_x(pred_full, v765, v738, v2295); svfloat32_t v791 = svnmls_f32_x(pred_full, v785, v751, v2295); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v103), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v103), "w"(v123)); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v103, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v103, v123); svfloat32_t v137 = svmla_f32_x(pred_full, v97, v117, v2255); svfloat32_t v155 = svnmls_f32_x(pred_full, v117, v97, v2255); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v270), "w"(v290)); - 
svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v270), "w"(v290)); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v270, v290); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v270, v290); svfloat32_t v304 = svmla_f32_x(pred_full, v264, v284, v2255); svfloat32_t v322 = svnmls_f32_x(pred_full, v284, v264, v2255); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v437), "w"(v457)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v437), "w"(v457)); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v437, v457); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v437, v457); svfloat32_t v471 = svmla_f32_x(pred_full, v431, v451, v2255); svfloat32_t v489 = svnmls_f32_x(pred_full, v451, v431, v2255); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v604), "w"(v624)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v604), "w"(v624)); + svfloat32_t v625 = svadd_f32_x(svptrue_b32(), v604, v624); + svfloat32_t v626 = svsub_f32_x(svptrue_b32(), v604, v624); svfloat32_t v638 = svmla_f32_x(pred_full, v598, v618, v2255); svfloat32_t v656 = svnmls_f32_x(pred_full, v618, v598, v2255); - svfloat32_t v792; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v792) : "w"(v771), "w"(v791)); - svfloat32_t v793; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v793) : "w"(v771), "w"(v791)); + svfloat32_t v792 = svadd_f32_x(svptrue_b32(), v771, v791); + svfloat32_t v793 = svsub_f32_x(svptrue_b32(), v771, v791); svfloat32_t v805 = svmla_f32_x(pred_full, v765, v785, v2255); svfloat32_t v823 = svnmls_f32_x(pred_full, v785, v765, v2255); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v25), "w"(v124)); - svfloat32_t zero163; - asm volatile("mov %0.s, #0" : "=w"(zero163)); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v25, v124); + svfloat32_t zero163 = svdup_n_f32(0); svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v2275, v137, 90); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : 
"=w"(zero171)); + svfloat32_t zero171 = svdup_n_f32(0); svfloat32_t v171 = svcmla_f32_x(pred_full, zero171, v2275, v155, 90); - svfloat32_t v323; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v192), "w"(v291)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); + svfloat32_t v323 = svadd_f32_x(svptrue_b32(), v192, v291); + svfloat32_t zero330 = svdup_n_f32(0); svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v2275, v304, 90); - svfloat32_t zero338; - asm volatile("mov %0.s, #0" : "=w"(zero338)); + svfloat32_t zero338 = svdup_n_f32(0); svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v2275, v322, 90); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v359), "w"(v458)); - svfloat32_t zero497; - asm volatile("mov %0.s, #0" : "=w"(zero497)); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v359, v458); + svfloat32_t zero497 = svdup_n_f32(0); svfloat32_t v497 = svcmla_f32_x(pred_full, zero497, v2275, v471, 90); - svfloat32_t zero505; - asm volatile("mov %0.s, #0" : "=w"(zero505)); + svfloat32_t zero505 = svdup_n_f32(0); svfloat32_t v505 = svcmla_f32_x(pred_full, zero505, v2275, v489, 90); - svfloat32_t v657; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v657) : "w"(v526), "w"(v625)); - svfloat32_t zero664; - asm volatile("mov %0.s, #0" : "=w"(zero664)); + svfloat32_t v657 = svadd_f32_x(svptrue_b32(), v526, v625); + svfloat32_t zero664 = svdup_n_f32(0); svfloat32_t v664 = svcmla_f32_x(pred_full, zero664, v2275, v638, 90); - svfloat32_t zero672; - asm volatile("mov %0.s, #0" : "=w"(zero672)); + svfloat32_t zero672 = svdup_n_f32(0); svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2275, v656, 90); - svfloat32_t v824; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v693), "w"(v792)); - svfloat32_t zero831; - asm volatile("mov %0.s, #0" : "=w"(zero831)); + svfloat32_t v824 = svadd_f32_x(svptrue_b32(), v693, v792); + svfloat32_t zero831 = svdup_n_f32(0); svfloat32_t v831 = svcmla_f32_x(pred_full, zero831, v2275, v805, 90); - 
svfloat32_t zero839; - asm volatile("mov %0.s, #0" : "=w"(zero839)); + svfloat32_t zero839 = svdup_n_f32(0); svfloat32_t v839 = svcmla_f32_x(pred_full, zero839, v2275, v823, 90); svfloat32_t v131 = svmls_f32_x(pred_full, v25, v124, v2251); svfloat32_t v298 = svmls_f32_x(pred_full, v192, v291, v2251); @@ -15859,119 +14191,80 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, svfloat32_t v891 = svcmla_f32_x(pred_full, v824, v1992, v824, 90); svfloat32_t v911 = svcmla_f32_x(pred_full, v657, v1992, v657, 90); svfloat32_t v149 = svnmls_f32_x(pred_full, v143, v131, v2295); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v143), "w"(v171)); + svfloat32_t v172 = svsub_f32_x(svptrue_b32(), v143, v171); svfloat32_t v316 = svnmls_f32_x(pred_full, v310, v298, v2295); - svfloat32_t v339; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v310), "w"(v338)); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v310, v338); svfloat32_t v483 = svnmls_f32_x(pred_full, v477, v465, v2295); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v477), "w"(v505)); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v477, v505); svfloat32_t v650 = svnmls_f32_x(pred_full, v644, v632, v2295); - svfloat32_t v673; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v644), "w"(v672)); + svfloat32_t v673 = svsub_f32_x(svptrue_b32(), v644, v672); svfloat32_t v817 = svnmls_f32_x(pred_full, v811, v799, v2295); - svfloat32_t v840; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v811), "w"(v839)); - svfloat32_t v892; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v892) : "w"(v865), "w"(v891)); - svfloat32_t v912; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v912) : "w"(v878), "w"(v911)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v149), "w"(v163)); + svfloat32_t v840 = svsub_f32_x(svptrue_b32(), v811, v839); + svfloat32_t v892 = svsub_f32_x(svptrue_b32(), v865, v891); + svfloat32_t v912 = svsub_f32_x(svptrue_b32(), v878, v911); + 
svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v149, v163); svfloat32_t v178 = svnmls_f32_x(pred_full, v172, v143, v2295); - svfloat32_t v331; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v316), "w"(v330)); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v316, v330); svfloat32_t v345 = svnmls_f32_x(pred_full, v339, v310, v2295); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v483), "w"(v497)); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v483, v497); svfloat32_t v512 = svnmls_f32_x(pred_full, v506, v477, v2295); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v650), "w"(v664)); + svfloat32_t v665 = svsub_f32_x(svptrue_b32(), v650, v664); svfloat32_t v679 = svnmls_f32_x(pred_full, v673, v644, v2295); - svfloat32_t v832; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v832) : "w"(v817), "w"(v831)); + svfloat32_t v832 = svsub_f32_x(svptrue_b32(), v817, v831); svfloat32_t v846 = svnmls_f32_x(pred_full, v840, v811, v2295); svfloat32_t v898 = svnmls_f32_x(pred_full, v892, v865, v2295); svfloat32_t v918 = svnmls_f32_x(pred_full, v912, v878, v2295); - svfloat32_t v1181; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1181) : "w"(v339), "w"(v2113)); - svfloat32_t v1194; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1194) : "w"(v506), "w"(v2241)); - svfloat32_t v1207; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1207) : "w"(v840), "w"(v2243)); - svfloat32_t v1227; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1227) : "w"(v673), "w"(v2179)); + svfloat32_t v1181 = svmul_f32_x(svptrue_b32(), v339, v2113); + svfloat32_t v1194 = svmul_f32_x(svptrue_b32(), v506, v2241); + svfloat32_t v1207 = svmul_f32_x(svptrue_b32(), v840, v2243); + svfloat32_t v1227 = svmul_f32_x(svptrue_b32(), v673, v2179); svfloat32_t v184 = svnmls_f32_x(pred_full, v164, v149, v2295); svfloat32_t v351 = svnmls_f32_x(pred_full, v331, v316, v2295); svfloat32_t v518 = svnmls_f32_x(pred_full, v498, v483, v2295); svfloat32_t v685 = svnmls_f32_x(pred_full, v665, v650, v2295); svfloat32_t v852 = 
svnmls_f32_x(pred_full, v832, v817, v2295); - svfloat32_t v919; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v919) : "w"(v898), "w"(v918)); - svfloat32_t v920; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v920) : "w"(v898), "w"(v918)); + svfloat32_t v919 = svadd_f32_x(svptrue_b32(), v898, v918); + svfloat32_t v920 = svsub_f32_x(svptrue_b32(), v898, v918); svfloat32_t v932 = svmla_f32_x(pred_full, v892, v912, v2255); svfloat32_t v950 = svnmls_f32_x(pred_full, v912, v892, v2255); - svfloat32_t v1019; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1019) : "w"(v331), "w"(v2049)); - svfloat32_t v1032; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1032) : "w"(v498), "w"(v2113)); - svfloat32_t v1045; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1045) : "w"(v832), "w"(v2241)); - svfloat32_t v1065; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1065) : "w"(v665), "w"(v2177)); + svfloat32_t v1019 = svmul_f32_x(svptrue_b32(), v331, v2049); + svfloat32_t v1032 = svmul_f32_x(svptrue_b32(), v498, v2113); + svfloat32_t v1045 = svmul_f32_x(svptrue_b32(), v832, v2241); + svfloat32_t v1065 = svmul_f32_x(svptrue_b32(), v665, v2177); svfloat32_t v1189 = svcmla_f32_x(pred_full, v1181, v2114, v339, 90); svfloat32_t v1202 = svcmla_f32_x(pred_full, v1194, v2242, v506, 90); svfloat32_t v1215 = svcmla_f32_x(pred_full, v1207, v2244, v840, 90); svfloat32_t v1235 = svcmla_f32_x(pred_full, v1227, v2180, v673, 90); - svfloat32_t v1343; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1343) : "w"(v345), "w"(v2177)); - svfloat32_t v1356; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1356) : "w"(v512), "w"(v2179)); - svfloat32_t v1369; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1369) : "w"(v846), "w"(v2248)); - svfloat32_t v1389; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1389) : "w"(v679), "w"(v2245)); - svfloat32_t v951; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v951) : "w"(v156), "w"(v919)); - svfloat32_t zero965; - asm volatile("mov %0.s, #0" : "=w"(zero965)); + svfloat32_t v1343 = svmul_f32_x(svptrue_b32(), v345, v2177); + svfloat32_t v1356 = svmul_f32_x(svptrue_b32(), v512, 
v2179); + svfloat32_t v1369 = svmul_f32_x(svptrue_b32(), v846, v2248); + svfloat32_t v1389 = svmul_f32_x(svptrue_b32(), v679, v2245); + svfloat32_t v951 = svadd_f32_x(svptrue_b32(), v156, v919); + svfloat32_t zero965 = svdup_n_f32(0); svfloat32_t v965 = svcmla_f32_x(pred_full, zero965, v2275, v932, 90); - svfloat32_t zero980; - asm volatile("mov %0.s, #0" : "=w"(zero980)); + svfloat32_t zero980 = svdup_n_f32(0); svfloat32_t v980 = svcmla_f32_x(pred_full, zero980, v2275, v950, 90); svfloat32_t v1027 = svcmla_f32_x(pred_full, v1019, v2050, v331, 90); svfloat32_t v1040 = svcmla_f32_x(pred_full, v1032, v2114, v498, 90); svfloat32_t v1053 = svcmla_f32_x(pred_full, v1045, v2242, v832, 90); svfloat32_t v1073 = svcmla_f32_x(pred_full, v1065, v2178, v665, 90); - svfloat32_t v1216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1216) : "w"(v1189), "w"(v1215)); - svfloat32_t v1236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1236) : "w"(v1202), "w"(v1235)); + svfloat32_t v1216 = svsub_f32_x(svptrue_b32(), v1189, v1215); + svfloat32_t v1236 = svsub_f32_x(svptrue_b32(), v1202, v1235); svfloat32_t v1351 = svcmla_f32_x(pred_full, v1343, v2178, v345, 90); svfloat32_t v1364 = svcmla_f32_x(pred_full, v1356, v2180, v512, 90); svfloat32_t v1377 = svcmla_f32_x(pred_full, v1369, v2249, v846, 90); svfloat32_t v1397 = svcmla_f32_x(pred_full, v1389, v2185, v679, 90); - svfloat32_t v1505; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1505) : "w"(v351), "w"(v2241)); - svfloat32_t v1518; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1518) : "w"(v518), "w"(v2243)); - svfloat32_t v1531; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1531) : "w"(v852), "w"(v2245)); - svfloat32_t v1551; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1551) : "w"(v685), "w"(v2248)); + svfloat32_t v1505 = svmul_f32_x(svptrue_b32(), v351, v2241); + svfloat32_t v1518 = svmul_f32_x(svptrue_b32(), v518, v2243); + svfloat32_t v1531 = svmul_f32_x(svptrue_b32(), v852, v2245); + svfloat32_t v1551 = svmul_f32_x(svptrue_b32(), v685, v2248); svfloat32_t v926 = 
svmls_f32_x(pred_full, v156, v919, v2251); - svfloat32_t v1054; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1054) : "w"(v1027), "w"(v1053)); - svfloat32_t v1074; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1074) : "w"(v1040), "w"(v1073)); + svfloat32_t v1054 = svsub_f32_x(svptrue_b32(), v1027, v1053); + svfloat32_t v1074 = svsub_f32_x(svptrue_b32(), v1040, v1073); svfloat32_t v1222 = svnmls_f32_x(pred_full, v1216, v1189, v2295); svfloat32_t v1242 = svnmls_f32_x(pred_full, v1236, v1202, v2295); - svfloat32_t v1378; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1378) : "w"(v1351), "w"(v1377)); - svfloat32_t v1398; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1398) : "w"(v1364), "w"(v1397)); + svfloat32_t v1378 = svsub_f32_x(svptrue_b32(), v1351, v1377); + svfloat32_t v1398 = svsub_f32_x(svptrue_b32(), v1364, v1397); svfloat32_t v1513 = svcmla_f32_x(pred_full, v1505, v2242, v351, 90); svfloat32_t v1526 = svcmla_f32_x(pred_full, v1518, v2244, v518, 90); svfloat32_t v1539 = svcmla_f32_x(pred_full, v1531, v2246, v852, 90); @@ -15980,67 +14273,46 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, svfloat32_t v938 = svmls_f32_x(pred_full, v926, v920, v2253); svfloat32_t v1060 = svnmls_f32_x(pred_full, v1054, v1027, v2295); svfloat32_t v1080 = svnmls_f32_x(pred_full, v1074, v1040, v2295); - svfloat32_t v1243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1243) : "w"(v1222), "w"(v1242)); - svfloat32_t v1244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1244) : "w"(v1222), "w"(v1242)); + svfloat32_t v1243 = svadd_f32_x(svptrue_b32(), v1222, v1242); + svfloat32_t v1244 = svsub_f32_x(svptrue_b32(), v1222, v1242); svfloat32_t v1256 = svmla_f32_x(pred_full, v1216, v1236, v2255); svfloat32_t v1274 = svnmls_f32_x(pred_full, v1236, v1216, v2255); svfloat32_t v1384 = svnmls_f32_x(pred_full, v1378, v1351, v2295); svfloat32_t v1404 = svnmls_f32_x(pred_full, v1398, v1364, v2295); - svfloat32_t v1540; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1540) : "w"(v1513), "w"(v1539)); - svfloat32_t v1560; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v1560) : "w"(v1526), "w"(v1559)); + svfloat32_t v1540 = svsub_f32_x(svptrue_b32(), v1513, v1539); + svfloat32_t v1560 = svsub_f32_x(svptrue_b32(), v1526, v1559); svfloat32_t v944 = svnmls_f32_x(pred_full, v938, v926, v2295); - svfloat32_t v981; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v981) : "w"(v938), "w"(v980)); - svfloat32_t v1081; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1081) : "w"(v1060), "w"(v1080)); - svfloat32_t v1082; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1082) : "w"(v1060), "w"(v1080)); + svfloat32_t v981 = svsub_f32_x(svptrue_b32(), v938, v980); + svfloat32_t v1081 = svadd_f32_x(svptrue_b32(), v1060, v1080); + svfloat32_t v1082 = svsub_f32_x(svptrue_b32(), v1060, v1080); svfloat32_t v1094 = svmla_f32_x(pred_full, v1054, v1074, v2255); svfloat32_t v1112 = svnmls_f32_x(pred_full, v1074, v1054, v2255); - svfloat32_t v1275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1275) : "w"(v172), "w"(v1243)); - svfloat32_t zero1289; - asm volatile("mov %0.s, #0" : "=w"(zero1289)); + svfloat32_t v1275 = svadd_f32_x(svptrue_b32(), v172, v1243); + svfloat32_t zero1289 = svdup_n_f32(0); svfloat32_t v1289 = svcmla_f32_x(pred_full, zero1289, v2275, v1256, 90); - svfloat32_t zero1304; - asm volatile("mov %0.s, #0" : "=w"(zero1304)); + svfloat32_t zero1304 = svdup_n_f32(0); svfloat32_t v1304 = svcmla_f32_x(pred_full, zero1304, v2275, v1274, 90); - svfloat32_t v1405; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1405) : "w"(v1384), "w"(v1404)); - svfloat32_t v1406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1406) : "w"(v1384), "w"(v1404)); + svfloat32_t v1405 = svadd_f32_x(svptrue_b32(), v1384, v1404); + svfloat32_t v1406 = svsub_f32_x(svptrue_b32(), v1384, v1404); svfloat32_t v1418 = svmla_f32_x(pred_full, v1378, v1398, v2255); svfloat32_t v1436 = svnmls_f32_x(pred_full, v1398, v1378, v2255); svfloat32_t v1546 = svnmls_f32_x(pred_full, v1540, v1513, v2295); svfloat32_t v1566 = svnmls_f32_x(pred_full, v1560, v1526, v2295); - svfloat32_t v966; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v966) : "w"(v944), "w"(v965)); + svfloat32_t v966 = svsub_f32_x(svptrue_b32(), v944, v965); svfloat32_t v994 = svnmls_f32_x(pred_full, v981, v938, v2295); - svfloat32_t v1113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1113) : "w"(v164), "w"(v1081)); - svfloat32_t zero1127; - asm volatile("mov %0.s, #0" : "=w"(zero1127)); + svfloat32_t v1113 = svadd_f32_x(svptrue_b32(), v164, v1081); + svfloat32_t zero1127 = svdup_n_f32(0); svfloat32_t v1127 = svcmla_f32_x(pred_full, zero1127, v2275, v1094, 90); - svfloat32_t zero1142; - asm volatile("mov %0.s, #0" : "=w"(zero1142)); + svfloat32_t zero1142 = svdup_n_f32(0); svfloat32_t v1142 = svcmla_f32_x(pred_full, zero1142, v2275, v1112, 90); svfloat32_t v1250 = svmls_f32_x(pred_full, v172, v1243, v2251); - svfloat32_t v1437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1437) : "w"(v178), "w"(v1405)); - svfloat32_t zero1451; - asm volatile("mov %0.s, #0" : "=w"(zero1451)); + svfloat32_t v1437 = svadd_f32_x(svptrue_b32(), v178, v1405); + svfloat32_t zero1451 = svdup_n_f32(0); svfloat32_t v1451 = svcmla_f32_x(pred_full, zero1451, v2275, v1418, 90); - svfloat32_t zero1466; - asm volatile("mov %0.s, #0" : "=w"(zero1466)); + svfloat32_t zero1466 = svdup_n_f32(0); svfloat32_t v1466 = svcmla_f32_x(pred_full, zero1466, v2275, v1436, 90); - svfloat32_t v1567; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1567) : "w"(v1546), "w"(v1566)); - svfloat32_t v1568; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1568) : "w"(v1546), "w"(v1566)); + svfloat32_t v1567 = svadd_f32_x(svptrue_b32(), v1546, v1566); + svfloat32_t v1568 = svsub_f32_x(svptrue_b32(), v1546, v1566); svfloat32_t v1580 = svmla_f32_x(pred_full, v1540, v1560, v2255); svfloat32_t v1598 = svnmls_f32_x(pred_full, v1560, v1540, v2255); svst1_f64(pred_full, (double *)(v2026), svreinterpret_f64_f32(v981)); @@ -16049,13 +14321,10 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, svfloat32_t v1088 = svmls_f32_x(pred_full, v164, v1081, v2251); svfloat32_t v1262 = 
svmls_f32_x(pred_full, v1250, v1244, v2253); svfloat32_t v1412 = svmls_f32_x(pred_full, v178, v1405, v2251); - svfloat32_t v1599; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1599) : "w"(v184), "w"(v1567)); - svfloat32_t zero1613; - asm volatile("mov %0.s, #0" : "=w"(zero1613)); + svfloat32_t v1599 = svadd_f32_x(svptrue_b32(), v184, v1567); + svfloat32_t zero1613 = svdup_n_f32(0); svfloat32_t v1613 = svcmla_f32_x(pred_full, zero1613, v2275, v1580, 90); - svfloat32_t zero1628; - asm volatile("mov %0.s, #0" : "=w"(zero1628)); + svfloat32_t zero1628 = svdup_n_f32(0); svfloat32_t v1628 = svcmla_f32_x(pred_full, zero1628, v2275, v1598, 90); svst1_f64(pred_full, (double *)(v2016), svreinterpret_f64_f32(v966)); svst1_f64(pred_full, (double *)(v2036), svreinterpret_f64_f32(v994)); @@ -16063,41 +14332,33 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu25(const armral_cmplx_int16_t *restrict x, svst1_f64(pred_full, (double *)(v2198), svreinterpret_f64_f32(v1437)); svfloat32_t v1100 = svmls_f32_x(pred_full, v1088, v1082, v2253); svfloat32_t v1268 = svnmls_f32_x(pred_full, v1262, v1250, v2295); - svfloat32_t v1305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1305) : "w"(v1262), "w"(v1304)); + svfloat32_t v1305 = svsub_f32_x(svptrue_b32(), v1262, v1304); svfloat32_t v1424 = svmls_f32_x(pred_full, v1412, v1406, v2253); svfloat32_t v1574 = svmls_f32_x(pred_full, v184, v1567, v2251); svst1_f64(pred_full, (double *)(v2046), svreinterpret_f64_f32(v1007)); svst1_f64(pred_full, (double *)(v2262), svreinterpret_f64_f32(v1599)); svfloat32_t v1106 = svnmls_f32_x(pred_full, v1100, v1088, v2295); - svfloat32_t v1143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1143) : "w"(v1100), "w"(v1142)); - svfloat32_t v1290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1290) : "w"(v1268), "w"(v1289)); + svfloat32_t v1143 = svsub_f32_x(svptrue_b32(), v1100, v1142); + svfloat32_t v1290 = svsub_f32_x(svptrue_b32(), v1268, v1289); svfloat32_t v1318 = svnmls_f32_x(pred_full, v1305, v1262, v2295); svfloat32_t v1430 = 
svnmls_f32_x(pred_full, v1424, v1412, v2295); - svfloat32_t v1467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1467) : "w"(v1424), "w"(v1466)); + svfloat32_t v1467 = svsub_f32_x(svptrue_b32(), v1424, v1466); svfloat32_t v1586 = svmls_f32_x(pred_full, v1574, v1568, v2253); svst1_f64(pred_full, (double *)(v2154), svreinterpret_f64_f32(v1305)); - svfloat32_t v1128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1128) : "w"(v1106), "w"(v1127)); + svfloat32_t v1128 = svsub_f32_x(svptrue_b32(), v1106, v1127); svfloat32_t v1156 = svnmls_f32_x(pred_full, v1143, v1100, v2295); svfloat32_t v1331 = svnmls_f32_x(pred_full, v1290, v1268, v2295); - svfloat32_t v1452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1452) : "w"(v1430), "w"(v1451)); + svfloat32_t v1452 = svsub_f32_x(svptrue_b32(), v1430, v1451); svfloat32_t v1480 = svnmls_f32_x(pred_full, v1467, v1424, v2295); svfloat32_t v1592 = svnmls_f32_x(pred_full, v1586, v1574, v2295); - svfloat32_t v1629; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1629) : "w"(v1586), "w"(v1628)); + svfloat32_t v1629 = svsub_f32_x(svptrue_b32(), v1586, v1628); svst1_f64(pred_full, (double *)(v2090), svreinterpret_f64_f32(v1143)); svst1_f64(pred_full, (double *)(v2144), svreinterpret_f64_f32(v1290)); svst1_f64(pred_full, (double *)(v2164), svreinterpret_f64_f32(v1318)); svst1_f64(pred_full, (double *)(v2218), svreinterpret_f64_f32(v1467)); svfloat32_t v1169 = svnmls_f32_x(pred_full, v1128, v1106, v2295); svfloat32_t v1493 = svnmls_f32_x(pred_full, v1452, v1430, v2295); - svfloat32_t v1614; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1614) : "w"(v1592), "w"(v1613)); + svfloat32_t v1614 = svsub_f32_x(svptrue_b32(), v1592, v1613); svfloat32_t v1642 = svnmls_f32_x(pred_full, v1629, v1586, v2295); svst1_f64(pred_full, (double *)(v2080), svreinterpret_f64_f32(v1128)); svst1_f64(pred_full, (double *)(v2100), svreinterpret_f64_f32(v1156)); @@ -17133,7 +15394,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float v1096 = 9.8078528040323043e-01F; float 
v1103 = -5.5557023301960218e-01F; float v1108 = -8.3146961230254524e-01F; - float v1119 = 1.0000000000000000e+00F; const int32_t *v1333 = &v5[v0]; float32x2_t *v1534 = &v6[v2]; int64_t v27 = v0 * 16; @@ -17201,7 +15461,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, int64_t v1084 = v2 * 30; float v1099 = v4 * v1096; float v1111 = v4 * v1108; - float v1122 = v4 * v1119; int64_t v1130 = v2 * 7; int64_t v1137 = v2 * 15; int64_t v1144 = v2 * 23; @@ -17220,6 +15479,7 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, svfloat32_t v1730 = svdup_n_f32(v1036); svfloat32_t v1769 = svdup_n_f32(v1091); svfloat32_t v1771 = svdup_n_f32(v1103); + svfloat32_t v1773 = svdup_n_f32(v4); svfloat32_t v384 = svmul_n_f32_x( pred_full, svcvt_f32_s32_x(pred_full, @@ -17290,7 +15550,6 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, float32x2_t *v1766 = &v6[v1084]; svfloat32_t v1770 = svdup_n_f32(v1099); svfloat32_t v1772 = svdup_n_f32(v1111); - svfloat32_t v1773 = svdup_n_f32(v1122); float32x2_t *v1780 = &v6[v1130]; float32x2_t *v1789 = &v6[v1137]; float32x2_t *v1798 = &v6[v1144]; @@ -17450,293 +15709,162 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, svcvt_f32_s32_x(pred_full, svld1sh_s32(pred_full, (const int16_t *)&v1476[0])), 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v81; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v72), "w"(v80)); - svfloat32_t v82; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v72), "w"(v80)); - svfloat32_t v99; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v90), "w"(v98)); - svfloat32_t v100; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v90), "w"(v98)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v160), "w"(v168)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v160), "w"(v168)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v178), "w"(v186)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v178), "w"(v186)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v207), "w"(v215)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v207), "w"(v215)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v225), "w"(v233)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v225), "w"(v233)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v384), "w"(v392)); - svfloat32_t v394; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v384), "w"(v392)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v402), "w"(v410)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v402), "w"(v410)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v431), "w"(v439)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v431), "w"(v439)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v449), "w"(v457)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v449), "w"(v457)); - svfloat32_t v528; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v519), "w"(v527)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v519), "w"(v527)); - svfloat32_t v546; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v537), "w"(v545)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v537), "w"(v545)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v566), "w"(v574)); - svfloat32_t v576; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v566), 
"w"(v574)); - svfloat32_t v593; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v584), "w"(v592)); - svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v584), "w"(v592)); - svfloat32_t zero60; - asm volatile("mov %0.s, #0" : "=w"(zero60)); + svfloat32_t v34 = svadd_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v35 = svsub_f32_x(svptrue_b32(), v25, v33); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v53 = svsub_f32_x(svptrue_b32(), v43, v51); + svfloat32_t v81 = svadd_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v82 = svsub_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v90, v98); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v90, v98); + svfloat32_t v169 = svadd_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v170 = svsub_f32_x(svptrue_b32(), v160, v168); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v178, v186); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v178, v186); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v207, v215); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v207, v215); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v225, v233); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v225, v233); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v384, v392); + svfloat32_t v394 = svsub_f32_x(svptrue_b32(), v384, v392); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v402, v410); + svfloat32_t v412 = svsub_f32_x(svptrue_b32(), v402, v410); + svfloat32_t v440 = svadd_f32_x(svptrue_b32(), v431, v439); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v431, v439); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v449, v457); + svfloat32_t v459 = svsub_f32_x(svptrue_b32(), v449, v457); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v519, v527); + svfloat32_t v529 = svsub_f32_x(svptrue_b32(), v519, v527); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v537, v545); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v537, v545); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v566, 
v574); + svfloat32_t v576 = svsub_f32_x(svptrue_b32(), v566, v574); + svfloat32_t v593 = svadd_f32_x(svptrue_b32(), v584, v592); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v584, v592); + svfloat32_t zero60 = svdup_n_f32(0); svfloat32_t v60 = svcmla_f32_x(pred_full, zero60, v1649, v53, 90); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v34), "w"(v52)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v34), "w"(v52)); - svfloat32_t v101; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v81), "w"(v99)); - svfloat32_t v102; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v81), "w"(v99)); - svfloat32_t v118; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v82), "w"(v1646)); - svfloat32_t v130; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v100), "w"(v1648)); - svfloat32_t zero195; - asm volatile("mov %0.s, #0" : "=w"(zero195)); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v62 = svsub_f32_x(svptrue_b32(), v34, v52); + svfloat32_t v101 = svadd_f32_x(svptrue_b32(), v81, v99); + svfloat32_t v102 = svsub_f32_x(svptrue_b32(), v81, v99); + svfloat32_t v118 = svmul_f32_x(svptrue_b32(), v82, v1646); + svfloat32_t v130 = svmul_f32_x(svptrue_b32(), v100, v1648); + svfloat32_t zero195 = svdup_n_f32(0); svfloat32_t v195 = svcmla_f32_x(pred_full, zero195, v1649, v188, 90); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v169), "w"(v187)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v169), "w"(v187)); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); + svfloat32_t v196 = svadd_f32_x(svptrue_b32(), v169, v187); + svfloat32_t v197 = svsub_f32_x(svptrue_b32(), v169, v187); + svfloat32_t zero242 = svdup_n_f32(0); svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v1649, v235, 90); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v216), "w"(v234)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v216), 
"w"(v234)); - svfloat32_t zero419; - asm volatile("mov %0.s, #0" : "=w"(zero419)); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v216, v234); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v216, v234); + svfloat32_t zero419 = svdup_n_f32(0); svfloat32_t v419 = svcmla_f32_x(pred_full, zero419, v1649, v412, 90); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v393), "w"(v411)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v393), "w"(v411)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v440), "w"(v458)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v440), "w"(v458)); - svfloat32_t v477; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v441), "w"(v1646)); - svfloat32_t v489; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v459), "w"(v1648)); - svfloat32_t zero554; - asm volatile("mov %0.s, #0" : "=w"(zero554)); + svfloat32_t v420 = svadd_f32_x(svptrue_b32(), v393, v411); + svfloat32_t v421 = svsub_f32_x(svptrue_b32(), v393, v411); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v440, v458); + svfloat32_t v461 = svsub_f32_x(svptrue_b32(), v440, v458); + svfloat32_t v477 = svmul_f32_x(svptrue_b32(), v441, v1646); + svfloat32_t v489 = svmul_f32_x(svptrue_b32(), v459, v1648); + svfloat32_t zero554 = svdup_n_f32(0); svfloat32_t v554 = svcmla_f32_x(pred_full, zero554, v1649, v547, 90); - svfloat32_t v555; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v528), "w"(v546)); - svfloat32_t v556; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v528), "w"(v546)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v575), "w"(v593)); - svfloat32_t v596; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v575), "w"(v593)); - svfloat32_t v612; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v576), "w"(v1646)); - svfloat32_t v624; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v594), "w"(v1648)); - svfloat32_t v63; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v63) : "w"(v35), 
"w"(v60)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v35), "w"(v60)); - svfloat32_t zero109; - asm volatile("mov %0.s, #0" : "=w"(zero109)); + svfloat32_t v555 = svadd_f32_x(svptrue_b32(), v528, v546); + svfloat32_t v556 = svsub_f32_x(svptrue_b32(), v528, v546); + svfloat32_t v595 = svadd_f32_x(svptrue_b32(), v575, v593); + svfloat32_t v596 = svsub_f32_x(svptrue_b32(), v575, v593); + svfloat32_t v612 = svmul_f32_x(svptrue_b32(), v576, v1646); + svfloat32_t v624 = svmul_f32_x(svptrue_b32(), v594, v1648); + svfloat32_t v63 = svsub_f32_x(svptrue_b32(), v35, v60); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v35, v60); + svfloat32_t zero109 = svdup_n_f32(0); svfloat32_t v109 = svcmla_f32_x(pred_full, zero109, v1649, v102, 90); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v61), "w"(v101)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v61), "w"(v101)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v170), "w"(v195)); - svfloat32_t v199; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v170), "w"(v195)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v217), "w"(v242)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v217), "w"(v242)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v196), "w"(v243)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v196), "w"(v243)); - svfloat32_t v303; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v197), "w"(v1646)); - svfloat32_t v315; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v244), "w"(v1648)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v394), "w"(v419)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v394), "w"(v419)); - svfloat32_t zero468; - asm volatile("mov %0.s, #0" : "=w"(zero468)); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v61, v101); + svfloat32_t v111 = svsub_f32_x(svptrue_b32(), v61, 
v101); + svfloat32_t v198 = svsub_f32_x(svptrue_b32(), v170, v195); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v170, v195); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v217, v242); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v217, v242); + svfloat32_t v247 = svadd_f32_x(svptrue_b32(), v196, v243); + svfloat32_t v248 = svsub_f32_x(svptrue_b32(), v196, v243); + svfloat32_t v303 = svmul_f32_x(svptrue_b32(), v197, v1646); + svfloat32_t v315 = svmul_f32_x(svptrue_b32(), v244, v1648); + svfloat32_t v422 = svsub_f32_x(svptrue_b32(), v394, v419); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v394, v419); + svfloat32_t zero468 = svdup_n_f32(0); svfloat32_t v468 = svcmla_f32_x(pred_full, zero468, v1649, v461, 90); - svfloat32_t v469; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v420), "w"(v460)); - svfloat32_t v470; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v420), "w"(v460)); - svfloat32_t v557; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v529), "w"(v554)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v529), "w"(v554)); - svfloat32_t zero603; - asm volatile("mov %0.s, #0" : "=w"(zero603)); + svfloat32_t v469 = svadd_f32_x(svptrue_b32(), v420, v460); + svfloat32_t v470 = svsub_f32_x(svptrue_b32(), v420, v460); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v529, v554); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v529, v554); + svfloat32_t zero603 = svdup_n_f32(0); svfloat32_t v603 = svcmla_f32_x(pred_full, zero603, v1649, v596, 90); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v555), "w"(v595)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v555), "w"(v595)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v62), "w"(v109)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v62), "w"(v109)); + svfloat32_t v604 = svadd_f32_x(svptrue_b32(), v555, v595); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v555, v595); + svfloat32_t v112 = 
svsub_f32_x(svptrue_b32(), v62, v109); + svfloat32_t v113 = svadd_f32_x(svptrue_b32(), v62, v109); svfloat32_t v138 = svcmla_f32_x(pred_full, v118, v1773, v118, 90); svfloat32_t v139 = svcmla_f32_x(pred_full, v130, v1649, v130, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); + svfloat32_t zero255 = svdup_n_f32(0); svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v1649, v248, 90); - svfloat32_t v256; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v110), "w"(v247)); - svfloat32_t v257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v110), "w"(v247)); - svfloat32_t v264; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v198), "w"(v1564)); - svfloat32_t v276; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v245), "w"(v1728)); - svfloat32_t v342; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v199), "w"(v1728)); - svfloat32_t v354; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v246), "w"(v1730)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v421), "w"(v468)); - svfloat32_t v472; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v421), "w"(v468)); + svfloat32_t v256 = svadd_f32_x(svptrue_b32(), v110, v247); + svfloat32_t v257 = svsub_f32_x(svptrue_b32(), v110, v247); + svfloat32_t v264 = svmul_f32_x(svptrue_b32(), v198, v1564); + svfloat32_t v276 = svmul_f32_x(svptrue_b32(), v245, v1728); + svfloat32_t v342 = svmul_f32_x(svptrue_b32(), v199, v1728); + svfloat32_t v354 = svmul_f32_x(svptrue_b32(), v246, v1730); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v421, v468); + svfloat32_t v472 = svadd_f32_x(svptrue_b32(), v421, v468); svfloat32_t v497 = svcmla_f32_x(pred_full, v477, v1773, v477, 90); svfloat32_t v498 = svcmla_f32_x(pred_full, v489, v1649, v489, 90); - svfloat32_t v606; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v556), "w"(v603)); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v556), "w"(v603)); + svfloat32_t v606 = svsub_f32_x(svptrue_b32(), v556, v603); + svfloat32_t 
v607 = svadd_f32_x(svptrue_b32(), v556, v603); svfloat32_t v632 = svcmla_f32_x(pred_full, v612, v1773, v612, 90); svfloat32_t v633 = svcmla_f32_x(pred_full, v624, v1649, v624, 90); - svfloat32_t v647; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v647) : "w"(v469), "w"(v604)); - svfloat32_t v648; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v648) : "w"(v469), "w"(v604)); - svfloat32_t v893; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v893) : "w"(v470), "w"(v1646)); - svfloat32_t v905; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v905) : "w"(v605), "w"(v1648)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v138), "w"(v139)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v139), "w"(v138)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v111), "w"(v255)); - svfloat32_t v259; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v111), "w"(v255)); + svfloat32_t v647 = svadd_f32_x(svptrue_b32(), v469, v604); + svfloat32_t v648 = svsub_f32_x(svptrue_b32(), v469, v604); + svfloat32_t v893 = svmul_f32_x(svptrue_b32(), v470, v1646); + svfloat32_t v905 = svmul_f32_x(svptrue_b32(), v605, v1648); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v138, v139); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v139, v138); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v111, v255); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v111, v255); svfloat32_t v284 = svcmla_f32_x(pred_full, v264, v1565, v198, 90); svfloat32_t v285 = svcmla_f32_x(pred_full, v276, v1729, v245, 90); svfloat32_t v323 = svcmla_f32_x(pred_full, v303, v1773, v303, 90); svfloat32_t v324 = svcmla_f32_x(pred_full, v315, v1649, v315, 90); svfloat32_t v362 = svcmla_f32_x(pred_full, v342, v1729, v199, 90); svfloat32_t v363 = svcmla_f32_x(pred_full, v354, v1731, v246, 90); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v497), "w"(v498)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v498), "w"(v497)); - svfloat32_t v634; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v634) : "w"(v632), "w"(v633)); - svfloat32_t v635; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v633), "w"(v632)); - svfloat32_t zero655; - asm volatile("mov %0.s, #0" : "=w"(zero655)); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v497, v498); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v498, v497); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v632, v633); + svfloat32_t v635 = svsub_f32_x(svptrue_b32(), v633, v632); + svfloat32_t zero655 = svdup_n_f32(0); svfloat32_t v655 = svcmla_f32_x(pred_full, zero655, v1649, v648, 90); - svfloat32_t v656; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v656) : "w"(v256), "w"(v647)); - svfloat32_t v657; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v657) : "w"(v256), "w"(v647)); - svfloat32_t v759; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v759) : "w"(v471), "w"(v1564)); - svfloat32_t v771; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v771) : "w"(v606), "w"(v1728)); - svfloat32_t v1027; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1027) : "w"(v472), "w"(v1728)); - svfloat32_t v1039; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1039) : "w"(v607), "w"(v1730)); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); + svfloat32_t v656 = svadd_f32_x(svptrue_b32(), v256, v647); + svfloat32_t v657 = svsub_f32_x(svptrue_b32(), v256, v647); + svfloat32_t v759 = svmul_f32_x(svptrue_b32(), v471, v1564); + svfloat32_t v771 = svmul_f32_x(svptrue_b32(), v606, v1728); + svfloat32_t v1027 = svmul_f32_x(svptrue_b32(), v472, v1728); + svfloat32_t v1039 = svmul_f32_x(svptrue_b32(), v607, v1730); + svfloat32_t zero148 = svdup_n_f32(0); svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v1773, v141, 90); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v63), "w"(v140)); - svfloat32_t v150; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v63), "w"(v140)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v284), "w"(v285)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v285), "w"(v284)); - 
svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v323), "w"(v324)); - svfloat32_t v326; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v324), "w"(v323)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v362), "w"(v363)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v363), "w"(v362)); - svfloat32_t zero507; - asm volatile("mov %0.s, #0" : "=w"(zero507)); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v63, v140); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v63, v140); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v284, v285); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v285, v284); + svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v323, v324); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v324, v323); + svfloat32_t v364 = svadd_f32_x(svptrue_b32(), v362, v363); + svfloat32_t v365 = svsub_f32_x(svptrue_b32(), v363, v362); + svfloat32_t zero507 = svdup_n_f32(0); svfloat32_t v507 = svcmla_f32_x(pred_full, zero507, v1773, v500, 90); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v422), "w"(v499)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v422), "w"(v499)); - svfloat32_t zero642; - asm volatile("mov %0.s, #0" : "=w"(zero642)); + svfloat32_t v508 = svadd_f32_x(svptrue_b32(), v422, v499); + svfloat32_t v509 = svsub_f32_x(svptrue_b32(), v422, v499); + svfloat32_t zero642 = svdup_n_f32(0); svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1773, v635, 90); - svfloat32_t v643; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v643) : "w"(v557), "w"(v634)); - svfloat32_t v644; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v644) : "w"(v557), "w"(v634)); - svfloat32_t v658; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v257), "w"(v655)); - svfloat32_t v659; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v257), "w"(v655)); + svfloat32_t v643 = svadd_f32_x(svptrue_b32(), v557, v634); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v557, v634); + svfloat32_t v658 = 
svsub_f32_x(svptrue_b32(), v257, v655); + svfloat32_t v659 = svadd_f32_x(svptrue_b32(), v257, v655); svfloat32_t v779 = svcmla_f32_x(pred_full, v759, v1565, v471, 90); svfloat32_t v780 = svcmla_f32_x(pred_full, v771, v1729, v606, 90); svfloat32_t v913 = svcmla_f32_x(pred_full, v893, v1773, v893, 90); @@ -17745,156 +15873,92 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, svfloat32_t v1048 = svcmla_f32_x(pred_full, v1039, v1731, v607, 90); svst1_f64(pred_full, (double *)(v1493), svreinterpret_f64_f32(v656)); svst1_f64(pred_full, (double *)(v1511), svreinterpret_f64_f32(v657)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v64), "w"(v148)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v64), "w"(v148)); - svfloat32_t zero294; - asm volatile("mov %0.s, #0" : "=w"(zero294)); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v64, v148); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v64, v148); + svfloat32_t zero294 = svdup_n_f32(0); svfloat32_t v294 = svcmla_f32_x(pred_full, zero294, v1773, v287, 90); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v149), "w"(v286)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v149), "w"(v286)); - svfloat32_t zero333; - asm volatile("mov %0.s, #0" : "=w"(zero333)); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v149, v286); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v149, v286); + svfloat32_t zero333 = svdup_n_f32(0); svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v1773, v326, 90); - svfloat32_t v334; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v112), "w"(v325)); - svfloat32_t v335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v112), "w"(v325)); - svfloat32_t zero372; - asm volatile("mov %0.s, #0" : "=w"(zero372)); + svfloat32_t v334 = svadd_f32_x(svptrue_b32(), v112, v325); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v112, v325); + svfloat32_t zero372 = svdup_n_f32(0); svfloat32_t v372 
= svcmla_f32_x(pred_full, zero372, v1773, v365, 90); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v423), "w"(v507)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v423), "w"(v507)); - svfloat32_t v645; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v645) : "w"(v558), "w"(v642)); - svfloat32_t v646; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v646) : "w"(v558), "w"(v642)); - svfloat32_t v692; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v692) : "w"(v508), "w"(v1523)); - svfloat32_t v704; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v704) : "w"(v643), "w"(v1605)); - svfloat32_t v781; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v781) : "w"(v779), "w"(v780)); - svfloat32_t v782; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v782) : "w"(v780), "w"(v779)); - svfloat32_t v915; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v915) : "w"(v913), "w"(v914)); - svfloat32_t v916; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v916) : "w"(v914), "w"(v913)); - svfloat32_t v960; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v960) : "w"(v509), "w"(v1687)); - svfloat32_t v972; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v972) : "w"(v644), "w"(v1689)); - svfloat32_t v1049; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1049) : "w"(v1047), "w"(v1048)); - svfloat32_t v1050; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1050) : "w"(v1048), "w"(v1047)); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v423, v507); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v423, v507); + svfloat32_t v645 = svsub_f32_x(svptrue_b32(), v558, v642); + svfloat32_t v646 = svadd_f32_x(svptrue_b32(), v558, v642); + svfloat32_t v692 = svmul_f32_x(svptrue_b32(), v508, v1523); + svfloat32_t v704 = svmul_f32_x(svptrue_b32(), v643, v1605); + svfloat32_t v781 = svadd_f32_x(svptrue_b32(), v779, v780); + svfloat32_t v782 = svsub_f32_x(svptrue_b32(), v780, v779); + svfloat32_t v915 = svadd_f32_x(svptrue_b32(), v913, v914); + svfloat32_t v916 = svsub_f32_x(svptrue_b32(), v914, v913); + svfloat32_t v960 = svmul_f32_x(svptrue_b32(), v509, v1687); + svfloat32_t v972 = 
svmul_f32_x(svptrue_b32(), v644, v1689); + svfloat32_t v1049 = svadd_f32_x(svptrue_b32(), v1047, v1048); + svfloat32_t v1050 = svsub_f32_x(svptrue_b32(), v1048, v1047); svst1_f64(pred_full, (double *)(v1502), svreinterpret_f64_f32(v658)); svst1_f64(pred_full, (double *)(v1520), svreinterpret_f64_f32(v659)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v150), "w"(v294)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v150), "w"(v294)); - svfloat32_t v336; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v113), "w"(v333)); - svfloat32_t v337; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v113), "w"(v333)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v151), "w"(v364)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v151), "w"(v364)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v152), "w"(v372)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v152), "w"(v372)); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v150, v294); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v150, v294); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v113, v333); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v113, v333); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v151, v364); + svfloat32_t v374 = svsub_f32_x(svptrue_b32(), v151, v364); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v152, v372); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v152, v372); svfloat32_t v712 = svcmla_f32_x(pred_full, v692, v1690, v508, 90); svfloat32_t v713 = svcmla_f32_x(pred_full, v704, v1606, v643, 90); - svfloat32_t zero789; - asm volatile("mov %0.s, #0" : "=w"(zero789)); + svfloat32_t zero789 = svdup_n_f32(0); svfloat32_t v789 = svcmla_f32_x(pred_full, zero789, v1773, v782, 90); - svfloat32_t v790; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v790) : "w"(v334), "w"(v781)); - svfloat32_t v791; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v791) : "w"(v334), "w"(v781)); - 
svfloat32_t v826; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v826) : "w"(v510), "w"(v1605)); - svfloat32_t v838; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v645), "w"(v1607)); - svfloat32_t zero923; - asm volatile("mov %0.s, #0" : "=w"(zero923)); + svfloat32_t v790 = svadd_f32_x(svptrue_b32(), v334, v781); + svfloat32_t v791 = svsub_f32_x(svptrue_b32(), v334, v781); + svfloat32_t v826 = svmul_f32_x(svptrue_b32(), v510, v1605); + svfloat32_t v838 = svmul_f32_x(svptrue_b32(), v645, v1607); + svfloat32_t zero923 = svdup_n_f32(0); svfloat32_t v923 = svcmla_f32_x(pred_full, zero923, v1773, v916, 90); - svfloat32_t v924; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v924) : "w"(v258), "w"(v915)); - svfloat32_t v925; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v925) : "w"(v258), "w"(v915)); + svfloat32_t v924 = svadd_f32_x(svptrue_b32(), v258, v915); + svfloat32_t v925 = svsub_f32_x(svptrue_b32(), v258, v915); svfloat32_t v980 = svcmla_f32_x(pred_full, v960, v1688, v509, 90); svfloat32_t v981 = svcmla_f32_x(pred_full, v972, v1690, v644, 90); - svfloat32_t zero1057; - asm volatile("mov %0.s, #0" : "=w"(zero1057)); + svfloat32_t zero1057 = svdup_n_f32(0); svfloat32_t v1057 = svcmla_f32_x(pred_full, zero1057, v1773, v1050, 90); - svfloat32_t v1094; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1094) : "w"(v511), "w"(v1769)); - svfloat32_t v1106; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1106) : "w"(v646), "w"(v1771)); - svfloat32_t v714; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v712), "w"(v713)); - svfloat32_t v715; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v715) : "w"(v713), "w"(v712)); - svfloat32_t v792; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v792) : "w"(v335), "w"(v789)); - svfloat32_t v793; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v793) : "w"(v335), "w"(v789)); + svfloat32_t v1094 = svmul_f32_x(svptrue_b32(), v511, v1769); + svfloat32_t v1106 = svmul_f32_x(svptrue_b32(), v646, v1771); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v712, v713); + svfloat32_t v715 = svsub_f32_x(svptrue_b32(), v713, 
v712); + svfloat32_t v792 = svsub_f32_x(svptrue_b32(), v335, v789); + svfloat32_t v793 = svadd_f32_x(svptrue_b32(), v335, v789); svfloat32_t v846 = svcmla_f32_x(pred_full, v826, v1606, v510, 90); svfloat32_t v847 = svcmla_f32_x(pred_full, v838, v1770, v645, 90); - svfloat32_t v926; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v926) : "w"(v259), "w"(v923)); - svfloat32_t v927; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v927) : "w"(v259), "w"(v923)); - svfloat32_t v982; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v982) : "w"(v980), "w"(v981)); - svfloat32_t v983; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v983) : "w"(v981), "w"(v980)); - svfloat32_t v1058; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1058) : "w"(v336), "w"(v1049)); - svfloat32_t v1059; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1059) : "w"(v336), "w"(v1049)); - svfloat32_t v1060; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1060) : "w"(v337), "w"(v1057)); - svfloat32_t v1061; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1061) : "w"(v337), "w"(v1057)); + svfloat32_t v926 = svsub_f32_x(svptrue_b32(), v259, v923); + svfloat32_t v927 = svadd_f32_x(svptrue_b32(), v259, v923); + svfloat32_t v982 = svadd_f32_x(svptrue_b32(), v980, v981); + svfloat32_t v983 = svsub_f32_x(svptrue_b32(), v981, v980); + svfloat32_t v1058 = svadd_f32_x(svptrue_b32(), v336, v1049); + svfloat32_t v1059 = svsub_f32_x(svptrue_b32(), v336, v1049); + svfloat32_t v1060 = svsub_f32_x(svptrue_b32(), v337, v1057); + svfloat32_t v1061 = svadd_f32_x(svptrue_b32(), v337, v1057); svfloat32_t v1114 = svcmla_f32_x(pred_full, v1094, v1770, v511, 90); svfloat32_t v1115 = svcmla_f32_x(pred_full, v1106, v1772, v646, 90); svst1_f64(pred_full, (double *)(v1575), svreinterpret_f64_f32(v790)); svst1_f64(pred_full, (double *)(v1593), svreinterpret_f64_f32(v791)); svst1_f64(pred_full, (double *)(v1657), svreinterpret_f64_f32(v924)); svst1_f64(pred_full, (double *)(v1675), svreinterpret_f64_f32(v925)); - svfloat32_t zero722; - asm volatile("mov %0.s, #0" : "=w"(zero722)); + svfloat32_t zero722 = 
svdup_n_f32(0); svfloat32_t v722 = svcmla_f32_x(pred_full, zero722, v1773, v715, 90); - svfloat32_t v723; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v295), "w"(v714)); - svfloat32_t v724; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v724) : "w"(v295), "w"(v714)); - svfloat32_t v848; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v848) : "w"(v846), "w"(v847)); - svfloat32_t v849; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v849) : "w"(v847), "w"(v846)); - svfloat32_t zero990; - asm volatile("mov %0.s, #0" : "=w"(zero990)); + svfloat32_t v723 = svadd_f32_x(svptrue_b32(), v295, v714); + svfloat32_t v724 = svsub_f32_x(svptrue_b32(), v295, v714); + svfloat32_t v848 = svadd_f32_x(svptrue_b32(), v846, v847); + svfloat32_t v849 = svsub_f32_x(svptrue_b32(), v847, v846); + svfloat32_t zero990 = svdup_n_f32(0); svfloat32_t v990 = svcmla_f32_x(pred_full, zero990, v1773, v983, 90); - svfloat32_t v991; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v991) : "w"(v297), "w"(v982)); - svfloat32_t v992; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v992) : "w"(v297), "w"(v982)); - svfloat32_t v1116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1116) : "w"(v1114), "w"(v1115)); - svfloat32_t v1117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1117) : "w"(v1115), "w"(v1114)); + svfloat32_t v991 = svadd_f32_x(svptrue_b32(), v297, v982); + svfloat32_t v992 = svsub_f32_x(svptrue_b32(), v297, v982); + svfloat32_t v1116 = svadd_f32_x(svptrue_b32(), v1114, v1115); + svfloat32_t v1117 = svsub_f32_x(svptrue_b32(), v1115, v1114); svst1_f64(pred_full, (double *)(v1584), svreinterpret_f64_f32(v792)); svst1_f64(pred_full, (double *)(v1602), svreinterpret_f64_f32(v793)); svst1_f64(pred_full, (double *)(v1666), svreinterpret_f64_f32(v926)); @@ -17903,40 +15967,26 @@ void armral_fft_cs16_cf32_cf32_ac_n_uu32(const armral_cmplx_int16_t *restrict x, svst1_f64(pred_full, (double *)(v1748), svreinterpret_f64_f32(v1060)); svst1_f64(pred_full, (double *)(v1757), svreinterpret_f64_f32(v1059)); svst1_f64(pred_full, (double *)(v1766), 
svreinterpret_f64_f32(v1061)); - svfloat32_t v725; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v725) : "w"(v296), "w"(v722)); - svfloat32_t v726; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v726) : "w"(v296), "w"(v722)); - svfloat32_t zero856; - asm volatile("mov %0.s, #0" : "=w"(zero856)); + svfloat32_t v725 = svsub_f32_x(svptrue_b32(), v296, v722); + svfloat32_t v726 = svadd_f32_x(svptrue_b32(), v296, v722); + svfloat32_t zero856 = svdup_n_f32(0); svfloat32_t v856 = svcmla_f32_x(pred_full, zero856, v1773, v849, 90); - svfloat32_t v857; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v857) : "w"(v373), "w"(v848)); - svfloat32_t v858; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v858) : "w"(v373), "w"(v848)); - svfloat32_t v993; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v993) : "w"(v298), "w"(v990)); - svfloat32_t v994; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v994) : "w"(v298), "w"(v990)); - svfloat32_t zero1124; - asm volatile("mov %0.s, #0" : "=w"(zero1124)); + svfloat32_t v857 = svadd_f32_x(svptrue_b32(), v373, v848); + svfloat32_t v858 = svsub_f32_x(svptrue_b32(), v373, v848); + svfloat32_t v993 = svsub_f32_x(svptrue_b32(), v298, v990); + svfloat32_t v994 = svadd_f32_x(svptrue_b32(), v298, v990); + svfloat32_t zero1124 = svdup_n_f32(0); svfloat32_t v1124 = svcmla_f32_x(pred_full, zero1124, v1773, v1117, 90); - svfloat32_t v1125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1125) : "w"(v375), "w"(v1116)); - svfloat32_t v1126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1126) : "w"(v375), "w"(v1116)); + svfloat32_t v1125 = svadd_f32_x(svptrue_b32(), v375, v1116); + svfloat32_t v1126 = svsub_f32_x(svptrue_b32(), v375, v1116); svst1_f64(pred_full, (double *)(v1534), svreinterpret_f64_f32(v723)); svst1_f64(pred_full, (double *)(v1552), svreinterpret_f64_f32(v724)); svst1_f64(pred_full, (double *)(v1698), svreinterpret_f64_f32(v991)); svst1_f64(pred_full, (double *)(v1716), svreinterpret_f64_f32(v992)); - svfloat32_t v859; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v859) : "w"(v374), "w"(v856)); - svfloat32_t v860; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v860) : "w"(v374), "w"(v856)); - svfloat32_t v1127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1127) : "w"(v376), "w"(v1124)); - svfloat32_t v1128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1128) : "w"(v376), "w"(v1124)); + svfloat32_t v859 = svsub_f32_x(svptrue_b32(), v374, v856); + svfloat32_t v860 = svadd_f32_x(svptrue_b32(), v374, v856); + svfloat32_t v1127 = svsub_f32_x(svptrue_b32(), v376, v1124); + svfloat32_t v1128 = svadd_f32_x(svptrue_b32(), v376, v1124); svst1_f64(pred_full, (double *)(v1543), svreinterpret_f64_f32(v725)); svst1_f64(pred_full, (double *)(v1561), svreinterpret_f64_f32(v726)); svst1_f64(pred_full, (double *)(v1616), svreinterpret_f64_f32(v857)); diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h index 8b47abc150d1d22e9d83e935eea8bbe58ab0d5ee..150582894413753b500f965b06e3db6860ddc08a 100644 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c deleted file mode 100644 index 9323e563bdb2ff6a2c7581987bfeac58d1ee261e..0000000000000000000000000000000000000000 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c +++ /dev/null @@ -1,22920 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates -*/ -#include "fft_cs16_cf32_cs16_ac_n_uu.h" - -#include -#ifdef ARMRAL_ARCH_SVE -#include -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu2(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { 
- const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v65 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - const int32_t *v127 = &v5[istride]; - int32_t *v146 = &v6[ostride]; - const int32_t *v118 = &v5[0]; - int32_t *v137 = &v6[0]; - int16x4_t v152 = vld1_s16((const int16_t *)v127); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v152), 15); - int16x4_t v150 = vld1_s16((const int16_t *)v118); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v150), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - int16x4_t v51 = vqmovn_s32(vcvtq_n_s32_f32(v37, 15)); - int16x4_t v59 = vqmovn_s32(vcvtq_n_s32_f32(v38, 15)); - vst1_s16((int16_t *)v137, v51); - vst1_s16((int16_t *)v146, v59); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v65 * 2; j < howmany; j += 1) { - int16x4_t v83 = vld1s_s16(&v5[istride]); - int16x4_t v77 = vld1s_s16(&v5[0]); - float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); - float32x2_t v78 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v77)), 15); - float32x2_t v85 = vadd_f32(v78, v84); - float32x2_t v86 = vsub_f32(v78, v84); - int16x4_t v97 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v85, 15), (int32x2_t){0, 0})); - int16x4_t v103 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v86, 15), (int32x2_t){0, 0})); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v97), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v103), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu2(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 
2, howmany * 2); - const int32_t *v78 = &v5[v0]; - int32_t *v99 = &v6[v2]; - const int32_t *v69 = &v5[0]; - int32_t *v90 = &v6[0]; - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v78[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v69[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svint16_t v48 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v34, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v56 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v35, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v90), svreinterpret_u64_s16(v48)); - svst1w_u64(pred_full, (unsigned *)(v99), svreinterpret_u64_s16(v56)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu3(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v93 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v54 = -1.4999999999999998e+00F; - float v58 = 8.6602540378443871e-01F; - float v59 = -8.6602540378443871e-01F; - float32x2_t v61 = (float32x2_t){v4, v4}; - const int32_t *v168 = &v5[istride]; - int32_t *v206 = &v6[ostride]; - float32x2_t v55 = (float32x2_t){v54, v54}; - float32x2_t v60 = (float32x2_t){v58, v59}; - const int32_t *v187 = &v5[0]; - 
int32_t *v197 = &v6[0]; - int16x4_t v219 = vld1_s16((const int16_t *)v168); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v219), 15); - float32x4_t v56 = vcombine_f32(v55, v55); - float32x2_t v62 = vmul_f32(v61, v60); - const int32_t *v177 = &v5[istride * 2]; - int32_t *v215 = &v6[ostride * 2]; - int16x4_t v223 = vld1_s16((const int16_t *)v187); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v223), 15); - float32x4_t v64 = vcombine_f32(v62, v62); - int16x4_t v221 = vld1_s16((const int16_t *)v177); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v221), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v47 = vaddq_f32(v37, v46); - float32x4_t v57 = vmulq_f32(v37, v56); - float32x4_t v63 = vrev64q_f32(v38); - float32x4_t v65 = vmulq_f32(v63, v64); - float32x4_t v66 = vaddq_f32(v47, v57); - int16x4_t v71 = vqmovn_s32(vcvtq_n_s32_f32(v47, 15)); - float32x4_t v67 = vaddq_f32(v66, v65); - float32x4_t v68 = vsubq_f32(v66, v65); - vst1_s16((int16_t *)v197, v71); - int16x4_t v79 = vqmovn_s32(vcvtq_n_s32_f32(v68, 15)); - int16x4_t v87 = vqmovn_s32(vcvtq_n_s32_f32(v67, 15)); - vst1_s16((int16_t *)v206, v79); - vst1_s16((int16_t *)v215, v87); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v93 * 2; j < howmany; j += 1) { - int16x4_t v105 = vld1s_s16(&v5[istride]); - float v127 = -1.4999999999999998e+00F; - float v130 = 8.6602540378443871e-01F; - float v131 = -8.6602540378443871e-01F; - float32x2_t v133 = (float32x2_t){v4, v4}; - float32x2_t v106 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v105)), 15); - int16x4_t v119 = vld1s_s16(&v5[0]); - float32x2_t v128 = (float32x2_t){v127, v127}; - float32x2_t v132 = (float32x2_t){v130, v131}; - int16x4_t v111 = vld1s_s16(&v5[istride * 2]); - float32x2_t v120 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v119)), 15); - float32x2_t v134 = vmul_f32(v133, v132); - float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); - float32x2_t v113 = vadd_f32(v106, v112); - float32x2_t v114 = 
vsub_f32(v106, v112); - float32x2_t v121 = vadd_f32(v113, v120); - float32x2_t v129 = vmul_f32(v113, v128); - float32x2_t v135 = vrev64_f32(v114); - float32x2_t v136 = vmul_f32(v135, v134); - float32x2_t v137 = vadd_f32(v121, v129); - int16x4_t v142 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v121, 15), (int32x2_t){0, 0})); - float32x2_t v138 = vadd_f32(v137, v136); - float32x2_t v139 = vsub_f32(v137, v136); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v142), 0); - int16x4_t v148 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v139, 15), (int32x2_t){0, 0})); - int16x4_t v154 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v138, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v148), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v154), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu3(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v51 = -1.4999999999999998e+00F; - float v56 = -8.6602540378443871e-01F; - const int32_t *v95 = &v5[v0]; - int32_t *v136 = &v6[v2]; - int64_t v27 = v0 * 2; - float v59 = v4 * v56; - int64_t v82 = v2 * 2; - const int32_t *v114 = &v5[0]; - svfloat32_t v118 = svdup_n_f32(v51); - int32_t *v127 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v95[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v104 = &v5[v27]; - svfloat32_t v119 = svdup_n_f32(v59); - int32_t *v145 = &v6[v82]; - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, 
- svld1sh_s32(pred_full, (const int16_t *)&v114[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v104[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t zero61; - asm volatile("mov %0.s, #0" : "=w"(zero61)); - svfloat32_t v61 = svcmla_f32_x(pred_full, zero61, v119, v35, 90); - svfloat32_t v62 = svmla_f32_x(pred_full, v44, v34, v118); - svint16_t v67 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v44, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v63; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v63) : "w"(v62), "w"(v61)); - svfloat32_t v64; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v62), "w"(v61)); - svst1w_u64(pred_full, (unsigned *)(v127), svreinterpret_u64_s16(v67)); - svint16_t v75 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v64, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v83 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v63, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v136), svreinterpret_u64_s16(v75)); - svst1w_u64(pred_full, (unsigned *)(v145), svreinterpret_u64_s16(v83)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu4(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - 
float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v116 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v74 = 1.0000000000000000e+00F; - float v75 = -1.0000000000000000e+00F; - float32x2_t v77 = (float32x2_t){v4, v4}; - const int32_t *v228 = &v5[istride]; - int32_t *v256 = &v6[ostride]; - float32x2_t v76 = (float32x2_t){v74, v75}; - const int32_t *v210 = &v5[0]; - int32_t *v247 = &v6[0]; - int16x4_t v282 = vld1_s16((const int16_t *)v228); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v282), 15); - float32x2_t v78 = vmul_f32(v77, v76); - const int32_t *v219 = &v5[istride * 2]; - const int32_t *v237 = &v5[istride * 3]; - int32_t *v265 = &v6[ostride * 2]; - int32_t *v274 = &v6[ostride * 3]; - int16x4_t v278 = vld1_s16((const int16_t *)v210); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v278), 15); - float32x4_t v80 = vcombine_f32(v78, v78); - int16x4_t v280 = vld1_s16((const int16_t *)v219); - int16x4_t v284 = vld1_s16((const int16_t *)v237); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v280), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v284), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v57 = vaddq_f32(v37, v55); - float32x4_t v58 = vsubq_f32(v37, v55); - float32x4_t v79 = vrev64q_f32(v56); - float32x4_t v81 = vmulq_f32(v79, v80); - int16x4_t v86 = vqmovn_s32(vcvtq_n_s32_f32(v57, 15)); - int16x4_t v102 = vqmovn_s32(vcvtq_n_s32_f32(v58, 15)); - float32x4_t v82 = vaddq_f32(v38, v81); - float32x4_t v83 = vsubq_f32(v38, v81); - vst1_s16((int16_t *)v247, v86); - vst1_s16((int16_t *)v265, v102); - int16x4_t v94 = vqmovn_s32(vcvtq_n_s32_f32(v83, 15)); - int16x4_t v110 = vqmovn_s32(vcvtq_n_s32_f32(v82, 15)); - vst1_s16((int16_t *)v256, v94); - vst1_s16((int16_t *)v274, v110); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v116 * 2; j < 
howmany; j += 1) { - int16x4_t v142 = vld1s_s16(&v5[istride]); - float v166 = 1.0000000000000000e+00F; - float v167 = -1.0000000000000000e+00F; - float32x2_t v169 = (float32x2_t){v4, v4}; - int16x4_t v128 = vld1s_s16(&v5[0]); - float32x2_t v143 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v142)), 15); - float32x2_t v168 = (float32x2_t){v166, v167}; - float32x2_t v129 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v128)), 15); - int16x4_t v134 = vld1s_s16(&v5[istride * 2]); - int16x4_t v148 = vld1s_s16(&v5[istride * 3]); - float32x2_t v170 = vmul_f32(v169, v168); - float32x2_t v135 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v134)), 15); - float32x2_t v149 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v148)), 15); - float32x2_t v136 = vadd_f32(v129, v135); - float32x2_t v137 = vsub_f32(v129, v135); - float32x2_t v150 = vadd_f32(v143, v149); - float32x2_t v151 = vsub_f32(v143, v149); - float32x2_t v152 = vadd_f32(v136, v150); - float32x2_t v153 = vsub_f32(v136, v150); - float32x2_t v171 = vrev64_f32(v151); - float32x2_t v172 = vmul_f32(v171, v170); - int16x4_t v177 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v152, 15), (int32x2_t){0, 0})); - int16x4_t v189 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v153, 15), (int32x2_t){0, 0})); - float32x2_t v173 = vadd_f32(v137, v172); - float32x2_t v174 = vsub_f32(v137, v172); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v177), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v189), 0); - int16x4_t v183 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v174, 15), (int32x2_t){0, 0})); - int16x4_t v195 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v173, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v183), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v195), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu4(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t 
v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v72 = -1.0000000000000000e+00F; - const int32_t *v137 = &v5[v0]; - int32_t *v169 = &v6[v2]; - int64_t v27 = v0 * 2; - int64_t v45 = v0 * 3; - float v75 = v4 * v72; - int64_t v97 = v2 * 2; - int64_t v105 = v2 * 3; - const int32_t *v119 = &v5[0]; - int32_t *v160 = &v6[0]; - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v137[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v128 = &v5[v27]; - const int32_t *v146 = &v5[v45]; - svfloat32_t v152 = svdup_n_f32(v75); - int32_t *v178 = &v6[v97]; - int32_t *v187 = &v6[v105]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v119[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v128[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v146[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v54; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v54) : "w"(v34), "w"(v52)); - svfloat32_t v55; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v55) : "w"(v34), "w"(v52)); - svfloat32_t zero77; - asm volatile("mov %0.s, #0" : "=w"(zero77)); - svfloat32_t v77 = 
svcmla_f32_x(pred_full, zero77, v152, v53, 90); - svfloat32_t v78; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v78) : "w"(v35), "w"(v77)); - svfloat32_t v79; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v79) : "w"(v35), "w"(v77)); - svint16_t v82 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v54, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v98 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v55, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v90 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v79, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v106 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v78, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v160), svreinterpret_u64_s16(v82)); - svst1w_u64(pred_full, (unsigned *)(v178), svreinterpret_u64_s16(v98)); - svst1w_u64(pred_full, (unsigned *)(v169), svreinterpret_u64_s16(v90)); - svst1w_u64(pred_full, (unsigned *)(v187), svreinterpret_u64_s16(v106)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v157 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v75 = -1.2500000000000000e+00F; - float v80 = 5.5901699437494745e-01F; - float v84 = 1.5388417685876268e+00F; - float v85 = 
-1.5388417685876268e+00F; - float v92 = 5.8778525229247325e-01F; - float v93 = -5.8778525229247325e-01F; - float v100 = 3.6327126400268028e-01F; - float v101 = -3.6327126400268028e-01F; - float32x2_t v103 = (float32x2_t){v4, v4}; - const int32_t *v285 = &v5[istride]; - int32_t *v341 = &v6[ostride]; - float32x2_t v76 = (float32x2_t){v75, v75}; - float32x2_t v81 = (float32x2_t){v80, v80}; - float32x2_t v86 = (float32x2_t){v84, v85}; - float32x2_t v94 = (float32x2_t){v92, v93}; - float32x2_t v102 = (float32x2_t){v100, v101}; - const int32_t *v322 = &v5[0]; - int32_t *v332 = &v6[0]; - int16x4_t v372 = vld1_s16((const int16_t *)v285); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v372), 15); - float32x4_t v77 = vcombine_f32(v76, v76); - float32x4_t v82 = vcombine_f32(v81, v81); - float32x2_t v88 = vmul_f32(v103, v86); - float32x2_t v96 = vmul_f32(v103, v94); - float32x2_t v104 = vmul_f32(v103, v102); - const int32_t *v294 = &v5[istride * 4]; - const int32_t *v303 = &v5[istride * 3]; - const int32_t *v312 = &v5[istride * 2]; - int32_t *v350 = &v6[ostride * 2]; - int32_t *v359 = &v6[ostride * 3]; - int32_t *v368 = &v6[ostride * 4]; - int16x4_t v380 = vld1_s16((const int16_t *)v322); - float32x4_t v67 = vcvtq_n_f32_s32(vmovl_s16(v380), 15); - float32x4_t v90 = vcombine_f32(v88, v88); - float32x4_t v98 = vcombine_f32(v96, v96); - float32x4_t v106 = vcombine_f32(v104, v104); - int16x4_t v374 = vld1_s16((const int16_t *)v294); - int16x4_t v376 = vld1_s16((const int16_t *)v303); - int16x4_t v378 = vld1_s16((const int16_t *)v312); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v374), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v376), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v378), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v57 = vaddq_f32(v37, v55); - float32x4_t v58 = vsubq_f32(v37, v55); - float32x4_t v59 = vaddq_f32(v38, 
v56); - float32x4_t v89 = vrev64q_f32(v38); - float32x4_t v105 = vrev64q_f32(v56); - float32x4_t v68 = vaddq_f32(v57, v67); - float32x4_t v78 = vmulq_f32(v57, v77); - float32x4_t v83 = vmulq_f32(v58, v82); - float32x4_t v91 = vmulq_f32(v89, v90); - float32x4_t v97 = vrev64q_f32(v59); - float32x4_t v107 = vmulq_f32(v105, v106); - float32x4_t v99 = vmulq_f32(v97, v98); - float32x4_t v108 = vaddq_f32(v68, v78); - int16x4_t v119 = vqmovn_s32(vcvtq_n_s32_f32(v68, 15)); - float32x4_t v109 = vaddq_f32(v108, v83); - float32x4_t v110 = vsubq_f32(v108, v83); - float32x4_t v111 = vsubq_f32(v91, v99); - float32x4_t v112 = vaddq_f32(v99, v107); - vst1_s16((int16_t *)v332, v119); - float32x4_t v113 = vaddq_f32(v109, v111); - float32x4_t v114 = vsubq_f32(v109, v111); - float32x4_t v115 = vaddq_f32(v110, v112); - float32x4_t v116 = vsubq_f32(v110, v112); - int16x4_t v127 = vqmovn_s32(vcvtq_n_s32_f32(v114, 15)); - int16x4_t v135 = vqmovn_s32(vcvtq_n_s32_f32(v116, 15)); - int16x4_t v143 = vqmovn_s32(vcvtq_n_s32_f32(v115, 15)); - int16x4_t v151 = vqmovn_s32(vcvtq_n_s32_f32(v113, 15)); - vst1_s16((int16_t *)v341, v127); - vst1_s16((int16_t *)v350, v135); - vst1_s16((int16_t *)v359, v143); - vst1_s16((int16_t *)v368, v151); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v157 * 2; j < howmany; j += 1) { - int16x4_t v169 = vld1s_s16(&v5[istride]); - float v208 = -1.2500000000000000e+00F; - float v212 = 5.5901699437494745e-01F; - float v215 = 1.5388417685876268e+00F; - float v216 = -1.5388417685876268e+00F; - float v222 = 5.8778525229247325e-01F; - float v223 = -5.8778525229247325e-01F; - float v229 = 3.6327126400268028e-01F; - float v230 = -3.6327126400268028e-01F; - float32x2_t v232 = (float32x2_t){v4, v4}; - float32x2_t v170 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v169)), 15); - int16x4_t v200 = vld1s_s16(&v5[0]); - float32x2_t v209 = (float32x2_t){v208, v208}; - float32x2_t v213 = (float32x2_t){v212, v212}; - float32x2_t v217 = (float32x2_t){v215, v216}; - float32x2_t v224 = 
(float32x2_t){v222, v223}; - float32x2_t v231 = (float32x2_t){v229, v230}; - int16x4_t v175 = vld1s_s16(&v5[istride * 4]); - int16x4_t v183 = vld1s_s16(&v5[istride * 3]); - int16x4_t v189 = vld1s_s16(&v5[istride * 2]); - float32x2_t v201 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v200)), 15); - float32x2_t v219 = vmul_f32(v232, v217); - float32x2_t v226 = vmul_f32(v232, v224); - float32x2_t v233 = vmul_f32(v232, v231); - float32x2_t v176 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v175)), 15); - float32x2_t v184 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v183)), 15); - float32x2_t v190 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v189)), 15); - float32x2_t v177 = vadd_f32(v170, v176); - float32x2_t v178 = vsub_f32(v170, v176); - float32x2_t v191 = vadd_f32(v184, v190); - float32x2_t v192 = vsub_f32(v184, v190); - float32x2_t v193 = vadd_f32(v177, v191); - float32x2_t v194 = vsub_f32(v177, v191); - float32x2_t v195 = vadd_f32(v178, v192); - float32x2_t v220 = vrev64_f32(v178); - float32x2_t v234 = vrev64_f32(v192); - float32x2_t v202 = vadd_f32(v193, v201); - float32x2_t v210 = vmul_f32(v193, v209); - float32x2_t v214 = vmul_f32(v194, v213); - float32x2_t v221 = vmul_f32(v220, v219); - float32x2_t v227 = vrev64_f32(v195); - float32x2_t v235 = vmul_f32(v234, v233); - float32x2_t v228 = vmul_f32(v227, v226); - float32x2_t v236 = vadd_f32(v202, v210); - int16x4_t v247 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v202, 15), (int32x2_t){0, 0})); - float32x2_t v237 = vadd_f32(v236, v214); - float32x2_t v238 = vsub_f32(v236, v214); - float32x2_t v239 = vsub_f32(v221, v228); - float32x2_t v240 = vadd_f32(v228, v235); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v247), 0); - float32x2_t v241 = vadd_f32(v237, v239); - float32x2_t v242 = vsub_f32(v237, v239); - float32x2_t v243 = vadd_f32(v238, v240); - float32x2_t v244 = vsub_f32(v238, v240); - int16x4_t v253 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v242, 15), (int32x2_t){0, 0})); - int16x4_t v259 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v244, 15), (int32x2_t){0, 0})); - int16x4_t v265 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v243, 15), (int32x2_t){0, 0})); - int16x4_t v271 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v253), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v259), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v265), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v271), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu5(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v72 = -1.2500000000000000e+00F; - float v77 = 5.5901699437494745e-01F; - float v82 = -1.5388417685876268e+00F; - float v89 = -5.8778525229247325e-01F; - float v96 = -3.6327126400268028e-01F; - const int32_t *v157 = &v5[v0]; - int32_t *v219 = &v6[v2]; - int64_t v27 = v0 * 4; - int64_t v37 = v0 * 3; - int64_t v45 = v0 * 2; - float v85 = v4 * v82; - float v92 = v4 * v89; - float v99 = v4 * v96; - int64_t v128 = v2 * 2; - int64_t v136 = v2 * 3; - int64_t v144 = v2 * 4; - const int32_t *v194 = &v5[0]; - svfloat32_t v198 = svdup_n_f32(v72); - svfloat32_t v199 = svdup_n_f32(v77); - int32_t *v210 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v157[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v166 = &v5[v27]; - const int32_t *v175 = &v5[v37]; - const int32_t *v184 = &v5[v45]; - svfloat32_t v200 = 
svdup_n_f32(v85); - svfloat32_t v201 = svdup_n_f32(v92); - svfloat32_t v202 = svdup_n_f32(v99); - int32_t *v228 = &v6[v128]; - int32_t *v237 = &v6[v136]; - int32_t *v246 = &v6[v144]; - svfloat32_t v64 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v194[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v166[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v175[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v184[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v54; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v54) : "w"(v34), "w"(v52)); - svfloat32_t v55; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v55) : "w"(v34), "w"(v52)); - svfloat32_t v56; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v56) : "w"(v35), "w"(v53)); - svfloat32_t zero87; - asm volatile("mov %0.s, #0" : "=w"(zero87)); - svfloat32_t v87 = svcmla_f32_x(pred_full, zero87, v200, v35, 90); - svfloat32_t v65; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v65) : "w"(v54), "w"(v64)); - svfloat32_t zero94; - asm volatile("mov %0.s, #0" : "=w"(zero94)); - svfloat32_t v94 = svcmla_f32_x(pred_full, zero94, v201, v56, 90); - svfloat32_t v102 = svmla_f32_x(pred_full, v65, v54, v198); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v87), "w"(v94)); - svfloat32_t v106 = svcmla_f32_x(pred_full, v94, v202, v53, 90); - svint16_t v113 = svtbl_s16( 
- svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v65, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v103 = svmla_f32_x(pred_full, v102, v55, v199); - svfloat32_t v104 = svmls_f32_x(pred_full, v102, v55, v199); - svst1w_u64(pred_full, (unsigned *)(v210), svreinterpret_u64_s16(v113)); - svfloat32_t v107; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v103), "w"(v105)); - svfloat32_t v108; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v103), "w"(v105)); - svfloat32_t v109; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v104), "w"(v106)); - svfloat32_t v110; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v104), "w"(v106)); - svint16_t v121 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v108, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v129 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v110, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v137 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v109, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v145 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v107, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v219), svreinterpret_u64_s16(v121)); - svst1w_u64(pred_full, (unsigned *)(v228), svreinterpret_u64_s16(v129)); - svst1w_u64(pred_full, (unsigned *)(v237), svreinterpret_u64_s16(v137)); - svst1w_u64(pred_full, (unsigned *)(v246), svreinterpret_u64_s16(v145)); - v5 += v11; - v6 += v12; - } -} 
-#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu6(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v171 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v108 = -1.4999999999999998e+00F; - float v112 = 8.6602540378443871e-01F; - float v113 = -8.6602540378443871e-01F; - float32x2_t v115 = (float32x2_t){v4, v4}; - const int32_t *v355 = &v5[istride]; - int32_t *v392 = &v6[ostride]; - float32x2_t v109 = (float32x2_t){v108, v108}; - float32x2_t v114 = (float32x2_t){v112, v113}; - const int32_t *v310 = &v5[0]; - int32_t *v365 = &v6[0]; - int16x4_t v424 = vld1_s16((const int16_t *)v355); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v424), 15); - float32x4_t v110 = vcombine_f32(v109, v109); - float32x2_t v116 = vmul_f32(v115, v114); - const int32_t *v319 = &v5[istride * 3]; - const int32_t *v328 = &v5[istride * 2]; - const int32_t *v337 = &v5[istride * 5]; - const int32_t *v346 = &v5[istride * 4]; - int32_t *v374 = &v6[ostride * 3]; - int32_t *v383 = &v6[ostride * 4]; - int32_t *v401 = &v6[ostride * 2]; - int32_t *v410 = &v6[ostride * 5]; - int16x4_t v414 = vld1_s16((const int16_t *)v310); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v414), 15); - float32x4_t v118 = vcombine_f32(v116, v116); - int16x4_t v416 = vld1_s16((const int16_t *)v319); - int16x4_t v418 = vld1_s16((const int16_t *)v328); - int16x4_t v420 = vld1_s16((const int16_t *)v337); - int16x4_t v422 = vld1_s16((const int16_t *)v346); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v416), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v418), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v420), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v422), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = 
vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v75 = vaddq_f32(v55, v73); - float32x4_t v76 = vsubq_f32(v55, v73); - float32x4_t v99 = vaddq_f32(v56, v74); - float32x4_t v100 = vsubq_f32(v56, v74); - float32x4_t v77 = vaddq_f32(v75, v37); - float32x4_t v87 = vmulq_f32(v75, v110); - float32x4_t v93 = vrev64q_f32(v76); - float32x4_t v101 = vaddq_f32(v99, v38); - float32x4_t v111 = vmulq_f32(v99, v110); - float32x4_t v117 = vrev64q_f32(v100); - float32x4_t v95 = vmulq_f32(v93, v118); - float32x4_t v96 = vaddq_f32(v77, v87); - float32x4_t v119 = vmulq_f32(v117, v118); - float32x4_t v120 = vaddq_f32(v101, v111); - int16x4_t v125 = vqmovn_s32(vcvtq_n_s32_f32(v77, 15)); - int16x4_t v133 = vqmovn_s32(vcvtq_n_s32_f32(v101, 15)); - float32x4_t v97 = vaddq_f32(v96, v95); - float32x4_t v98 = vsubq_f32(v96, v95); - float32x4_t v121 = vaddq_f32(v120, v119); - float32x4_t v122 = vsubq_f32(v120, v119); - vst1_s16((int16_t *)v365, v125); - vst1_s16((int16_t *)v374, v133); - int16x4_t v141 = vqmovn_s32(vcvtq_n_s32_f32(v98, 15)); - int16x4_t v149 = vqmovn_s32(vcvtq_n_s32_f32(v122, 15)); - int16x4_t v157 = vqmovn_s32(vcvtq_n_s32_f32(v97, 15)); - int16x4_t v165 = vqmovn_s32(vcvtq_n_s32_f32(v121, 15)); - vst1_s16((int16_t *)v383, v141); - vst1_s16((int16_t *)v392, v149); - vst1_s16((int16_t *)v401, v157); - vst1_s16((int16_t *)v410, v165); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v171 * 2; j < howmany; j += 1) { - int16x4_t v217 = vld1s_s16(&v5[istride]); - float v250 = -1.4999999999999998e+00F; - float v253 = 8.6602540378443871e-01F; - float v254 = -8.6602540378443871e-01F; - float32x2_t v256 = (float32x2_t){v4, v4}; - int16x4_t v183 = vld1s_s16(&v5[0]); - float32x2_t v218 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v217)), 15); - float32x2_t v251 = (float32x2_t){v250, v250}; - float32x2_t v255 = (float32x2_t){v253, v254}; - float32x2_t v184 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v183)), 15); - int16x4_t v189 = vld1s_s16(&v5[istride * 3]); - int16x4_t v197 = vld1s_s16(&v5[istride * 2]); - int16x4_t v203 = vld1s_s16(&v5[istride * 5]); - int16x4_t v211 = vld1s_s16(&v5[istride * 4]); - float32x2_t v257 = vmul_f32(v256, v255); - float32x2_t v190 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v189)), 15); - float32x2_t v198 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v197)), 15); - float32x2_t v204 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v203)), 15); - float32x2_t v212 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v211)), 15); - float32x2_t v191 = vadd_f32(v184, v190); - float32x2_t v192 = vsub_f32(v184, v190); - float32x2_t v205 = vadd_f32(v198, v204); - float32x2_t v206 = vsub_f32(v198, v204); - float32x2_t v219 = vadd_f32(v212, v218); - float32x2_t v220 = vsub_f32(v212, v218); - float32x2_t v221 = vadd_f32(v205, v219); - float32x2_t v222 = vsub_f32(v205, v219); - float32x2_t v242 = vadd_f32(v206, v220); - float32x2_t v243 = vsub_f32(v206, v220); - float32x2_t v223 = vadd_f32(v221, v191); - float32x2_t v231 = vmul_f32(v221, v251); - float32x2_t v237 = vrev64_f32(v222); - float32x2_t v244 = vadd_f32(v242, v192); - float32x2_t v252 = vmul_f32(v242, v251); - float32x2_t v258 = vrev64_f32(v243); - float32x2_t v238 = vmul_f32(v237, v257); - float32x2_t v239 = vadd_f32(v223, v231); - float32x2_t v259 = vmul_f32(v258, v257); - float32x2_t v260 = vadd_f32(v244, v252); - int16x4_t v265 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v223, 15), (int32x2_t){0, 0})); - int16x4_t v271 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v244, 15), (int32x2_t){0, 0})); - float32x2_t v240 = vadd_f32(v239, v238); - float32x2_t v241 = vsub_f32(v239, v238); - float32x2_t v261 = vadd_f32(v260, v259); - float32x2_t v262 = vsub_f32(v260, v259); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v265), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v271), 0); - int16x4_t v277 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); - 
int16x4_t v283 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v262, 15), (int32x2_t){0, 0})); - int16x4_t v289 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v240, 15), (int32x2_t){0, 0})); - int16x4_t v295 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v261, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v277), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v283), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v289), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v295), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu6(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v104 = -1.4999999999999998e+00F; - float v109 = -8.6602540378443871e-01F; - const int32_t *v218 = &v5[v0]; - int32_t *v261 = &v6[v2]; - int64_t v27 = v0 * 3; - int64_t v37 = v0 * 2; - int64_t v45 = v0 * 5; - int64_t v55 = v0 * 4; - float v112 = v4 * v109; - int64_t v127 = v2 * 3; - int64_t v135 = v2 * 4; - int64_t v151 = v2 * 2; - int64_t v159 = v2 * 5; - const int32_t *v173 = &v5[0]; - svfloat32_t v225 = svdup_n_f32(v104); - int32_t *v234 = &v6[0]; - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v218[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v182 = &v5[v27]; - const int32_t *v191 = &v5[v37]; - const int32_t *v200 = &v5[v45]; - const int32_t *v209 = &v5[v55]; - svfloat32_t v226 = svdup_n_f32(v112); - int32_t *v243 = &v6[v127]; - int32_t *v252 = &v6[v135]; - int32_t *v270 = &v6[v151]; - 
int32_t *v279 = &v6[v159]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v173[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v182[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v191[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v200[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v209[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v72; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v72) : "w"(v52), "w"(v70)); - svfloat32_t v73; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v73) : "w"(v52), "w"(v70)); - svfloat32_t v95; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v95) : "w"(v53), "w"(v71)); - svfloat32_t v96; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v53), "w"(v71)); - svfloat32_t v74; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v74) : "w"(v72), "w"(v34)); - svfloat32_t zero91; - asm volatile("mov %0.s, #0" : "=w"(zero91)); - svfloat32_t v91 = svcmla_f32_x(pred_full, zero91, v226, v73, 90); - svfloat32_t v97; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v95), "w"(v35)); - svfloat32_t zero114; - asm volatile("mov 
%0.s, #0" : "=w"(zero114)); - svfloat32_t v114 = svcmla_f32_x(pred_full, zero114, v226, v96, 90); - svfloat32_t v92 = svmla_f32_x(pred_full, v74, v72, v225); - svfloat32_t v115 = svmla_f32_x(pred_full, v97, v95, v225); - svint16_t v120 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v74, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v128 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v97, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v93; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v92), "w"(v91)); - svfloat32_t v94; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v92), "w"(v91)); - svfloat32_t v116; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v115), "w"(v114)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v115), "w"(v114)); - svst1w_u64(pred_full, (unsigned *)(v234), svreinterpret_u64_s16(v120)); - svst1w_u64(pred_full, (unsigned *)(v243), svreinterpret_u64_s16(v128)); - svint16_t v136 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v94, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v144 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v117, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v152 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v93, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v160 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v116, (float)(1ULL << 31ULL)))), 
- svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v252), svreinterpret_u64_s16(v136)); - svst1w_u64(pred_full, (unsigned *)(v261), svreinterpret_u64_s16(v144)); - svst1w_u64(pred_full, (unsigned *)(v270), svreinterpret_u64_s16(v152)); - svst1w_u64(pred_full, (unsigned *)(v279), svreinterpret_u64_s16(v160)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v226 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v100 = -1.1666666666666665e+00F; - float v105 = 7.9015646852540022e-01F; - float v110 = 5.5854267289647742e-02F; - float v115 = 7.3430220123575241e-01F; - float v119 = 4.4095855184409838e-01F; - float v120 = -4.4095855184409838e-01F; - float v127 = 3.4087293062393137e-01F; - float v128 = -3.4087293062393137e-01F; - float v135 = -5.3396936033772524e-01F; - float v136 = 5.3396936033772524e-01F; - float v143 = 8.7484229096165667e-01F; - float v144 = -8.7484229096165667e-01F; - float32x2_t v146 = (float32x2_t){v4, v4}; - const int32_t *v412 = &v5[istride]; - int32_t *v486 = &v6[ostride]; - float32x2_t v101 = (float32x2_t){v100, v100}; - float32x2_t v106 = (float32x2_t){v105, v105}; - float32x2_t v111 = (float32x2_t){v110, v110}; - float32x2_t v116 = (float32x2_t){v115, v115}; - float32x2_t v121 = (float32x2_t){v119, v120}; - float32x2_t v129 = (float32x2_t){v127, v128}; - float32x2_t v137 = (float32x2_t){v135, v136}; - float32x2_t v145 = (float32x2_t){v143, v144}; - const int32_t *v467 = &v5[0]; - int32_t *v477 = &v6[0]; - int16x4_t v535 = vld1_s16((const int16_t *)v412); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v535), 15); - float32x4_t v102 = 
vcombine_f32(v101, v101); - float32x4_t v107 = vcombine_f32(v106, v106); - float32x4_t v112 = vcombine_f32(v111, v111); - float32x4_t v117 = vcombine_f32(v116, v116); - float32x2_t v123 = vmul_f32(v146, v121); - float32x2_t v131 = vmul_f32(v146, v129); - float32x2_t v139 = vmul_f32(v146, v137); - float32x2_t v147 = vmul_f32(v146, v145); - const int32_t *v421 = &v5[istride * 6]; - const int32_t *v430 = &v5[istride * 4]; - const int32_t *v439 = &v5[istride * 3]; - const int32_t *v448 = &v5[istride * 2]; - const int32_t *v457 = &v5[istride * 5]; - int32_t *v495 = &v6[ostride * 2]; - int32_t *v504 = &v6[ostride * 3]; - int32_t *v513 = &v6[ostride * 4]; - int32_t *v522 = &v6[ostride * 5]; - int32_t *v531 = &v6[ostride * 6]; - int16x4_t v547 = vld1_s16((const int16_t *)v467); - float32x4_t v84 = vcvtq_n_f32_s32(vmovl_s16(v547), 15); - float32x4_t v125 = vcombine_f32(v123, v123); - float32x4_t v133 = vcombine_f32(v131, v131); - float32x4_t v141 = vcombine_f32(v139, v139); - float32x4_t v149 = vcombine_f32(v147, v147); - int16x4_t v537 = vld1_s16((const int16_t *)v421); - int16x4_t v539 = vld1_s16((const int16_t *)v430); - int16x4_t v541 = vld1_s16((const int16_t *)v439); - int16x4_t v543 = vld1_s16((const int16_t *)v448); - int16x4_t v545 = vld1_s16((const int16_t *)v457); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v537), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v539), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v541), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v543), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v545), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v75 = vaddq_f32(v37, v55); - float32x4_t v86 = vsubq_f32(v37, v55); - float32x4_t v87 = vsubq_f32(v55, v73); - float32x4_t v88 = vsubq_f32(v73, v37); - 
float32x4_t v89 = vaddq_f32(v38, v56); - float32x4_t v91 = vsubq_f32(v38, v56); - float32x4_t v92 = vsubq_f32(v56, v74); - float32x4_t v93 = vsubq_f32(v74, v38); - float32x4_t v76 = vaddq_f32(v75, v73); - float32x4_t v90 = vaddq_f32(v89, v74); - float32x4_t v108 = vmulq_f32(v86, v107); - float32x4_t v113 = vmulq_f32(v87, v112); - float32x4_t v118 = vmulq_f32(v88, v117); - float32x4_t v132 = vrev64q_f32(v91); - float32x4_t v140 = vrev64q_f32(v92); - float32x4_t v148 = vrev64q_f32(v93); - float32x4_t v85 = vaddq_f32(v76, v84); - float32x4_t v103 = vmulq_f32(v76, v102); - float32x4_t v124 = vrev64q_f32(v90); - float32x4_t v134 = vmulq_f32(v132, v133); - float32x4_t v142 = vmulq_f32(v140, v141); - float32x4_t v150 = vmulq_f32(v148, v149); - float32x4_t v126 = vmulq_f32(v124, v125); - float32x4_t v151 = vaddq_f32(v85, v103); - int16x4_t v172 = vqmovn_s32(vcvtq_n_s32_f32(v85, 15)); - float32x4_t v152 = vaddq_f32(v151, v108); - float32x4_t v154 = vsubq_f32(v151, v108); - float32x4_t v156 = vsubq_f32(v151, v113); - float32x4_t v158 = vaddq_f32(v126, v134); - float32x4_t v160 = vsubq_f32(v126, v134); - float32x4_t v162 = vsubq_f32(v126, v142); - vst1_s16((int16_t *)v477, v172); - float32x4_t v153 = vaddq_f32(v152, v113); - float32x4_t v155 = vsubq_f32(v154, v118); - float32x4_t v157 = vaddq_f32(v156, v118); - float32x4_t v159 = vaddq_f32(v158, v142); - float32x4_t v161 = vsubq_f32(v160, v150); - float32x4_t v163 = vaddq_f32(v162, v150); - float32x4_t v164 = vaddq_f32(v153, v159); - float32x4_t v165 = vsubq_f32(v153, v159); - float32x4_t v166 = vaddq_f32(v155, v161); - float32x4_t v167 = vsubq_f32(v155, v161); - float32x4_t v168 = vaddq_f32(v157, v163); - float32x4_t v169 = vsubq_f32(v157, v163); - int16x4_t v180 = vqmovn_s32(vcvtq_n_s32_f32(v165, 15)); - int16x4_t v188 = vqmovn_s32(vcvtq_n_s32_f32(v167, 15)); - int16x4_t v196 = vqmovn_s32(vcvtq_n_s32_f32(v168, 15)); - int16x4_t v204 = vqmovn_s32(vcvtq_n_s32_f32(v169, 15)); - int16x4_t v212 = vqmovn_s32(vcvtq_n_s32_f32(v166, 
15)); - int16x4_t v220 = vqmovn_s32(vcvtq_n_s32_f32(v164, 15)); - vst1_s16((int16_t *)v486, v180); - vst1_s16((int16_t *)v495, v188); - vst1_s16((int16_t *)v504, v196); - vst1_s16((int16_t *)v513, v204); - vst1_s16((int16_t *)v522, v212); - vst1_s16((int16_t *)v531, v220); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v226 * 2; j < howmany; j += 1) { - int16x4_t v238 = vld1s_s16(&v5[istride]); - float v298 = -1.1666666666666665e+00F; - float v302 = 7.9015646852540022e-01F; - float v306 = 5.5854267289647742e-02F; - float v310 = 7.3430220123575241e-01F; - float v313 = 4.4095855184409838e-01F; - float v314 = -4.4095855184409838e-01F; - float v320 = 3.4087293062393137e-01F; - float v321 = -3.4087293062393137e-01F; - float v327 = -5.3396936033772524e-01F; - float v328 = 5.3396936033772524e-01F; - float v334 = 8.7484229096165667e-01F; - float v335 = -8.7484229096165667e-01F; - float32x2_t v337 = (float32x2_t){v4, v4}; - float32x2_t v239 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v238)), 15); - int16x4_t v282 = vld1s_s16(&v5[0]); - float32x2_t v299 = (float32x2_t){v298, v298}; - float32x2_t v303 = (float32x2_t){v302, v302}; - float32x2_t v307 = (float32x2_t){v306, v306}; - float32x2_t v311 = (float32x2_t){v310, v310}; - float32x2_t v315 = (float32x2_t){v313, v314}; - float32x2_t v322 = (float32x2_t){v320, v321}; - float32x2_t v329 = (float32x2_t){v327, v328}; - float32x2_t v336 = (float32x2_t){v334, v335}; - int16x4_t v244 = vld1s_s16(&v5[istride * 6]); - int16x4_t v252 = vld1s_s16(&v5[istride * 4]); - int16x4_t v258 = vld1s_s16(&v5[istride * 3]); - int16x4_t v266 = vld1s_s16(&v5[istride * 2]); - int16x4_t v272 = vld1s_s16(&v5[istride * 5]); - float32x2_t v283 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v282)), 15); - float32x2_t v317 = vmul_f32(v337, v315); - float32x2_t v324 = vmul_f32(v337, v322); - float32x2_t v331 = vmul_f32(v337, v329); - float32x2_t v338 = vmul_f32(v337, v336); - float32x2_t v245 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v244)), 15); - float32x2_t v253 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v252)), 15); - float32x2_t v259 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v258)), 15); - float32x2_t v267 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v266)), 15); - float32x2_t v273 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v272)), 15); - float32x2_t v246 = vadd_f32(v239, v245); - float32x2_t v247 = vsub_f32(v239, v245); - float32x2_t v260 = vadd_f32(v253, v259); - float32x2_t v261 = vsub_f32(v253, v259); - float32x2_t v274 = vadd_f32(v267, v273); - float32x2_t v275 = vsub_f32(v267, v273); - float32x2_t v276 = vadd_f32(v246, v260); - float32x2_t v285 = vsub_f32(v246, v260); - float32x2_t v286 = vsub_f32(v260, v274); - float32x2_t v287 = vsub_f32(v274, v246); - float32x2_t v288 = vadd_f32(v247, v261); - float32x2_t v290 = vsub_f32(v247, v261); - float32x2_t v291 = vsub_f32(v261, v275); - float32x2_t v292 = vsub_f32(v275, v247); - float32x2_t v277 = vadd_f32(v276, v274); - float32x2_t v289 = vadd_f32(v288, v275); - float32x2_t v304 = vmul_f32(v285, v303); - float32x2_t v308 = vmul_f32(v286, v307); - float32x2_t v312 = vmul_f32(v287, v311); - float32x2_t v325 = vrev64_f32(v290); - float32x2_t v332 = vrev64_f32(v291); - float32x2_t v339 = vrev64_f32(v292); - float32x2_t v284 = vadd_f32(v277, v283); - float32x2_t v300 = vmul_f32(v277, v299); - float32x2_t v318 = vrev64_f32(v289); - float32x2_t v326 = vmul_f32(v325, v324); - float32x2_t v333 = vmul_f32(v332, v331); - float32x2_t v340 = vmul_f32(v339, v338); - float32x2_t v319 = vmul_f32(v318, v317); - float32x2_t v341 = vadd_f32(v284, v300); - int16x4_t v362 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v284, 15), (int32x2_t){0, 0})); - float32x2_t v342 = vadd_f32(v341, v304); - float32x2_t v344 = vsub_f32(v341, v304); - float32x2_t v346 = vsub_f32(v341, v308); - float32x2_t v348 = vadd_f32(v319, v326); - float32x2_t v350 = vsub_f32(v319, v326); - float32x2_t v352 = vsub_f32(v319, v333); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v362), 0); - float32x2_t v343 = vadd_f32(v342, v308); - 
float32x2_t v345 = vsub_f32(v344, v312); - float32x2_t v347 = vadd_f32(v346, v312); - float32x2_t v349 = vadd_f32(v348, v333); - float32x2_t v351 = vsub_f32(v350, v340); - float32x2_t v353 = vadd_f32(v352, v340); - float32x2_t v354 = vadd_f32(v343, v349); - float32x2_t v355 = vsub_f32(v343, v349); - float32x2_t v356 = vadd_f32(v345, v351); - float32x2_t v357 = vsub_f32(v345, v351); - float32x2_t v358 = vadd_f32(v347, v353); - float32x2_t v359 = vsub_f32(v347, v353); - int16x4_t v368 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v355, 15), (int32x2_t){0, 0})); - int16x4_t v374 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v357, 15), (int32x2_t){0, 0})); - int16x4_t v380 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v358, 15), (int32x2_t){0, 0})); - int16x4_t v386 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v359, 15), (int32x2_t){0, 0})); - int16x4_t v392 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v356, 15), (int32x2_t){0, 0})); - int16x4_t v398 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v354, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v368), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v374), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v380), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v386), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v392), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v398), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu7(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - 
float v97 = -1.1666666666666665e+00F; - float v102 = 7.9015646852540022e-01F; - float v107 = 5.5854267289647742e-02F; - float v112 = 7.3430220123575241e-01F; - float v117 = -4.4095855184409838e-01F; - float v124 = -3.4087293062393137e-01F; - float v131 = 5.3396936033772524e-01F; - float v138 = -8.7484229096165667e-01F; - const int32_t *v225 = &v5[v0]; - int32_t *v308 = &v6[v2]; - int64_t v27 = v0 * 6; - int64_t v37 = v0 * 4; - int64_t v45 = v0 * 3; - int64_t v55 = v0 * 2; - int64_t v63 = v0 * 5; - float v120 = v4 * v117; - float v127 = v4 * v124; - float v134 = v4 * v131; - float v141 = v4 * v138; - int64_t v180 = v2 * 2; - int64_t v188 = v2 * 3; - int64_t v196 = v2 * 4; - int64_t v204 = v2 * 5; - int64_t v212 = v2 * 6; - const int32_t *v280 = &v5[0]; - svfloat32_t v284 = svdup_n_f32(v97); - svfloat32_t v285 = svdup_n_f32(v102); - svfloat32_t v286 = svdup_n_f32(v107); - svfloat32_t v287 = svdup_n_f32(v112); - int32_t *v299 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v225[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v234 = &v5[v27]; - const int32_t *v243 = &v5[v37]; - const int32_t *v252 = &v5[v45]; - const int32_t *v261 = &v5[v55]; - const int32_t *v270 = &v5[v63]; - svfloat32_t v288 = svdup_n_f32(v120); - svfloat32_t v289 = svdup_n_f32(v127); - svfloat32_t v290 = svdup_n_f32(v134); - svfloat32_t v291 = svdup_n_f32(v141); - int32_t *v317 = &v6[v180]; - int32_t *v326 = &v6[v188]; - int32_t *v335 = &v6[v196]; - int32_t *v344 = &v6[v204]; - int32_t *v353 = &v6[v212]; - svfloat32_t v81 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v280[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v234[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, 
(const int16_t *)&v243[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v252[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v261[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v270[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v72; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v72) : "w"(v34), "w"(v52)); - svfloat32_t v83; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v83) : "w"(v34), "w"(v52)); - svfloat32_t v84; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v84) : "w"(v52), "w"(v70)); - svfloat32_t v85; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v70), "w"(v34)); - svfloat32_t v86; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v86) : "w"(v35), "w"(v53)); - svfloat32_t v88; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v35), "w"(v53)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v53), "w"(v71)); - svfloat32_t v90; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v71), "w"(v35)); - svfloat32_t v73; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v73) : "w"(v72), "w"(v70)); - svfloat32_t v87; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v87) : "w"(v86), "w"(v71)); - svfloat32_t zero129; - asm volatile("mov %0.s, #0" : "=w"(zero129)); - svfloat32_t v129 = svcmla_f32_x(pred_full, zero129, 
v289, v88, 90); - svfloat32_t zero136; - asm volatile("mov %0.s, #0" : "=w"(zero136)); - svfloat32_t v136 = svcmla_f32_x(pred_full, zero136, v290, v89, 90); - svfloat32_t zero143; - asm volatile("mov %0.s, #0" : "=w"(zero143)); - svfloat32_t v143 = svcmla_f32_x(pred_full, zero143, v291, v90, 90); - svfloat32_t v82; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v73), "w"(v81)); - svfloat32_t zero122; - asm volatile("mov %0.s, #0" : "=w"(zero122)); - svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v288, v87, 90); - svfloat32_t v144 = svmla_f32_x(pred_full, v82, v73, v284); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v122), "w"(v129)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v122), "w"(v129)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v122), "w"(v136)); - svint16_t v165 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v82, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v145 = svmla_f32_x(pred_full, v144, v83, v285); - svfloat32_t v147 = svmls_f32_x(pred_full, v144, v83, v285); - svfloat32_t v149 = svmls_f32_x(pred_full, v144, v84, v286); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v151), "w"(v136)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v153), "w"(v143)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v155), "w"(v143)); - svst1w_u64(pred_full, (unsigned *)(v299), svreinterpret_u64_s16(v165)); - svfloat32_t v146 = svmla_f32_x(pred_full, v145, v84, v286); - svfloat32_t v148 = svmls_f32_x(pred_full, v147, v85, v287); - svfloat32_t v150 = svmla_f32_x(pred_full, v149, v85, v287); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v146), "w"(v152)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v146), "w"(v152)); - svfloat32_t v159; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v159) : "w"(v148), "w"(v154)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v148), "w"(v154)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v150), "w"(v156)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v150), "w"(v156)); - svint16_t v173 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v181 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v160, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v189 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v161, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v197 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v162, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v205 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v213 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v308), svreinterpret_u64_s16(v173)); - svst1w_u64(pred_full, (unsigned *)(v317), svreinterpret_u64_s16(v181)); - svst1w_u64(pred_full, (unsigned *)(v326), svreinterpret_u64_s16(v189)); - svst1w_u64(pred_full, (unsigned *)(v335), svreinterpret_u64_s16(v197)); - 
svst1w_u64(pred_full, (unsigned *)(v344), svreinterpret_u64_s16(v205)); - svst1w_u64(pred_full, (unsigned *)(v353), svreinterpret_u64_s16(v213)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v224 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v129 = 1.0000000000000000e+00F; - float v130 = -1.0000000000000000e+00F; - float v138 = -7.0710678118654746e-01F; - float32x2_t v140 = (float32x2_t){v4, v4}; - float v146 = 7.0710678118654757e-01F; - const int32_t *v442 = &v5[istride]; - int32_t *v488 = &v6[ostride]; - float32x2_t v131 = (float32x2_t){v129, v130}; - float32x2_t v139 = (float32x2_t){v146, v138}; - float32x2_t v147 = (float32x2_t){v146, v146}; - const int32_t *v406 = &v5[0]; - int32_t *v479 = &v6[0]; - int16x4_t v554 = vld1_s16((const int16_t *)v442); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v554), 15); - float32x2_t v133 = vmul_f32(v140, v131); - float32x2_t v141 = vmul_f32(v140, v139); - float32x4_t v148 = vcombine_f32(v147, v147); - const int32_t *v415 = &v5[istride * 4]; - const int32_t *v424 = &v5[istride * 2]; - const int32_t *v433 = &v5[istride * 6]; - const int32_t *v451 = &v5[istride * 5]; - const int32_t *v460 = &v5[istride * 3]; - const int32_t *v469 = &v5[istride * 7]; - int32_t *v497 = &v6[ostride * 2]; - int32_t *v506 = &v6[ostride * 3]; - int32_t *v515 = &v6[ostride * 4]; - int32_t *v524 = &v6[ostride * 5]; - int32_t *v533 = &v6[ostride * 6]; - int32_t *v542 = &v6[ostride * 7]; - int16x4_t v546 = vld1_s16((const int16_t *)v406); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v546), 15); - float32x4_t v135 = vcombine_f32(v133, v133); - float32x4_t v143 = vcombine_f32(v141, v141); - int16x4_t v548 = 
vld1_s16((const int16_t *)v415); - int16x4_t v550 = vld1_s16((const int16_t *)v424); - int16x4_t v552 = vld1_s16((const int16_t *)v433); - int16x4_t v556 = vld1_s16((const int16_t *)v451); - int16x4_t v558 = vld1_s16((const int16_t *)v460); - int16x4_t v560 = vld1_s16((const int16_t *)v469); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v548), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v550), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v552), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v556), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v558), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v560), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v93 = vaddq_f32(v37, v55); - float32x4_t v94 = vsubq_f32(v37, v55); - float32x4_t v95 = vaddq_f32(v73, v91); - float32x4_t v96 = vsubq_f32(v73, v91); - float32x4_t v99 = vaddq_f32(v74, v92); - float32x4_t v100 = vsubq_f32(v74, v92); - float32x4_t v134 = vrev64q_f32(v56); - float32x4_t v97 = vaddq_f32(v93, v95); - float32x4_t v98 = vsubq_f32(v93, v95); - float32x4_t v121 = vrev64q_f32(v96); - float32x4_t v136 = vmulq_f32(v134, v135); - float32x4_t v142 = vrev64q_f32(v99); - float32x4_t v149 = vmulq_f32(v100, v148); - float32x4_t v123 = vmulq_f32(v121, v135); - float32x4_t v144 = vmulq_f32(v142, v143); - float32x4_t v152 = vaddq_f32(v38, v149); - float32x4_t v153 = vsubq_f32(v38, v149); - int16x4_t v162 = vqmovn_s32(vcvtq_n_s32_f32(v97, 15)); - int16x4_t v194 = vqmovn_s32(vcvtq_n_s32_f32(v98, 15)); - float32x4_t v150 = vaddq_f32(v94, v123); - float32x4_t v151 = vsubq_f32(v94, v123); - float32x4_t v154 = vaddq_f32(v136, v144); - float32x4_t v155 = vsubq_f32(v136, v144); - vst1_s16((int16_t *)v479, v162); 
- vst1_s16((int16_t *)v515, v194); - float32x4_t v156 = vaddq_f32(v152, v154); - float32x4_t v157 = vsubq_f32(v152, v154); - float32x4_t v158 = vaddq_f32(v153, v155); - float32x4_t v159 = vsubq_f32(v153, v155); - int16x4_t v178 = vqmovn_s32(vcvtq_n_s32_f32(v151, 15)); - int16x4_t v210 = vqmovn_s32(vcvtq_n_s32_f32(v150, 15)); - int16x4_t v170 = vqmovn_s32(vcvtq_n_s32_f32(v157, 15)); - int16x4_t v186 = vqmovn_s32(vcvtq_n_s32_f32(v158, 15)); - int16x4_t v202 = vqmovn_s32(vcvtq_n_s32_f32(v159, 15)); - int16x4_t v218 = vqmovn_s32(vcvtq_n_s32_f32(v156, 15)); - vst1_s16((int16_t *)v497, v178); - vst1_s16((int16_t *)v533, v210); - vst1_s16((int16_t *)v488, v170); - vst1_s16((int16_t *)v506, v186); - vst1_s16((int16_t *)v524, v202); - vst1_s16((int16_t *)v542, v218); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v224 * 2; j < howmany; j += 1) { - int16x4_t v264 = vld1s_s16(&v5[istride]); - float v319 = 1.0000000000000000e+00F; - float v320 = -1.0000000000000000e+00F; - float v327 = -7.0710678118654746e-01F; - float32x2_t v329 = (float32x2_t){v4, v4}; - float v334 = 7.0710678118654757e-01F; - int16x4_t v236 = vld1s_s16(&v5[0]); - float32x2_t v265 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v264)), 15); - float32x2_t v321 = (float32x2_t){v319, v320}; - float32x2_t v328 = (float32x2_t){v334, v327}; - float32x2_t v335 = (float32x2_t){v334, v334}; - float32x2_t v237 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v236)), 15); - int16x4_t v242 = vld1s_s16(&v5[istride * 4]); - int16x4_t v250 = vld1s_s16(&v5[istride * 2]); - int16x4_t v256 = vld1s_s16(&v5[istride * 6]); - int16x4_t v270 = vld1s_s16(&v5[istride * 5]); - int16x4_t v278 = vld1s_s16(&v5[istride * 3]); - int16x4_t v284 = vld1s_s16(&v5[istride * 7]); - float32x2_t v323 = vmul_f32(v329, v321); - float32x2_t v330 = vmul_f32(v329, v328); - float32x2_t v243 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v242)), 15); - float32x2_t v251 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v250)), 15); - float32x2_t v257 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v256)), 15); - float32x2_t v271 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v270)), 15); - float32x2_t v279 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v278)), 15); - float32x2_t v285 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v284)), 15); - float32x2_t v244 = vadd_f32(v237, v243); - float32x2_t v245 = vsub_f32(v237, v243); - float32x2_t v258 = vadd_f32(v251, v257); - float32x2_t v259 = vsub_f32(v251, v257); - float32x2_t v272 = vadd_f32(v265, v271); - float32x2_t v273 = vsub_f32(v265, v271); - float32x2_t v286 = vadd_f32(v279, v285); - float32x2_t v287 = vsub_f32(v279, v285); - float32x2_t v288 = vadd_f32(v244, v258); - float32x2_t v289 = vsub_f32(v244, v258); - float32x2_t v290 = vadd_f32(v272, v286); - float32x2_t v291 = vsub_f32(v272, v286); - float32x2_t v294 = vadd_f32(v273, v287); - float32x2_t v295 = vsub_f32(v273, v287); - float32x2_t v324 = vrev64_f32(v259); - float32x2_t v292 = vadd_f32(v288, v290); - float32x2_t v293 = vsub_f32(v288, v290); - float32x2_t v313 = vrev64_f32(v291); - float32x2_t v325 = vmul_f32(v324, v323); - float32x2_t v331 = vrev64_f32(v294); - float32x2_t v336 = vmul_f32(v295, v335); - float32x2_t v314 = vmul_f32(v313, v323); - float32x2_t v332 = vmul_f32(v331, v330); - float32x2_t v339 = vadd_f32(v245, v336); - float32x2_t v340 = vsub_f32(v245, v336); - int16x4_t v349 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v292, 15), (int32x2_t){0, 0})); - int16x4_t v373 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v293, 15), (int32x2_t){0, 0})); - float32x2_t v337 = vadd_f32(v289, v314); - float32x2_t v338 = vsub_f32(v289, v314); - float32x2_t v341 = vadd_f32(v325, v332); - float32x2_t v342 = vsub_f32(v325, v332); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v349), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v373), 0); - float32x2_t v343 = vadd_f32(v339, v341); - float32x2_t v344 = vsub_f32(v339, v341); - float32x2_t v345 = vadd_f32(v340, v342); - float32x2_t v346 = vsub_f32(v340, v342); - int16x4_t v361 = 
- vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v338, 15), (int32x2_t){0, 0})); - int16x4_t v385 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v337, 15), (int32x2_t){0, 0})); - int16x4_t v355 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v344, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v361), 0); - int16x4_t v367 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v345, 15), (int32x2_t){0, 0})); - int16x4_t v379 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v346, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v385), 0); - int16x4_t v391 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v343, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v355), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v367), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v379), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v391), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu8(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v126 = -1.0000000000000000e+00F; - float v133 = -7.0710678118654746e-01F; - float v140 = 7.0710678118654757e-01F; - const int32_t *v261 = &v5[v0]; - int32_t *v315 = &v6[v2]; - int64_t v27 = v0 * 4; - int64_t v37 = v0 * 2; - int64_t v45 = v0 * 6; - int64_t v63 = v0 * 5; - int64_t v73 = v0 * 3; - int64_t v81 = v0 * 7; - float v129 = v4 * v126; - float v136 = v4 * v133; - int64_t v171 = v2 * 2; - int64_t v179 = v2 * 3; - int64_t v187 = v2 * 4; - int64_t v195 = v2 * 5; - int64_t v203 = 
v2 * 6; - int64_t v211 = v2 * 7; - const int32_t *v225 = &v5[0]; - svfloat32_t v298 = svdup_n_f32(v140); - int32_t *v306 = &v6[0]; - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v261[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v234 = &v5[v27]; - const int32_t *v243 = &v5[v37]; - const int32_t *v252 = &v5[v45]; - const int32_t *v270 = &v5[v63]; - const int32_t *v279 = &v5[v73]; - const int32_t *v288 = &v5[v81]; - svfloat32_t v296 = svdup_n_f32(v129); - svfloat32_t v297 = svdup_n_f32(v136); - int32_t *v324 = &v6[v171]; - int32_t *v333 = &v6[v179]; - int32_t *v342 = &v6[v187]; - int32_t *v351 = &v6[v195]; - int32_t *v360 = &v6[v203]; - int32_t *v369 = &v6[v211]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v225[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v234[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v243[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v252[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v270[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v279[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v288[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : 
"w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v34), "w"(v52)); - svfloat32_t v91; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v34), "w"(v52)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v70), "w"(v88)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v70), "w"(v88)); - svfloat32_t v96; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v96) : "w"(v71), "w"(v89)); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v71), "w"(v89)); - svfloat32_t zero131; - asm volatile("mov %0.s, #0" : "=w"(zero131)); - svfloat32_t v131 = svcmla_f32_x(pred_full, zero131, v296, v53, 90); - svfloat32_t v94; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v94) : "w"(v90), "w"(v92)); - svfloat32_t v95; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v95) : "w"(v90), "w"(v92)); - svfloat32_t zero119; - asm volatile("mov %0.s, #0" : "=w"(zero119)); - svfloat32_t v119 = svcmla_f32_x(pred_full, zero119, v296, v93, 90); - svfloat32_t zero138; - asm volatile("mov %0.s, #0" : "=w"(zero138)); - svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v297, v96, 90); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v91), "w"(v119)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v91), "w"(v119)); - svfloat32_t v146 = svmla_f32_x(pred_full, v35, v97, v298); - svfloat32_t v147 = svmls_f32_x(pred_full, v35, v97, v298); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v131), 
"w"(v138)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v131), "w"(v138)); - svint16_t v156 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v94, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v188 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v95, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v146), "w"(v148)); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v146), "w"(v148)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v147), "w"(v149)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v147), "w"(v149)); - svint16_t v172 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v145, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v204 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v144, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v306), svreinterpret_u64_s16(v156)); - svst1w_u64(pred_full, (unsigned *)(v342), svreinterpret_u64_s16(v188)); - svint16_t v164 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v151, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v180 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v152, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v196 = 
svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v153, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v212 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v150, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v324), svreinterpret_u64_s16(v172)); - svst1w_u64(pred_full, (unsigned *)(v360), svreinterpret_u64_s16(v204)); - svst1w_u64(pred_full, (unsigned *)(v315), svreinterpret_u64_s16(v164)); - svst1w_u64(pred_full, (unsigned *)(v333), svreinterpret_u64_s16(v180)); - svst1w_u64(pred_full, (unsigned *)(v351), svreinterpret_u64_s16(v196)); - svst1w_u64(pred_full, (unsigned *)(v369), svreinterpret_u64_s16(v212)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v280 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v119 = -5.0000000000000000e-01F; - float v132 = -1.4999999999999998e+00F; - float v136 = 8.6602540378443871e-01F; - float v137 = -8.6602540378443871e-01F; - float v145 = 7.6604444311897801e-01F; - float v150 = 9.3969262078590832e-01F; - float v155 = -1.7364817766693039e-01F; - float v159 = 6.4278760968653925e-01F; - float v160 = -6.4278760968653925e-01F; - float v167 = -3.4202014332566888e-01F; - float v168 = 3.4202014332566888e-01F; - float v175 = 9.8480775301220802e-01F; - float v176 = -9.8480775301220802e-01F; - float32x2_t v178 = (float32x2_t){v4, v4}; - const int32_t *v510 = &v5[istride]; - int32_t *v602 = &v6[ostride]; - float32x2_t v120 = (float32x2_t){v119, 
v119}; - float32x2_t v133 = (float32x2_t){v132, v132}; - float32x2_t v138 = (float32x2_t){v136, v137}; - float32x2_t v146 = (float32x2_t){v145, v145}; - float32x2_t v151 = (float32x2_t){v150, v150}; - float32x2_t v156 = (float32x2_t){v155, v155}; - float32x2_t v161 = (float32x2_t){v159, v160}; - float32x2_t v169 = (float32x2_t){v167, v168}; - float32x2_t v177 = (float32x2_t){v175, v176}; - const int32_t *v583 = &v5[0]; - int32_t *v593 = &v6[0]; - int16x4_t v669 = vld1_s16((const int16_t *)v510); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v669), 15); - float32x4_t v121 = vcombine_f32(v120, v120); - float32x4_t v134 = vcombine_f32(v133, v133); - float32x2_t v140 = vmul_f32(v178, v138); - float32x4_t v147 = vcombine_f32(v146, v146); - float32x4_t v152 = vcombine_f32(v151, v151); - float32x4_t v157 = vcombine_f32(v156, v156); - float32x2_t v163 = vmul_f32(v178, v161); - float32x2_t v171 = vmul_f32(v178, v169); - float32x2_t v179 = vmul_f32(v178, v177); - const int32_t *v519 = &v5[istride * 8]; - const int32_t *v528 = &v5[istride * 7]; - const int32_t *v537 = &v5[istride * 2]; - const int32_t *v546 = &v5[istride * 3]; - const int32_t *v555 = &v5[istride * 6]; - const int32_t *v564 = &v5[istride * 4]; - const int32_t *v573 = &v5[istride * 5]; - int32_t *v611 = &v6[ostride * 2]; - int32_t *v620 = &v6[ostride * 3]; - int32_t *v629 = &v6[ostride * 4]; - int32_t *v638 = &v6[ostride * 5]; - int32_t *v647 = &v6[ostride * 6]; - int32_t *v656 = &v6[ostride * 7]; - int32_t *v665 = &v6[ostride * 8]; - int16x4_t v685 = vld1_s16((const int16_t *)v583); - float32x4_t v103 = vcvtq_n_f32_s32(vmovl_s16(v685), 15); - float32x4_t v142 = vcombine_f32(v140, v140); - float32x4_t v165 = vcombine_f32(v163, v163); - float32x4_t v173 = vcombine_f32(v171, v171); - float32x4_t v181 = vcombine_f32(v179, v179); - int16x4_t v671 = vld1_s16((const int16_t *)v519); - int16x4_t v673 = vld1_s16((const int16_t *)v528); - int16x4_t v675 = vld1_s16((const int16_t *)v537); - int16x4_t v677 = 
vld1_s16((const int16_t *)v546); - int16x4_t v679 = vld1_s16((const int16_t *)v555); - int16x4_t v681 = vld1_s16((const int16_t *)v564); - int16x4_t v683 = vld1_s16((const int16_t *)v573); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v671), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v673), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v675), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v677), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v679), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v681), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v683), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v93 = vaddq_f32(v37, v55); - float32x4_t v105 = vaddq_f32(v38, v56); - float32x4_t v107 = vsubq_f32(v37, v55); - float32x4_t v108 = vsubq_f32(v55, v91); - float32x4_t v109 = vsubq_f32(v91, v37); - float32x4_t v110 = vsubq_f32(v38, v56); - float32x4_t v111 = vsubq_f32(v56, v92); - float32x4_t v112 = vsubq_f32(v92, v38); - float32x4_t v135 = vmulq_f32(v73, v134); - float32x4_t v141 = vrev64q_f32(v74); - float32x4_t v94 = vaddq_f32(v93, v91); - float32x4_t v106 = vaddq_f32(v105, v92); - float32x4_t v143 = vmulq_f32(v141, v142); - float32x4_t v148 = vmulq_f32(v107, v147); - float32x4_t v153 = vmulq_f32(v108, v152); - float32x4_t v158 = vmulq_f32(v109, v157); - float32x4_t v164 = vrev64q_f32(v110); - float32x4_t v172 = vrev64q_f32(v111); - float32x4_t v180 = vrev64q_f32(v112); - float32x4_t v95 = vaddq_f32(v94, v73); - float32x4_t v122 = vmulq_f32(v94, v121); - float32x4_t v128 = vrev64q_f32(v106); - float32x4_t v166 = vmulq_f32(v164, v165); - float32x4_t v174 = vmulq_f32(v172, v173); - float32x4_t v182 = vmulq_f32(v180, v181); - float32x4_t v104 = 
vaddq_f32(v95, v103); - float32x4_t v130 = vmulq_f32(v128, v142); - float32x4_t v183 = vaddq_f32(v122, v122); - float32x4_t v196 = vaddq_f32(v143, v166); - float32x4_t v198 = vsubq_f32(v143, v174); - float32x4_t v200 = vsubq_f32(v143, v166); - float32x4_t v184 = vaddq_f32(v183, v122); - float32x4_t v188 = vaddq_f32(v104, v135); - float32x4_t v197 = vaddq_f32(v196, v174); - float32x4_t v199 = vaddq_f32(v198, v182); - float32x4_t v201 = vsubq_f32(v200, v182); - int16x4_t v210 = vqmovn_s32(vcvtq_n_s32_f32(v104, 15)); - float32x4_t v185 = vaddq_f32(v104, v184); - float32x4_t v189 = vaddq_f32(v188, v183); - vst1_s16((int16_t *)v593, v210); - float32x4_t v186 = vaddq_f32(v185, v130); - float32x4_t v187 = vsubq_f32(v185, v130); - float32x4_t v190 = vaddq_f32(v189, v148); - float32x4_t v192 = vsubq_f32(v189, v153); - float32x4_t v194 = vsubq_f32(v189, v148); - float32x4_t v191 = vaddq_f32(v190, v153); - float32x4_t v193 = vaddq_f32(v192, v158); - float32x4_t v195 = vsubq_f32(v194, v158); - int16x4_t v234 = vqmovn_s32(vcvtq_n_s32_f32(v187, 15)); - int16x4_t v258 = vqmovn_s32(vcvtq_n_s32_f32(v186, 15)); - float32x4_t v202 = vaddq_f32(v191, v197); - float32x4_t v203 = vsubq_f32(v191, v197); - float32x4_t v204 = vaddq_f32(v193, v199); - float32x4_t v205 = vsubq_f32(v193, v199); - float32x4_t v206 = vaddq_f32(v195, v201); - float32x4_t v207 = vsubq_f32(v195, v201); - vst1_s16((int16_t *)v620, v234); - vst1_s16((int16_t *)v647, v258); - int16x4_t v218 = vqmovn_s32(vcvtq_n_s32_f32(v203, 15)); - int16x4_t v226 = vqmovn_s32(vcvtq_n_s32_f32(v204, 15)); - int16x4_t v242 = vqmovn_s32(vcvtq_n_s32_f32(v207, 15)); - int16x4_t v250 = vqmovn_s32(vcvtq_n_s32_f32(v206, 15)); - int16x4_t v266 = vqmovn_s32(vcvtq_n_s32_f32(v205, 15)); - int16x4_t v274 = vqmovn_s32(vcvtq_n_s32_f32(v202, 15)); - vst1_s16((int16_t *)v602, v218); - vst1_s16((int16_t *)v611, v226); - vst1_s16((int16_t *)v629, v242); - vst1_s16((int16_t *)v638, v250); - vst1_s16((int16_t *)v656, v266); - vst1_s16((int16_t *)v665, 
v274); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v280 * 2; j < howmany; j += 1) { - int16x4_t v292 = vld1s_s16(&v5[istride]); - float v367 = -5.0000000000000000e-01F; - float v378 = -1.4999999999999998e+00F; - float v381 = 8.6602540378443871e-01F; - float v382 = -8.6602540378443871e-01F; - float v389 = 7.6604444311897801e-01F; - float v393 = 9.3969262078590832e-01F; - float v397 = -1.7364817766693039e-01F; - float v400 = 6.4278760968653925e-01F; - float v401 = -6.4278760968653925e-01F; - float v407 = -3.4202014332566888e-01F; - float v408 = 3.4202014332566888e-01F; - float v414 = 9.8480775301220802e-01F; - float v415 = -9.8480775301220802e-01F; - float32x2_t v417 = (float32x2_t){v4, v4}; - float32x2_t v293 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v292)), 15); - int16x4_t v351 = vld1s_s16(&v5[0]); - float32x2_t v368 = (float32x2_t){v367, v367}; - float32x2_t v379 = (float32x2_t){v378, v378}; - float32x2_t v383 = (float32x2_t){v381, v382}; - float32x2_t v390 = (float32x2_t){v389, v389}; - float32x2_t v394 = (float32x2_t){v393, v393}; - float32x2_t v398 = (float32x2_t){v397, v397}; - float32x2_t v402 = (float32x2_t){v400, v401}; - float32x2_t v409 = (float32x2_t){v407, v408}; - float32x2_t v416 = (float32x2_t){v414, v415}; - int16x4_t v298 = vld1s_s16(&v5[istride * 8]); - int16x4_t v306 = vld1s_s16(&v5[istride * 7]); - int16x4_t v312 = vld1s_s16(&v5[istride * 2]); - int16x4_t v320 = vld1s_s16(&v5[istride * 3]); - int16x4_t v326 = vld1s_s16(&v5[istride * 6]); - int16x4_t v334 = vld1s_s16(&v5[istride * 4]); - int16x4_t v340 = vld1s_s16(&v5[istride * 5]); - float32x2_t v352 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v351)), 15); - float32x2_t v385 = vmul_f32(v417, v383); - float32x2_t v404 = vmul_f32(v417, v402); - float32x2_t v411 = vmul_f32(v417, v409); - float32x2_t v418 = vmul_f32(v417, v416); - float32x2_t v299 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v298)), 15); - float32x2_t v307 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v306)), 15); - float32x2_t v313 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v312)), 15); - float32x2_t v321 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v320)), 15); - float32x2_t v327 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v326)), 15); - float32x2_t v335 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v334)), 15); - float32x2_t v341 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v340)), 15); - float32x2_t v300 = vadd_f32(v293, v299); - float32x2_t v301 = vsub_f32(v293, v299); - float32x2_t v314 = vadd_f32(v307, v313); - float32x2_t v315 = vsub_f32(v307, v313); - float32x2_t v328 = vadd_f32(v321, v327); - float32x2_t v329 = vsub_f32(v321, v327); - float32x2_t v342 = vadd_f32(v335, v341); - float32x2_t v343 = vsub_f32(v335, v341); - float32x2_t v344 = vadd_f32(v300, v314); - float32x2_t v354 = vadd_f32(v301, v315); - float32x2_t v356 = vsub_f32(v300, v314); - float32x2_t v357 = vsub_f32(v314, v342); - float32x2_t v358 = vsub_f32(v342, v300); - float32x2_t v359 = vsub_f32(v301, v315); - float32x2_t v360 = vsub_f32(v315, v343); - float32x2_t v361 = vsub_f32(v343, v301); - float32x2_t v380 = vmul_f32(v328, v379); - float32x2_t v386 = vrev64_f32(v329); - float32x2_t v345 = vadd_f32(v344, v342); - float32x2_t v355 = vadd_f32(v354, v343); - float32x2_t v387 = vmul_f32(v386, v385); - float32x2_t v391 = vmul_f32(v356, v390); - float32x2_t v395 = vmul_f32(v357, v394); - float32x2_t v399 = vmul_f32(v358, v398); - float32x2_t v405 = vrev64_f32(v359); - float32x2_t v412 = vrev64_f32(v360); - float32x2_t v419 = vrev64_f32(v361); - float32x2_t v346 = vadd_f32(v345, v328); - float32x2_t v369 = vmul_f32(v345, v368); - float32x2_t v375 = vrev64_f32(v355); - float32x2_t v406 = vmul_f32(v405, v404); - float32x2_t v413 = vmul_f32(v412, v411); - float32x2_t v420 = vmul_f32(v419, v418); - float32x2_t v353 = vadd_f32(v346, v352); - float32x2_t v376 = vmul_f32(v375, v385); - float32x2_t v421 = vadd_f32(v369, v369); - float32x2_t v434 = vadd_f32(v387, v406); - float32x2_t v436 = vsub_f32(v387, v413); - float32x2_t v438 = vsub_f32(v387, v406); - 
float32x2_t v422 = vadd_f32(v421, v369); - float32x2_t v426 = vadd_f32(v353, v380); - float32x2_t v435 = vadd_f32(v434, v413); - float32x2_t v437 = vadd_f32(v436, v420); - float32x2_t v439 = vsub_f32(v438, v420); - int16x4_t v448 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v353, 15), (int32x2_t){0, 0})); - float32x2_t v423 = vadd_f32(v353, v422); - float32x2_t v427 = vadd_f32(v426, v421); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v448), 0); - float32x2_t v424 = vadd_f32(v423, v376); - float32x2_t v425 = vsub_f32(v423, v376); - float32x2_t v428 = vadd_f32(v427, v391); - float32x2_t v430 = vsub_f32(v427, v395); - float32x2_t v432 = vsub_f32(v427, v391); - float32x2_t v429 = vadd_f32(v428, v395); - float32x2_t v431 = vadd_f32(v430, v399); - float32x2_t v433 = vsub_f32(v432, v399); - int16x4_t v466 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v425, 15), (int32x2_t){0, 0})); - int16x4_t v484 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v424, 15), (int32x2_t){0, 0})); - float32x2_t v440 = vadd_f32(v429, v435); - float32x2_t v441 = vsub_f32(v429, v435); - float32x2_t v442 = vadd_f32(v431, v437); - float32x2_t v443 = vsub_f32(v431, v437); - float32x2_t v444 = vadd_f32(v433, v439); - float32x2_t v445 = vsub_f32(v433, v439); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v466), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v484), 0); - int16x4_t v454 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v441, 15), (int32x2_t){0, 0})); - int16x4_t v460 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v442, 15), (int32x2_t){0, 0})); - int16x4_t v472 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v445, 15), (int32x2_t){0, 0})); - int16x4_t v478 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v444, 15), (int32x2_t){0, 0})); - int16x4_t v490 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v443, 15), (int32x2_t){0, 0})); - int16x4_t v496 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v440, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v454), 0); - v6[ostride * 2] = 
vget_lane_s32(vreinterpret_s32_s16(v460), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v472), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v478), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v490), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v496), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu9(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v116 = -5.0000000000000000e-01F; - float v128 = -1.4999999999999998e+00F; - float v133 = -8.6602540378443871e-01F; - float v140 = 7.6604444311897801e-01F; - float v145 = 9.3969262078590832e-01F; - float v150 = -1.7364817766693039e-01F; - float v155 = -6.4278760968653925e-01F; - float v162 = 3.4202014332566888e-01F; - float v169 = -9.8480775301220802e-01F; - const int32_t *v278 = &v5[v0]; - int32_t *v381 = &v6[v2]; - int64_t v27 = v0 * 8; - int64_t v37 = v0 * 7; - int64_t v45 = v0 * 2; - int64_t v55 = v0 * 3; - int64_t v63 = v0 * 6; - int64_t v73 = v0 * 4; - int64_t v81 = v0 * 5; - float v136 = v4 * v133; - float v158 = v4 * v155; - float v165 = v4 * v162; - float v172 = v4 * v169; - int64_t v217 = v2 * 2; - int64_t v225 = v2 * 3; - int64_t v233 = v2 * 4; - int64_t v241 = v2 * 5; - int64_t v249 = v2 * 6; - int64_t v257 = v2 * 7; - int64_t v265 = v2 * 8; - const int32_t *v351 = &v5[0]; - svfloat32_t v355 = svdup_n_f32(v116); - svfloat32_t v357 = svdup_n_f32(v128); - svfloat32_t v359 = svdup_n_f32(v140); - svfloat32_t v360 = svdup_n_f32(v145); - svfloat32_t v361 = svdup_n_f32(v150); - 
int32_t *v372 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v278[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v287 = &v5[v27]; - const int32_t *v296 = &v5[v37]; - const int32_t *v305 = &v5[v45]; - const int32_t *v314 = &v5[v55]; - const int32_t *v323 = &v5[v63]; - const int32_t *v332 = &v5[v73]; - const int32_t *v341 = &v5[v81]; - svfloat32_t v358 = svdup_n_f32(v136); - svfloat32_t v362 = svdup_n_f32(v158); - svfloat32_t v363 = svdup_n_f32(v165); - svfloat32_t v364 = svdup_n_f32(v172); - int32_t *v390 = &v6[v217]; - int32_t *v399 = &v6[v225]; - int32_t *v408 = &v6[v233]; - int32_t *v417 = &v6[v241]; - int32_t *v426 = &v6[v249]; - int32_t *v435 = &v6[v257]; - int32_t *v444 = &v6[v265]; - svfloat32_t v100 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v351[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v287[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v296[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v305[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v314[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v323[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v332[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t 
*)&v341[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v34), "w"(v52)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v35), "w"(v53)); - svfloat32_t v104; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v34), "w"(v52)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v52), "w"(v88)); - svfloat32_t v106; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v88), "w"(v34)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v35), "w"(v53)); - svfloat32_t v108; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v53), "w"(v89)); - svfloat32_t v109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v89), "w"(v35)); - svfloat32_t zero138; - asm volatile("mov %0.s, #0" : "=w"(zero138)); - svfloat32_t v138 = svcmla_f32_x(pred_full, zero138, v358, v71, 90); - svfloat32_t v91; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v90), "w"(v88)); - svfloat32_t v103; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v102), "w"(v89)); - svfloat32_t zero160; - asm volatile("mov %0.s, #0" : "=w"(zero160)); - svfloat32_t v160 = svcmla_f32_x(pred_full, zero160, v362, v107, 90); - svfloat32_t zero167; - asm volatile("mov %0.s, #0" : "=w"(zero167)); - svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v363, v108, 90); - 
svfloat32_t zero174; - asm volatile("mov %0.s, #0" : "=w"(zero174)); - svfloat32_t v174 = svcmla_f32_x(pred_full, zero174, v364, v109, 90); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v91), "w"(v70)); - svfloat32_t v119; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v91), "w"(v355)); - svfloat32_t zero126; - asm volatile("mov %0.s, #0" : "=w"(zero126)); - svfloat32_t v126 = svcmla_f32_x(pred_full, zero126, v358, v103, 90); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v138), "w"(v160)); - svfloat32_t v190; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v138), "w"(v167)); - svfloat32_t v192; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v138), "w"(v160)); - svfloat32_t v101; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v92), "w"(v100)); - svfloat32_t v175; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v119), "w"(v119)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v167)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v174)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v174)); - svfloat32_t v176 = svmla_f32_x(pred_full, v175, v91, v355); - svfloat32_t v180 = svmla_f32_x(pred_full, v101, v70, v357); - svint16_t v202 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v101, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v101), "w"(v176)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v180), "w"(v175)); - svst1w_u64(pred_full, (unsigned *)(v372), svreinterpret_u64_s16(v202)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v177), "w"(v126)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v177), "w"(v126)); - svfloat32_t v182 = svmla_f32_x(pred_full, v181, v104, v359); 
- svfloat32_t v184 = svmls_f32_x(pred_full, v181, v105, v360); - svfloat32_t v186 = svmls_f32_x(pred_full, v181, v104, v359); - svfloat32_t v183 = svmla_f32_x(pred_full, v182, v105, v360); - svfloat32_t v185 = svmla_f32_x(pred_full, v184, v106, v361); - svfloat32_t v187 = svmls_f32_x(pred_full, v186, v106, v361); - svint16_t v226 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v179, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v250 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v178, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v183), "w"(v189)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v183), "w"(v189)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v185), "w"(v191)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v185), "w"(v191)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v187), "w"(v193)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v187), "w"(v193)); - svst1w_u64(pred_full, (unsigned *)(v399), svreinterpret_u64_s16(v226)); - svst1w_u64(pred_full, (unsigned *)(v426), svreinterpret_u64_s16(v250)); - svint16_t v210 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v195, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v218 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v196, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v234 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, 
svmul_n_f32_x(pred_full, v199, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v242 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v198, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v258 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v197, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v266 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v194, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v381), svreinterpret_u64_s16(v210)); - svst1w_u64(pred_full, (unsigned *)(v390), svreinterpret_u64_s16(v218)); - svst1w_u64(pred_full, (unsigned *)(v408), svreinterpret_u64_s16(v234)); - svst1w_u64(pred_full, (unsigned *)(v417), svreinterpret_u64_s16(v242)); - svst1w_u64(pred_full, (unsigned *)(v435), svreinterpret_u64_s16(v258)); - svst1w_u64(pred_full, (unsigned *)(v444), svreinterpret_u64_s16(v266)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v303 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v181 = -1.2500000000000000e+00F; - float v186 = 5.5901699437494745e-01F; - float v190 = 1.5388417685876268e+00F; - float v191 = -1.5388417685876268e+00F; - float v198 = 5.8778525229247325e-01F; - float v199 = -5.8778525229247325e-01F; - float v206 = 3.6327126400268028e-01F; - float 
v207 = -3.6327126400268028e-01F; - float32x2_t v209 = (float32x2_t){v4, v4}; - const int32_t *v615 = &v5[istride]; - int32_t *v670 = &v6[ostride]; - float32x2_t v182 = (float32x2_t){v181, v181}; - float32x2_t v187 = (float32x2_t){v186, v186}; - float32x2_t v192 = (float32x2_t){v190, v191}; - float32x2_t v200 = (float32x2_t){v198, v199}; - float32x2_t v208 = (float32x2_t){v206, v207}; - const int32_t *v552 = &v5[0]; - int32_t *v643 = &v6[0]; - int16x4_t v742 = vld1_s16((const int16_t *)v615); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v742), 15); - float32x4_t v183 = vcombine_f32(v182, v182); - float32x4_t v188 = vcombine_f32(v187, v187); - float32x2_t v194 = vmul_f32(v209, v192); - float32x2_t v202 = vmul_f32(v209, v200); - float32x2_t v210 = vmul_f32(v209, v208); - const int32_t *v561 = &v5[istride * 5]; - const int32_t *v570 = &v5[istride * 2]; - const int32_t *v579 = &v5[istride * 7]; - const int32_t *v588 = &v5[istride * 4]; - const int32_t *v597 = &v5[istride * 9]; - const int32_t *v606 = &v5[istride * 6]; - const int32_t *v624 = &v5[istride * 8]; - const int32_t *v633 = &v5[istride * 3]; - int32_t *v652 = &v6[ostride * 5]; - int32_t *v661 = &v6[ostride * 6]; - int32_t *v679 = &v6[ostride * 2]; - int32_t *v688 = &v6[ostride * 7]; - int32_t *v697 = &v6[ostride * 8]; - int32_t *v706 = &v6[ostride * 3]; - int32_t *v715 = &v6[ostride * 4]; - int32_t *v724 = &v6[ostride * 9]; - int16x4_t v728 = vld1_s16((const int16_t *)v552); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v728), 15); - float32x4_t v196 = vcombine_f32(v194, v194); - float32x4_t v204 = vcombine_f32(v202, v202); - float32x4_t v212 = vcombine_f32(v210, v210); - int16x4_t v730 = vld1_s16((const int16_t *)v561); - int16x4_t v732 = vld1_s16((const int16_t *)v570); - int16x4_t v734 = vld1_s16((const int16_t *)v579); - int16x4_t v736 = vld1_s16((const int16_t *)v588); - int16x4_t v738 = vld1_s16((const int16_t *)v597); - int16x4_t v740 = vld1_s16((const int16_t *)v606); - int16x4_t v744 = vld1_s16((const 
int16_t *)v624); - int16x4_t v746 = vld1_s16((const int16_t *)v633); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v730), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v732), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v734), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v736), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v738), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v740), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v744), 15); - float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v746), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v109 = vaddq_f32(v100, v108); - float32x4_t v110 = vsubq_f32(v100, v108); - float32x4_t v111 = vaddq_f32(v55, v109); - float32x4_t v112 = vsubq_f32(v55, v109); - float32x4_t v113 = vaddq_f32(v91, v73); - float32x4_t v114 = vsubq_f32(v91, v73); - float32x4_t v167 = vaddq_f32(v56, v110); - float32x4_t v168 = vsubq_f32(v56, v110); - float32x4_t v169 = vaddq_f32(v92, v74); - float32x4_t v170 = vsubq_f32(v92, v74); - float32x4_t v115 = vaddq_f32(v111, v113); - float32x4_t v116 = vsubq_f32(v111, v113); - float32x4_t v117 = vaddq_f32(v112, v114); - float32x4_t v139 = vrev64q_f32(v112); - float32x4_t v155 = vrev64q_f32(v114); - float32x4_t v171 = vaddq_f32(v167, v169); - float32x4_t v172 = vsubq_f32(v167, v169); - float32x4_t v173 = vaddq_f32(v168, v170); - float32x4_t v195 = vrev64q_f32(v168); - float32x4_t v211 = vrev64q_f32(v170); - float32x4_t v118 = vaddq_f32(v115, v37); - float32x4_t v128 = vmulq_f32(v115, v183); - float32x4_t v133 = vmulq_f32(v116, v188); - float32x4_t v141 = vmulq_f32(v139, v196); - float32x4_t v147 = vrev64q_f32(v117); - float32x4_t v157 = vmulq_f32(v155, v212); - float32x4_t v174 = 
vaddq_f32(v171, v38); - float32x4_t v184 = vmulq_f32(v171, v183); - float32x4_t v189 = vmulq_f32(v172, v188); - float32x4_t v197 = vmulq_f32(v195, v196); - float32x4_t v203 = vrev64q_f32(v173); - float32x4_t v213 = vmulq_f32(v211, v212); - float32x4_t v149 = vmulq_f32(v147, v204); - float32x4_t v158 = vaddq_f32(v118, v128); - float32x4_t v205 = vmulq_f32(v203, v204); - float32x4_t v214 = vaddq_f32(v174, v184); - int16x4_t v225 = vqmovn_s32(vcvtq_n_s32_f32(v118, 15)); - int16x4_t v233 = vqmovn_s32(vcvtq_n_s32_f32(v174, 15)); - float32x4_t v159 = vaddq_f32(v158, v133); - float32x4_t v160 = vsubq_f32(v158, v133); - float32x4_t v161 = vsubq_f32(v141, v149); - float32x4_t v162 = vaddq_f32(v149, v157); - float32x4_t v215 = vaddq_f32(v214, v189); - float32x4_t v216 = vsubq_f32(v214, v189); - float32x4_t v217 = vsubq_f32(v197, v205); - float32x4_t v218 = vaddq_f32(v205, v213); - vst1_s16((int16_t *)v643, v225); - vst1_s16((int16_t *)v652, v233); - float32x4_t v163 = vaddq_f32(v159, v161); - float32x4_t v164 = vsubq_f32(v159, v161); - float32x4_t v165 = vaddq_f32(v160, v162); - float32x4_t v166 = vsubq_f32(v160, v162); - float32x4_t v219 = vaddq_f32(v215, v217); - float32x4_t v220 = vsubq_f32(v215, v217); - float32x4_t v221 = vaddq_f32(v216, v218); - float32x4_t v222 = vsubq_f32(v216, v218); - int16x4_t v241 = vqmovn_s32(vcvtq_n_s32_f32(v164, 15)); - int16x4_t v249 = vqmovn_s32(vcvtq_n_s32_f32(v220, 15)); - int16x4_t v257 = vqmovn_s32(vcvtq_n_s32_f32(v166, 15)); - int16x4_t v265 = vqmovn_s32(vcvtq_n_s32_f32(v222, 15)); - int16x4_t v273 = vqmovn_s32(vcvtq_n_s32_f32(v165, 15)); - int16x4_t v281 = vqmovn_s32(vcvtq_n_s32_f32(v221, 15)); - int16x4_t v289 = vqmovn_s32(vcvtq_n_s32_f32(v163, 15)); - int16x4_t v297 = vqmovn_s32(vcvtq_n_s32_f32(v219, 15)); - vst1_s16((int16_t *)v661, v241); - vst1_s16((int16_t *)v670, v249); - vst1_s16((int16_t *)v679, v257); - vst1_s16((int16_t *)v688, v265); - vst1_s16((int16_t *)v697, v273); - vst1_s16((int16_t *)v706, v281); - vst1_s16((int16_t 
*)v715, v289); - vst1_s16((int16_t *)v724, v297); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v303 * 2; j < howmany; j += 1) { - int16x4_t v363 = vld1s_s16(&v5[istride]); - float v444 = -1.2500000000000000e+00F; - float v448 = 5.5901699437494745e-01F; - float v451 = 1.5388417685876268e+00F; - float v452 = -1.5388417685876268e+00F; - float v458 = 5.8778525229247325e-01F; - float v459 = -5.8778525229247325e-01F; - float v465 = 3.6327126400268028e-01F; - float v466 = -3.6327126400268028e-01F; - float32x2_t v468 = (float32x2_t){v4, v4}; - int16x4_t v315 = vld1s_s16(&v5[0]); - float32x2_t v364 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v363)), 15); - float32x2_t v445 = (float32x2_t){v444, v444}; - float32x2_t v449 = (float32x2_t){v448, v448}; - float32x2_t v453 = (float32x2_t){v451, v452}; - float32x2_t v460 = (float32x2_t){v458, v459}; - float32x2_t v467 = (float32x2_t){v465, v466}; - float32x2_t v316 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v315)), 15); - int16x4_t v321 = vld1s_s16(&v5[istride * 5]); - int16x4_t v329 = vld1s_s16(&v5[istride * 2]); - int16x4_t v335 = vld1s_s16(&v5[istride * 7]); - int16x4_t v343 = vld1s_s16(&v5[istride * 4]); - int16x4_t v349 = vld1s_s16(&v5[istride * 9]); - int16x4_t v357 = vld1s_s16(&v5[istride * 6]); - int16x4_t v371 = vld1s_s16(&v5[istride * 8]); - int16x4_t v377 = vld1s_s16(&v5[istride * 3]); - float32x2_t v455 = vmul_f32(v468, v453); - float32x2_t v462 = vmul_f32(v468, v460); - float32x2_t v469 = vmul_f32(v468, v467); - float32x2_t v322 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v321)), 15); - float32x2_t v330 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v329)), 15); - float32x2_t v336 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v335)), 15); - float32x2_t v344 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v343)), 15); - float32x2_t v350 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v349)), 15); - float32x2_t v358 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v357)), 15); - float32x2_t v372 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v371)), 15); - float32x2_t 
v378 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v377)), 15); - float32x2_t v323 = vadd_f32(v316, v322); - float32x2_t v324 = vsub_f32(v316, v322); - float32x2_t v337 = vadd_f32(v330, v336); - float32x2_t v338 = vsub_f32(v330, v336); - float32x2_t v351 = vadd_f32(v344, v350); - float32x2_t v352 = vsub_f32(v344, v350); - float32x2_t v365 = vadd_f32(v358, v364); - float32x2_t v366 = vsub_f32(v358, v364); - float32x2_t v379 = vadd_f32(v372, v378); - float32x2_t v380 = vsub_f32(v372, v378); - float32x2_t v381 = vadd_f32(v337, v379); - float32x2_t v382 = vsub_f32(v337, v379); - float32x2_t v383 = vadd_f32(v365, v351); - float32x2_t v384 = vsub_f32(v365, v351); - float32x2_t v431 = vadd_f32(v338, v380); - float32x2_t v432 = vsub_f32(v338, v380); - float32x2_t v433 = vadd_f32(v366, v352); - float32x2_t v434 = vsub_f32(v366, v352); - float32x2_t v385 = vadd_f32(v381, v383); - float32x2_t v386 = vsub_f32(v381, v383); - float32x2_t v387 = vadd_f32(v382, v384); - float32x2_t v406 = vrev64_f32(v382); - float32x2_t v420 = vrev64_f32(v384); - float32x2_t v435 = vadd_f32(v431, v433); - float32x2_t v436 = vsub_f32(v431, v433); - float32x2_t v437 = vadd_f32(v432, v434); - float32x2_t v456 = vrev64_f32(v432); - float32x2_t v470 = vrev64_f32(v434); - float32x2_t v388 = vadd_f32(v385, v323); - float32x2_t v396 = vmul_f32(v385, v445); - float32x2_t v400 = vmul_f32(v386, v449); - float32x2_t v407 = vmul_f32(v406, v455); - float32x2_t v413 = vrev64_f32(v387); - float32x2_t v421 = vmul_f32(v420, v469); - float32x2_t v438 = vadd_f32(v435, v324); - float32x2_t v446 = vmul_f32(v435, v445); - float32x2_t v450 = vmul_f32(v436, v449); - float32x2_t v457 = vmul_f32(v456, v455); - float32x2_t v463 = vrev64_f32(v437); - float32x2_t v471 = vmul_f32(v470, v469); - float32x2_t v414 = vmul_f32(v413, v462); - float32x2_t v422 = vadd_f32(v388, v396); - float32x2_t v464 = vmul_f32(v463, v462); - float32x2_t v472 = vadd_f32(v438, v446); - int16x4_t v483 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v388, 15), 
(int32x2_t){0, 0})); - int16x4_t v489 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v438, 15), (int32x2_t){0, 0})); - float32x2_t v423 = vadd_f32(v422, v400); - float32x2_t v424 = vsub_f32(v422, v400); - float32x2_t v425 = vsub_f32(v407, v414); - float32x2_t v426 = vadd_f32(v414, v421); - float32x2_t v473 = vadd_f32(v472, v450); - float32x2_t v474 = vsub_f32(v472, v450); - float32x2_t v475 = vsub_f32(v457, v464); - float32x2_t v476 = vadd_f32(v464, v471); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v483), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v489), 0); - float32x2_t v427 = vadd_f32(v423, v425); - float32x2_t v428 = vsub_f32(v423, v425); - float32x2_t v429 = vadd_f32(v424, v426); - float32x2_t v430 = vsub_f32(v424, v426); - float32x2_t v477 = vadd_f32(v473, v475); - float32x2_t v478 = vsub_f32(v473, v475); - float32x2_t v479 = vadd_f32(v474, v476); - float32x2_t v480 = vsub_f32(v474, v476); - int16x4_t v495 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v428, 15), (int32x2_t){0, 0})); - int16x4_t v501 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v478, 15), (int32x2_t){0, 0})); - int16x4_t v507 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v430, 15), (int32x2_t){0, 0})); - int16x4_t v513 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v480, 15), (int32x2_t){0, 0})); - int16x4_t v519 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v429, 15), (int32x2_t){0, 0})); - int16x4_t v525 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v479, 15), (int32x2_t){0, 0})); - int16x4_t v531 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v427, 15), (int32x2_t){0, 0})); - int16x4_t v537 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v477, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v495), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v501), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v507), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v513), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v519), 0); - v6[ostride * 
3] = vget_lane_s32(vreinterpret_s32_s16(v525), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v531), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v537), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu10(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v175 = -1.2500000000000000e+00F; - float v180 = 5.5901699437494745e-01F; - float v185 = -1.5388417685876268e+00F; - float v192 = -5.8778525229247325e-01F; - float v199 = -3.6327126400268028e-01F; - const int32_t *v364 = &v5[v0]; - int32_t *v431 = &v6[v2]; - int64_t v27 = v0 * 5; - int64_t v37 = v0 * 2; - int64_t v45 = v0 * 7; - int64_t v55 = v0 * 4; - int64_t v63 = v0 * 9; - int64_t v73 = v0 * 6; - int64_t v91 = v0 * 8; - int64_t v99 = v0 * 3; - float v188 = v4 * v185; - float v195 = v4 * v192; - float v202 = v4 * v199; - int64_t v223 = v2 * 5; - int64_t v231 = v2 * 6; - int64_t v247 = v2 * 2; - int64_t v255 = v2 * 7; - int64_t v263 = v2 * 8; - int64_t v271 = v2 * 3; - int64_t v279 = v2 * 4; - int64_t v287 = v2 * 9; - const int32_t *v301 = &v5[0]; - svfloat32_t v392 = svdup_n_f32(v175); - svfloat32_t v393 = svdup_n_f32(v180); - int32_t *v404 = &v6[0]; - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v364[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v310 = &v5[v27]; - const int32_t *v319 = &v5[v37]; - const int32_t *v328 = &v5[v45]; - const int32_t *v337 = &v5[v55]; - const int32_t *v346 = &v5[v63]; - const int32_t *v355 = &v5[v73]; - 
const int32_t *v373 = &v5[v91]; - const int32_t *v382 = &v5[v99]; - svfloat32_t v394 = svdup_n_f32(v188); - svfloat32_t v395 = svdup_n_f32(v195); - svfloat32_t v396 = svdup_n_f32(v202); - int32_t *v413 = &v6[v223]; - int32_t *v422 = &v6[v231]; - int32_t *v440 = &v6[v247]; - int32_t *v449 = &v6[v255]; - int32_t *v458 = &v6[v263]; - int32_t *v467 = &v6[v271]; - int32_t *v476 = &v6[v279]; - int32_t *v485 = &v6[v287]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v301[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v310[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v319[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v328[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v337[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v346[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v355[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v373[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v105 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v382[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); 
- svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v108; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v108) : "w"(v52), "w"(v106)); - svfloat32_t v109; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v52), "w"(v106)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v88), "w"(v70)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v88), "w"(v70)); - svfloat32_t v161; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v53), "w"(v107)); - svfloat32_t v162; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v53), "w"(v107)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v89), "w"(v71)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v89), "w"(v71)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v108), "w"(v110)); - svfloat32_t v113; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v108), "w"(v110)); - svfloat32_t v114; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v114) : "w"(v109), "w"(v111)); - svfloat32_t zero137; - asm volatile("mov %0.s, #0" : "=w"(zero137)); - svfloat32_t v137 = svcmla_f32_x(pred_full, zero137, v394, v109, 90); - svfloat32_t v165; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v161), "w"(v163)); - svfloat32_t v166; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v161), "w"(v163)); - svfloat32_t v167; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v167) : "w"(v162), "w"(v164)); - svfloat32_t zero190; - asm volatile("mov %0.s, #0" : "=w"(zero190)); - svfloat32_t v190 = svcmla_f32_x(pred_full, zero190, v394, v162, 90); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v112), "w"(v34)); - svfloat32_t zero144; - asm volatile("mov %0.s, #0" : "=w"(zero144)); - svfloat32_t v144 = svcmla_f32_x(pred_full, zero144, v395, v114, 90); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v165), "w"(v35)); - svfloat32_t zero197; - asm volatile("mov %0.s, #0" : "=w"(zero197)); - svfloat32_t v197 = svcmla_f32_x(pred_full, zero197, v395, v167, 90); - svfloat32_t v152 = svmla_f32_x(pred_full, v115, v112, v392); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v137), "w"(v144)); - svfloat32_t v156 = svcmla_f32_x(pred_full, v144, v396, v111, 90); - svfloat32_t v205 = svmla_f32_x(pred_full, v168, v165, v392); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v190), "w"(v197)); - svfloat32_t v209 = svcmla_f32_x(pred_full, v197, v396, v164, 90); - svint16_t v216 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v115, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v224 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v168, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v153 = svmla_f32_x(pred_full, v152, v113, v393); - svfloat32_t v154 = svmls_f32_x(pred_full, v152, v113, v393); - svfloat32_t v206 = svmla_f32_x(pred_full, v205, v166, v393); - svfloat32_t v207 = svmls_f32_x(pred_full, v205, v166, v393); - svst1w_u64(pred_full, (unsigned *)(v404), svreinterpret_u64_s16(v216)); - svst1w_u64(pred_full, (unsigned *)(v413), svreinterpret_u64_s16(v224)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v157) : "w"(v153), "w"(v155)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v153), "w"(v155)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v154), "w"(v156)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v206), "w"(v208)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v206), "w"(v208)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v207), "w"(v209)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v207), "w"(v209)); - svint16_t v232 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v240 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v211, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v248 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v160, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v256 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v213, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v264 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v159, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v272 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v212, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v280 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v288 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v210, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v422), svreinterpret_u64_s16(v232)); - svst1w_u64(pred_full, (unsigned *)(v431), svreinterpret_u64_s16(v240)); - svst1w_u64(pred_full, (unsigned *)(v440), svreinterpret_u64_s16(v248)); - svst1w_u64(pred_full, (unsigned *)(v449), svreinterpret_u64_s16(v256)); - svst1w_u64(pred_full, (unsigned *)(v458), svreinterpret_u64_s16(v264)); - svst1w_u64(pred_full, (unsigned *)(v467), svreinterpret_u64_s16(v272)); - svst1w_u64(pred_full, (unsigned *)(v476), svreinterpret_u64_s16(v280)); - svst1w_u64(pred_full, (unsigned *)(v485), svreinterpret_u64_s16(v288)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v416 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v152 = 1.1000000000000001e+00F; - float v156 = 3.3166247903554003e-01F; - float v157 = -3.3166247903554003e-01F; - float v165 = 5.1541501300188641e-01F; - float v170 = 9.4125353283118118e-01F; - float v175 = 1.4143537075597825e+00F; - float v180 = 8.5949297361449750e-01F; - float v185 = 4.2314838273285138e-02F; - float v190 = 3.8639279888589606e-01F; - float v195 = 5.1254589567200015e-01F; - float v200 = 1.0702757469471715e+00F; - float v205 = 
5.5486073394528512e-01F; - float v209 = 1.2412944743900585e+00F; - float v210 = -1.2412944743900585e+00F; - float v217 = 2.0897833842005756e-01F; - float v218 = -2.0897833842005756e-01F; - float v225 = 3.7415717312460811e-01F; - float v226 = -3.7415717312460811e-01F; - float v233 = 4.9929922194110327e-02F; - float v234 = -4.9929922194110327e-02F; - float v241 = 6.5815896284539266e-01F; - float v242 = -6.5815896284539266e-01F; - float v249 = 6.3306543373877577e-01F; - float v250 = -6.3306543373877577e-01F; - float v257 = 1.0822460581641109e+00F; - float v258 = -1.0822460581641109e+00F; - float v265 = 8.1720737907134022e-01F; - float v266 = -8.1720737907134022e-01F; - float v273 = 4.2408709531871824e-01F; - float v274 = -4.2408709531871824e-01F; - float32x2_t v276 = (float32x2_t){v4, v4}; - const int32_t *v764 = &v5[istride]; - int32_t *v955 = &v6[ostride]; - float32x2_t v153 = (float32x2_t){v152, v152}; - float32x2_t v158 = (float32x2_t){v156, v157}; - float32x2_t v166 = (float32x2_t){v165, v165}; - float32x2_t v171 = (float32x2_t){v170, v170}; - float32x2_t v176 = (float32x2_t){v175, v175}; - float32x2_t v181 = (float32x2_t){v180, v180}; - float32x2_t v186 = (float32x2_t){v185, v185}; - float32x2_t v191 = (float32x2_t){v190, v190}; - float32x2_t v196 = (float32x2_t){v195, v195}; - float32x2_t v201 = (float32x2_t){v200, v200}; - float32x2_t v206 = (float32x2_t){v205, v205}; - float32x2_t v211 = (float32x2_t){v209, v210}; - float32x2_t v219 = (float32x2_t){v217, v218}; - float32x2_t v227 = (float32x2_t){v225, v226}; - float32x2_t v235 = (float32x2_t){v233, v234}; - float32x2_t v243 = (float32x2_t){v241, v242}; - float32x2_t v251 = (float32x2_t){v249, v250}; - float32x2_t v259 = (float32x2_t){v257, v258}; - float32x2_t v267 = (float32x2_t){v265, v266}; - float32x2_t v275 = (float32x2_t){v273, v274}; - const int32_t *v855 = &v5[0]; - int32_t *v865 = &v6[0]; - int16x4_t v959 = vld1_s16((const int16_t *)v764); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v959), 15); - 
float32x4_t v154 = vcombine_f32(v153, v153); - float32x2_t v160 = vmul_f32(v276, v158); - float32x4_t v167 = vcombine_f32(v166, v166); - float32x4_t v172 = vcombine_f32(v171, v171); - float32x4_t v177 = vcombine_f32(v176, v176); - float32x4_t v182 = vcombine_f32(v181, v181); - float32x4_t v187 = vcombine_f32(v186, v186); - float32x4_t v192 = vcombine_f32(v191, v191); - float32x4_t v197 = vcombine_f32(v196, v196); - float32x4_t v202 = vcombine_f32(v201, v201); - float32x4_t v207 = vcombine_f32(v206, v206); - float32x2_t v213 = vmul_f32(v276, v211); - float32x2_t v221 = vmul_f32(v276, v219); - float32x2_t v229 = vmul_f32(v276, v227); - float32x2_t v237 = vmul_f32(v276, v235); - float32x2_t v245 = vmul_f32(v276, v243); - float32x2_t v253 = vmul_f32(v276, v251); - float32x2_t v261 = vmul_f32(v276, v259); - float32x2_t v269 = vmul_f32(v276, v267); - float32x2_t v277 = vmul_f32(v276, v275); - const int32_t *v773 = &v5[istride * 10]; - const int32_t *v782 = &v5[istride * 2]; - const int32_t *v791 = &v5[istride * 9]; - const int32_t *v800 = &v5[istride * 3]; - const int32_t *v809 = &v5[istride * 8]; - const int32_t *v818 = &v5[istride * 4]; - const int32_t *v827 = &v5[istride * 7]; - const int32_t *v836 = &v5[istride * 5]; - const int32_t *v845 = &v5[istride * 6]; - int32_t *v874 = &v6[ostride * 10]; - int32_t *v883 = &v6[ostride * 9]; - int32_t *v892 = &v6[ostride * 8]; - int32_t *v901 = &v6[ostride * 7]; - int32_t *v910 = &v6[ostride * 6]; - int32_t *v919 = &v6[ostride * 5]; - int32_t *v928 = &v6[ostride * 4]; - int32_t *v937 = &v6[ostride * 3]; - int32_t *v946 = &v6[ostride * 2]; - int16x4_t v979 = vld1_s16((const int16_t *)v855); - float32x4_t v124 = vcvtq_n_f32_s32(vmovl_s16(v979), 15); - float32x4_t v162 = vcombine_f32(v160, v160); - float32x4_t v215 = vcombine_f32(v213, v213); - float32x4_t v223 = vcombine_f32(v221, v221); - float32x4_t v231 = vcombine_f32(v229, v229); - float32x4_t v239 = vcombine_f32(v237, v237); - float32x4_t v247 = vcombine_f32(v245, v245); - 
float32x4_t v255 = vcombine_f32(v253, v253); - float32x4_t v263 = vcombine_f32(v261, v261); - float32x4_t v271 = vcombine_f32(v269, v269); - float32x4_t v279 = vcombine_f32(v277, v277); - int16x4_t v961 = vld1_s16((const int16_t *)v773); - int16x4_t v963 = vld1_s16((const int16_t *)v782); - int16x4_t v965 = vld1_s16((const int16_t *)v791); - int16x4_t v967 = vld1_s16((const int16_t *)v800); - int16x4_t v969 = vld1_s16((const int16_t *)v809); - int16x4_t v971 = vld1_s16((const int16_t *)v818); - int16x4_t v973 = vld1_s16((const int16_t *)v827); - int16x4_t v975 = vld1_s16((const int16_t *)v836); - int16x4_t v977 = vld1_s16((const int16_t *)v845); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v961), 15); - float32x4_t v45 = vcvtq_n_f32_s32(vmovl_s16(v963), 15); - float32x4_t v53 = vcvtq_n_f32_s32(vmovl_s16(v965), 15); - float32x4_t v62 = vcvtq_n_f32_s32(vmovl_s16(v967), 15); - float32x4_t v70 = vcvtq_n_f32_s32(vmovl_s16(v969), 15); - float32x4_t v79 = vcvtq_n_f32_s32(vmovl_s16(v971), 15); - float32x4_t v87 = vcvtq_n_f32_s32(vmovl_s16(v973), 15); - float32x4_t v96 = vcvtq_n_f32_s32(vmovl_s16(v975), 15); - float32x4_t v104 = vcvtq_n_f32_s32(vmovl_s16(v977), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v54 = vaddq_f32(v45, v53); - float32x4_t v71 = vaddq_f32(v62, v70); - float32x4_t v88 = vaddq_f32(v79, v87); - float32x4_t v105 = vaddq_f32(v96, v104); - float32x4_t v106 = vsubq_f32(v28, v36); - float32x4_t v107 = vsubq_f32(v45, v53); - float32x4_t v108 = vsubq_f32(v62, v70); - float32x4_t v109 = vsubq_f32(v79, v87); - float32x4_t v110 = vsubq_f32(v96, v104); - float32x4_t v111 = vaddq_f32(v37, v54); - float32x4_t v112 = vaddq_f32(v71, v105); - float32x4_t v114 = vsubq_f32(v107, v108); - float32x4_t v115 = vaddq_f32(v106, v110); - float32x4_t v128 = vsubq_f32(v54, v88); - float32x4_t v129 = vsubq_f32(v37, v88); - float32x4_t v130 = vsubq_f32(v54, v37); - float32x4_t v131 = vsubq_f32(v105, v88); - float32x4_t v132 = vsubq_f32(v71, v88); - float32x4_t v133 = 
vsubq_f32(v105, v71); - float32x4_t v134 = vsubq_f32(v54, v105); - float32x4_t v135 = vsubq_f32(v37, v71); - float32x4_t v137 = vaddq_f32(v107, v109); - float32x4_t v138 = vsubq_f32(v106, v109); - float32x4_t v139 = vaddq_f32(v106, v107); - float32x4_t v140 = vsubq_f32(v109, v110); - float32x4_t v141 = vsubq_f32(v108, v109); - float32x4_t v142 = vsubq_f32(v108, v110); - float32x4_t v143 = vaddq_f32(v107, v110); - float32x4_t v144 = vsubq_f32(v106, v108); - float32x4_t v113 = vaddq_f32(v88, v111); - float32x4_t v126 = vsubq_f32(v114, v115); - float32x4_t v136 = vsubq_f32(v112, v111); - float32x4_t v145 = vaddq_f32(v114, v115); - float32x4_t v168 = vmulq_f32(v128, v167); - float32x4_t v173 = vmulq_f32(v129, v172); - float32x4_t v178 = vmulq_f32(v130, v177); - float32x4_t v183 = vmulq_f32(v131, v182); - float32x4_t v188 = vmulq_f32(v132, v187); - float32x4_t v193 = vmulq_f32(v133, v192); - float32x4_t v198 = vmulq_f32(v134, v197); - float32x4_t v203 = vmulq_f32(v135, v202); - float32x4_t v214 = vrev64q_f32(v137); - float32x4_t v222 = vrev64q_f32(v138); - float32x4_t v230 = vrev64q_f32(v139); - float32x4_t v238 = vrev64q_f32(v140); - float32x4_t v246 = vrev64q_f32(v141); - float32x4_t v254 = vrev64q_f32(v142); - float32x4_t v262 = vrev64q_f32(v143); - float32x4_t v270 = vrev64q_f32(v144); - float32x4_t v116 = vaddq_f32(v113, v112); - float32x4_t v127 = vsubq_f32(v126, v109); - float32x4_t v208 = vmulq_f32(v136, v207); - float32x4_t v216 = vmulq_f32(v214, v215); - float32x4_t v224 = vmulq_f32(v222, v223); - float32x4_t v232 = vmulq_f32(v230, v231); - float32x4_t v240 = vmulq_f32(v238, v239); - float32x4_t v248 = vmulq_f32(v246, v247); - float32x4_t v256 = vmulq_f32(v254, v255); - float32x4_t v264 = vmulq_f32(v262, v263); - float32x4_t v272 = vmulq_f32(v270, v271); - float32x4_t v278 = vrev64q_f32(v145); - float32x4_t v282 = vaddq_f32(v168, v173); - float32x4_t v283 = vaddq_f32(v173, v178); - float32x4_t v284 = vsubq_f32(v168, v178); - float32x4_t v285 = vaddq_f32(v183, 
v188); - float32x4_t v286 = vaddq_f32(v188, v193); - float32x4_t v287 = vsubq_f32(v183, v193); - float32x4_t v125 = vaddq_f32(v124, v116); - float32x4_t v155 = vmulq_f32(v116, v154); - float32x4_t v161 = vrev64q_f32(v127); - float32x4_t v280 = vmulq_f32(v278, v279); - float32x4_t v288 = vaddq_f32(v203, v208); - float32x4_t v289 = vaddq_f32(v198, v208); - float32x4_t v290 = vaddq_f32(v224, v232); - float32x4_t v291 = vsubq_f32(v216, v232); - float32x4_t v292 = vaddq_f32(v248, v256); - float32x4_t v293 = vsubq_f32(v240, v256); - float32x4_t v163 = vmulq_f32(v161, v162); - float32x4_t v281 = vsubq_f32(v125, v155); - float32x4_t v294 = vaddq_f32(v272, v280); - float32x4_t v295 = vsubq_f32(v264, v280); - float32x4_t v296 = vaddq_f32(v286, v288); - float32x4_t v314 = vaddq_f32(v290, v291); - int16x4_t v330 = vqmovn_s32(vcvtq_n_s32_f32(v125, 15)); - float32x4_t v297 = vaddq_f32(v296, v281); - float32x4_t v298 = vsubq_f32(v281, v283); - float32x4_t v300 = vaddq_f32(v281, v287); - float32x4_t v302 = vsubq_f32(v281, v284); - float32x4_t v304 = vaddq_f32(v281, v282); - float32x4_t v306 = vaddq_f32(v163, v292); - float32x4_t v308 = vsubq_f32(v294, v290); - float32x4_t v310 = vaddq_f32(v163, v295); - float32x4_t v312 = vsubq_f32(v295, v291); - float32x4_t v315 = vaddq_f32(v314, v292); - vst1_s16((int16_t *)v865, v330); - float32x4_t v299 = vsubq_f32(v298, v288); - float32x4_t v301 = vaddq_f32(v300, v289); - float32x4_t v303 = vsubq_f32(v302, v289); - float32x4_t v305 = vsubq_f32(v304, v285); - float32x4_t v307 = vaddq_f32(v306, v294); - float32x4_t v309 = vsubq_f32(v308, v163); - float32x4_t v311 = vaddq_f32(v310, v293); - float32x4_t v313 = vsubq_f32(v312, v163); - float32x4_t v316 = vaddq_f32(v315, v293); - float32x4_t v317 = vsubq_f32(v316, v163); - float32x4_t v319 = vaddq_f32(v297, v307); - float32x4_t v320 = vaddq_f32(v299, v309); - float32x4_t v321 = vsubq_f32(v301, v311); - float32x4_t v322 = vaddq_f32(v303, v313); - float32x4_t v323 = vsubq_f32(v303, v313); - 
float32x4_t v324 = vaddq_f32(v301, v311); - float32x4_t v325 = vsubq_f32(v299, v309); - float32x4_t v326 = vsubq_f32(v297, v307); - float32x4_t v318 = vaddq_f32(v305, v317); - float32x4_t v327 = vsubq_f32(v305, v317); - int16x4_t v346 = vqmovn_s32(vcvtq_n_s32_f32(v319, 15)); - int16x4_t v354 = vqmovn_s32(vcvtq_n_s32_f32(v320, 15)); - int16x4_t v362 = vqmovn_s32(vcvtq_n_s32_f32(v321, 15)); - int16x4_t v370 = vqmovn_s32(vcvtq_n_s32_f32(v322, 15)); - int16x4_t v378 = vqmovn_s32(vcvtq_n_s32_f32(v323, 15)); - int16x4_t v386 = vqmovn_s32(vcvtq_n_s32_f32(v324, 15)); - int16x4_t v394 = vqmovn_s32(vcvtq_n_s32_f32(v325, 15)); - int16x4_t v402 = vqmovn_s32(vcvtq_n_s32_f32(v326, 15)); - int16x4_t v338 = vqmovn_s32(vcvtq_n_s32_f32(v318, 15)); - int16x4_t v410 = vqmovn_s32(vcvtq_n_s32_f32(v327, 15)); - vst1_s16((int16_t *)v883, v346); - vst1_s16((int16_t *)v892, v354); - vst1_s16((int16_t *)v901, v362); - vst1_s16((int16_t *)v910, v370); - vst1_s16((int16_t *)v919, v378); - vst1_s16((int16_t *)v928, v386); - vst1_s16((int16_t *)v937, v394); - vst1_s16((int16_t *)v946, v402); - vst1_s16((int16_t *)v874, v338); - vst1_s16((int16_t *)v955, v410); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v416 * 2; j < howmany; j += 1) { - int16x4_t v428 = vld1s_s16(&v5[istride]); - float v532 = 1.1000000000000001e+00F; - float v535 = 3.3166247903554003e-01F; - float v536 = -3.3166247903554003e-01F; - float v543 = 5.1541501300188641e-01F; - float v547 = 9.4125353283118118e-01F; - float v551 = 1.4143537075597825e+00F; - float v555 = 8.5949297361449750e-01F; - float v559 = 4.2314838273285138e-02F; - float v563 = 3.8639279888589606e-01F; - float v567 = 5.1254589567200015e-01F; - float v571 = 1.0702757469471715e+00F; - float v575 = 5.5486073394528512e-01F; - float v578 = 1.2412944743900585e+00F; - float v579 = -1.2412944743900585e+00F; - float v585 = 2.0897833842005756e-01F; - float v586 = -2.0897833842005756e-01F; - float v592 = 3.7415717312460811e-01F; - float v593 = -3.7415717312460811e-01F; - 
float v599 = 4.9929922194110327e-02F; - float v600 = -4.9929922194110327e-02F; - float v606 = 6.5815896284539266e-01F; - float v607 = -6.5815896284539266e-01F; - float v613 = 6.3306543373877577e-01F; - float v614 = -6.3306543373877577e-01F; - float v620 = 1.0822460581641109e+00F; - float v621 = -1.0822460581641109e+00F; - float v627 = 8.1720737907134022e-01F; - float v628 = -8.1720737907134022e-01F; - float v634 = 4.2408709531871824e-01F; - float v635 = -4.2408709531871824e-01F; - float32x2_t v637 = (float32x2_t){v4, v4}; - float32x2_t v429 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v428)), 15); - int16x4_t v504 = vld1s_s16(&v5[0]); - float32x2_t v533 = (float32x2_t){v532, v532}; - float32x2_t v537 = (float32x2_t){v535, v536}; - float32x2_t v544 = (float32x2_t){v543, v543}; - float32x2_t v548 = (float32x2_t){v547, v547}; - float32x2_t v552 = (float32x2_t){v551, v551}; - float32x2_t v556 = (float32x2_t){v555, v555}; - float32x2_t v560 = (float32x2_t){v559, v559}; - float32x2_t v564 = (float32x2_t){v563, v563}; - float32x2_t v568 = (float32x2_t){v567, v567}; - float32x2_t v572 = (float32x2_t){v571, v571}; - float32x2_t v576 = (float32x2_t){v575, v575}; - float32x2_t v580 = (float32x2_t){v578, v579}; - float32x2_t v587 = (float32x2_t){v585, v586}; - float32x2_t v594 = (float32x2_t){v592, v593}; - float32x2_t v601 = (float32x2_t){v599, v600}; - float32x2_t v608 = (float32x2_t){v606, v607}; - float32x2_t v615 = (float32x2_t){v613, v614}; - float32x2_t v622 = (float32x2_t){v620, v621}; - float32x2_t v629 = (float32x2_t){v627, v628}; - float32x2_t v636 = (float32x2_t){v634, v635}; - int16x4_t v434 = vld1s_s16(&v5[istride * 10]); - int16x4_t v441 = vld1s_s16(&v5[istride * 2]); - int16x4_t v447 = vld1s_s16(&v5[istride * 9]); - int16x4_t v454 = vld1s_s16(&v5[istride * 3]); - int16x4_t v460 = vld1s_s16(&v5[istride * 8]); - int16x4_t v467 = vld1s_s16(&v5[istride * 4]); - int16x4_t v473 = vld1s_s16(&v5[istride * 7]); - int16x4_t v480 = vld1s_s16(&v5[istride * 5]); - int16x4_t v486 
= vld1s_s16(&v5[istride * 6]); - float32x2_t v505 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v504)), 15); - float32x2_t v539 = vmul_f32(v637, v537); - float32x2_t v582 = vmul_f32(v637, v580); - float32x2_t v589 = vmul_f32(v637, v587); - float32x2_t v596 = vmul_f32(v637, v594); - float32x2_t v603 = vmul_f32(v637, v601); - float32x2_t v610 = vmul_f32(v637, v608); - float32x2_t v617 = vmul_f32(v637, v615); - float32x2_t v624 = vmul_f32(v637, v622); - float32x2_t v631 = vmul_f32(v637, v629); - float32x2_t v638 = vmul_f32(v637, v636); - float32x2_t v435 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v434)), 15); - float32x2_t v442 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v441)), 15); - float32x2_t v448 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v447)), 15); - float32x2_t v455 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v454)), 15); - float32x2_t v461 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v460)), 15); - float32x2_t v468 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v467)), 15); - float32x2_t v474 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v473)), 15); - float32x2_t v481 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v480)), 15); - float32x2_t v487 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v486)), 15); - float32x2_t v436 = vadd_f32(v429, v435); - float32x2_t v449 = vadd_f32(v442, v448); - float32x2_t v462 = vadd_f32(v455, v461); - float32x2_t v475 = vadd_f32(v468, v474); - float32x2_t v488 = vadd_f32(v481, v487); - float32x2_t v489 = vsub_f32(v429, v435); - float32x2_t v490 = vsub_f32(v442, v448); - float32x2_t v491 = vsub_f32(v455, v461); - float32x2_t v492 = vsub_f32(v468, v474); - float32x2_t v493 = vsub_f32(v481, v487); - float32x2_t v494 = vadd_f32(v436, v449); - float32x2_t v495 = vadd_f32(v462, v488); - float32x2_t v497 = vsub_f32(v490, v491); - float32x2_t v498 = vadd_f32(v489, v493); - float32x2_t v509 = vsub_f32(v449, v475); - float32x2_t v510 = vsub_f32(v436, v475); - float32x2_t v511 = vsub_f32(v449, v436); - float32x2_t v512 = vsub_f32(v488, v475); - float32x2_t v513 = vsub_f32(v462, v475); - 
float32x2_t v514 = vsub_f32(v488, v462); - float32x2_t v515 = vsub_f32(v449, v488); - float32x2_t v516 = vsub_f32(v436, v462); - float32x2_t v518 = vadd_f32(v490, v492); - float32x2_t v519 = vsub_f32(v489, v492); - float32x2_t v520 = vadd_f32(v489, v490); - float32x2_t v521 = vsub_f32(v492, v493); - float32x2_t v522 = vsub_f32(v491, v492); - float32x2_t v523 = vsub_f32(v491, v493); - float32x2_t v524 = vadd_f32(v490, v493); - float32x2_t v525 = vsub_f32(v489, v491); - float32x2_t v496 = vadd_f32(v475, v494); - float32x2_t v507 = vsub_f32(v497, v498); - float32x2_t v517 = vsub_f32(v495, v494); - float32x2_t v526 = vadd_f32(v497, v498); - float32x2_t v545 = vmul_f32(v509, v544); - float32x2_t v549 = vmul_f32(v510, v548); - float32x2_t v553 = vmul_f32(v511, v552); - float32x2_t v557 = vmul_f32(v512, v556); - float32x2_t v561 = vmul_f32(v513, v560); - float32x2_t v565 = vmul_f32(v514, v564); - float32x2_t v569 = vmul_f32(v515, v568); - float32x2_t v573 = vmul_f32(v516, v572); - float32x2_t v583 = vrev64_f32(v518); - float32x2_t v590 = vrev64_f32(v519); - float32x2_t v597 = vrev64_f32(v520); - float32x2_t v604 = vrev64_f32(v521); - float32x2_t v611 = vrev64_f32(v522); - float32x2_t v618 = vrev64_f32(v523); - float32x2_t v625 = vrev64_f32(v524); - float32x2_t v632 = vrev64_f32(v525); - float32x2_t v499 = vadd_f32(v496, v495); - float32x2_t v508 = vsub_f32(v507, v492); - float32x2_t v577 = vmul_f32(v517, v576); - float32x2_t v584 = vmul_f32(v583, v582); - float32x2_t v591 = vmul_f32(v590, v589); - float32x2_t v598 = vmul_f32(v597, v596); - float32x2_t v605 = vmul_f32(v604, v603); - float32x2_t v612 = vmul_f32(v611, v610); - float32x2_t v619 = vmul_f32(v618, v617); - float32x2_t v626 = vmul_f32(v625, v624); - float32x2_t v633 = vmul_f32(v632, v631); - float32x2_t v639 = vrev64_f32(v526); - float32x2_t v642 = vadd_f32(v545, v549); - float32x2_t v643 = vadd_f32(v549, v553); - float32x2_t v644 = vsub_f32(v545, v553); - float32x2_t v645 = vadd_f32(v557, v561); - float32x2_t 
v646 = vadd_f32(v561, v565); - float32x2_t v647 = vsub_f32(v557, v565); - float32x2_t v506 = vadd_f32(v505, v499); - float32x2_t v534 = vmul_f32(v499, v533); - float32x2_t v540 = vrev64_f32(v508); - float32x2_t v640 = vmul_f32(v639, v638); - float32x2_t v648 = vadd_f32(v573, v577); - float32x2_t v649 = vadd_f32(v569, v577); - float32x2_t v650 = vadd_f32(v591, v598); - float32x2_t v651 = vsub_f32(v584, v598); - float32x2_t v652 = vadd_f32(v612, v619); - float32x2_t v653 = vsub_f32(v605, v619); - float32x2_t v541 = vmul_f32(v540, v539); - float32x2_t v641 = vsub_f32(v506, v534); - float32x2_t v654 = vadd_f32(v633, v640); - float32x2_t v655 = vsub_f32(v626, v640); - float32x2_t v656 = vadd_f32(v646, v648); - float32x2_t v674 = vadd_f32(v650, v651); - int16x4_t v690 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v506, 15), (int32x2_t){0, 0})); - float32x2_t v657 = vadd_f32(v656, v641); - float32x2_t v658 = vsub_f32(v641, v643); - float32x2_t v660 = vadd_f32(v641, v647); - float32x2_t v662 = vsub_f32(v641, v644); - float32x2_t v664 = vadd_f32(v641, v642); - float32x2_t v666 = vadd_f32(v541, v652); - float32x2_t v668 = vsub_f32(v654, v650); - float32x2_t v670 = vadd_f32(v541, v655); - float32x2_t v672 = vsub_f32(v655, v651); - float32x2_t v675 = vadd_f32(v674, v652); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v690), 0); - float32x2_t v659 = vsub_f32(v658, v648); - float32x2_t v661 = vadd_f32(v660, v649); - float32x2_t v663 = vsub_f32(v662, v649); - float32x2_t v665 = vsub_f32(v664, v645); - float32x2_t v667 = vadd_f32(v666, v654); - float32x2_t v669 = vsub_f32(v668, v541); - float32x2_t v671 = vadd_f32(v670, v653); - float32x2_t v673 = vsub_f32(v672, v541); - float32x2_t v676 = vadd_f32(v675, v653); - float32x2_t v677 = vsub_f32(v676, v541); - float32x2_t v679 = vadd_f32(v657, v667); - float32x2_t v680 = vadd_f32(v659, v669); - float32x2_t v681 = vsub_f32(v661, v671); - float32x2_t v682 = vadd_f32(v663, v673); - float32x2_t v683 = vsub_f32(v663, v673); - float32x2_t v684 = 
vadd_f32(v661, v671); - float32x2_t v685 = vsub_f32(v659, v669); - float32x2_t v686 = vsub_f32(v657, v667); - float32x2_t v678 = vadd_f32(v665, v677); - float32x2_t v687 = vsub_f32(v665, v677); - int16x4_t v702 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v679, 15), (int32x2_t){0, 0})); - int16x4_t v708 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v680, 15), (int32x2_t){0, 0})); - int16x4_t v714 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v681, 15), (int32x2_t){0, 0})); - int16x4_t v720 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v682, 15), (int32x2_t){0, 0})); - int16x4_t v726 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v683, 15), (int32x2_t){0, 0})); - int16x4_t v732 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v684, 15), (int32x2_t){0, 0})); - int16x4_t v738 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v685, 15), (int32x2_t){0, 0})); - int16x4_t v744 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v686, 15), (int32x2_t){0, 0})); - int16x4_t v696 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v678, 15), (int32x2_t){0, 0})); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v702), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v708), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v714), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v720), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v726), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v732), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v738), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v744), 0); - int16x4_t v750 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v687, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v696), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v750), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu11(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int 
howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v149 = 1.1000000000000001e+00F; - float v154 = -3.3166247903554003e-01F; - float v161 = 5.1541501300188641e-01F; - float v166 = 9.4125353283118118e-01F; - float v171 = 1.4143537075597825e+00F; - float v176 = 8.5949297361449750e-01F; - float v181 = 4.2314838273285138e-02F; - float v186 = 3.8639279888589606e-01F; - float v191 = 5.1254589567200015e-01F; - float v196 = 1.0702757469471715e+00F; - float v201 = 5.5486073394528512e-01F; - float v206 = -1.2412944743900585e+00F; - float v213 = -2.0897833842005756e-01F; - float v220 = -3.7415717312460811e-01F; - float v227 = -4.9929922194110327e-02F; - float v234 = -6.5815896284539266e-01F; - float v241 = -6.3306543373877577e-01F; - float v248 = -1.0822460581641109e+00F; - float v255 = -8.1720737907134022e-01F; - float v262 = -4.2408709531871824e-01F; - const int32_t *v409 = &v5[v0]; - int32_t *v621 = &v6[v2]; - int64_t v27 = v0 * 10; - int64_t v36 = v0 * 2; - int64_t v44 = v0 * 9; - int64_t v53 = v0 * 3; - int64_t v61 = v0 * 8; - int64_t v70 = v0 * 4; - int64_t v78 = v0 * 7; - int64_t v87 = v0 * 5; - int64_t v95 = v0 * 6; - float v157 = v4 * v154; - float v209 = v4 * v206; - float v216 = v4 * v213; - float v223 = v4 * v220; - float v230 = v4 * v227; - float v237 = v4 * v234; - float v244 = v4 * v241; - float v251 = v4 * v248; - float v258 = v4 * v255; - float v265 = v4 * v262; - int64_t v324 = v2 * 10; - int64_t v332 = v2 * 9; - int64_t v340 = v2 * 8; - int64_t v348 = v2 * 7; - int64_t v356 = v2 * 6; - int64_t v364 = v2 * 5; - int64_t v372 = v2 * 4; - int64_t v380 = v2 * 3; - int64_t v388 = v2 * 2; - const int32_t *v500 = &v5[0]; - svfloat32_t v504 = 
svdup_n_f32(v149); - svfloat32_t v506 = svdup_n_f32(v161); - svfloat32_t v507 = svdup_n_f32(v166); - svfloat32_t v508 = svdup_n_f32(v171); - svfloat32_t v509 = svdup_n_f32(v176); - svfloat32_t v510 = svdup_n_f32(v181); - svfloat32_t v511 = svdup_n_f32(v186); - svfloat32_t v512 = svdup_n_f32(v191); - svfloat32_t v513 = svdup_n_f32(v196); - svfloat32_t v514 = svdup_n_f32(v201); - int32_t *v531 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v409[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v418 = &v5[v27]; - const int32_t *v427 = &v5[v36]; - const int32_t *v436 = &v5[v44]; - const int32_t *v445 = &v5[v53]; - const int32_t *v454 = &v5[v61]; - const int32_t *v463 = &v5[v70]; - const int32_t *v472 = &v5[v78]; - const int32_t *v481 = &v5[v87]; - const int32_t *v490 = &v5[v95]; - svfloat32_t v505 = svdup_n_f32(v157); - svfloat32_t v515 = svdup_n_f32(v209); - svfloat32_t v516 = svdup_n_f32(v216); - svfloat32_t v517 = svdup_n_f32(v223); - svfloat32_t v518 = svdup_n_f32(v230); - svfloat32_t v519 = svdup_n_f32(v237); - svfloat32_t v520 = svdup_n_f32(v244); - svfloat32_t v521 = svdup_n_f32(v251); - svfloat32_t v522 = svdup_n_f32(v258); - svfloat32_t v523 = svdup_n_f32(v265); - int32_t *v540 = &v6[v324]; - int32_t *v549 = &v6[v332]; - int32_t *v558 = &v6[v340]; - int32_t *v567 = &v6[v348]; - int32_t *v576 = &v6[v356]; - int32_t *v585 = &v6[v364]; - int32_t *v594 = &v6[v372]; - int32_t *v603 = &v6[v380]; - int32_t *v612 = &v6[v388]; - svfloat32_t v121 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v500[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v418[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v42 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v427[0])), - 1.F / (1ULL << 
15ULL)); - svfloat32_t v50 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v436[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v59 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v445[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v67 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v454[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v76 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v463[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v84 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v472[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v93 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v481[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v101 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v490[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v51; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v42), "w"(v50)); - svfloat32_t v68; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v68) : "w"(v59), "w"(v67)); - svfloat32_t v85; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v76), "w"(v84)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v93), "w"(v101)); - svfloat32_t v103; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v103) : "w"(v25), "w"(v33)); - svfloat32_t v104; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v104) : "w"(v42), "w"(v50)); - svfloat32_t v105; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v105) : "w"(v59), "w"(v67)); - svfloat32_t v106; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v76), "w"(v84)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v93), "w"(v101)); - svfloat32_t v108; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v108) : "w"(v34), "w"(v51)); - svfloat32_t v109; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v109) : "w"(v68), "w"(v102)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v104), "w"(v105)); - svfloat32_t v112; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v103), "w"(v107)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v51), "w"(v85)); - svfloat32_t v126; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v34), "w"(v85)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v51), "w"(v34)); - svfloat32_t v128; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v102), "w"(v85)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v68), "w"(v85)); - svfloat32_t v130; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v102), "w"(v68)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v51), "w"(v102)); - svfloat32_t v132; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v34), "w"(v68)); - svfloat32_t v134; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v104), "w"(v106)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v103), "w"(v106)); - svfloat32_t v136; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v103), "w"(v104)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v106), "w"(v107)); - svfloat32_t v138; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v105), "w"(v106)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v105), "w"(v107)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v104), "w"(v107)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v103), "w"(v105)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v85), "w"(v108)); - svfloat32_t v123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v111), "w"(v112)); - svfloat32_t v133; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v109), "w"(v108)); - svfloat32_t v142; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v111), "w"(v112)); - svfloat32_t v169; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v126), "w"(v507)); - svfloat32_t v174; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v127), "w"(v508)); - svfloat32_t v184; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v129), "w"(v510)); - svfloat32_t v189; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v130), "w"(v511)); - svfloat32_t zero211; - asm volatile("mov %0.s, #0" : "=w"(zero211)); - svfloat32_t v211 = svcmla_f32_x(pred_full, zero211, v515, v134, 90); - svfloat32_t zero225; - asm volatile("mov %0.s, #0" : "=w"(zero225)); - svfloat32_t v225 = svcmla_f32_x(pred_full, zero225, v517, v136, 90); - svfloat32_t zero232; - asm volatile("mov %0.s, #0" : "=w"(zero232)); - svfloat32_t v232 = svcmla_f32_x(pred_full, zero232, v518, v137, 90); - svfloat32_t zero246; - asm volatile("mov %0.s, #0" : "=w"(zero246)); - svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v520, v139, 90); - svfloat32_t zero253; - asm volatile("mov %0.s, #0" : "=w"(zero253)); - svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v521, v140, 90); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v110), "w"(v109)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v123), "w"(v106)); - svfloat32_t v204; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v133), "w"(v514)); - svfloat32_t zero267; - asm volatile("mov %0.s, #0" : "=w"(zero267)); - svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v523, v142, 90); - svfloat32_t v269 = svmla_f32_x(pred_full, v169, v125, v506); - svfloat32_t v270 = svmla_f32_x(pred_full, v174, v126, v507); - svfloat32_t v271 = svnmls_f32_x(pred_full, v174, v125, v506); - svfloat32_t v272 = svmla_f32_x(pred_full, v184, v128, v509); - svfloat32_t v273 = svmla_f32_x(pred_full, v189, v129, v510); - svfloat32_t v274 = svnmls_f32_x(pred_full, v189, v128, v509); - svfloat32_t v277 = svcmla_f32_x(pred_full, v225, v516, v135, 90); - svfloat32_t 
v278; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v211), "w"(v225)); - svfloat32_t v279 = svcmla_f32_x(pred_full, v246, v519, v138, 90); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v232), "w"(v246)); - svfloat32_t v122; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v121), "w"(v113)); - svfloat32_t zero159; - asm volatile("mov %0.s, #0" : "=w"(zero159)); - svfloat32_t v159 = svcmla_f32_x(pred_full, zero159, v505, v124, 90); - svfloat32_t v275 = svmla_f32_x(pred_full, v204, v132, v513); - svfloat32_t v276 = svmla_f32_x(pred_full, v204, v131, v512); - svfloat32_t v281 = svcmla_f32_x(pred_full, v267, v522, v141, 90); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v253), "w"(v267)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v277), "w"(v278)); - svfloat32_t v268 = svmls_f32_x(pred_full, v122, v113, v504); - svfloat32_t v283; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v273), "w"(v275)); - svfloat32_t v293; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v159), "w"(v279)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v281), "w"(v277)); - svfloat32_t v297; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v159), "w"(v282)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v282), "w"(v278)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v301), "w"(v279)); - svint16_t v317 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v122, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v283), "w"(v268)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v268), "w"(v270)); - svfloat32_t v287; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v268), "w"(v274)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v268), 
"w"(v271)); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v268), "w"(v269)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v293), "w"(v281)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v295), "w"(v159)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v297), "w"(v280)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v299), "w"(v159)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v302), "w"(v280)); - svst1w_u64(pred_full, (unsigned *)(v531), svreinterpret_u64_s16(v317)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v285), "w"(v275)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v287), "w"(v276)); - svfloat32_t v290; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v289), "w"(v276)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v291), "w"(v272)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v303), "w"(v159)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v284), "w"(v294)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v284), "w"(v294)); - svfloat32_t v305; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v292), "w"(v304)); - svfloat32_t v307; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v286), "w"(v296)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v288), "w"(v298)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v290), "w"(v300)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v290), "w"(v300)); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v288), "w"(v298)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v286), "w"(v296)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v292), "w"(v304)); - svint16_t v333 = svtbl_s16( - 
svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v306, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v389 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v325 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v305, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v341 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v307, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v349 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v308, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v357 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v309, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v365 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v310, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v373 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v311, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v381 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v312, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v397 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v314, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v549), svreinterpret_u64_s16(v333)); - svst1w_u64(pred_full, (unsigned *)(v612), svreinterpret_u64_s16(v389)); - svst1w_u64(pred_full, (unsigned *)(v540), svreinterpret_u64_s16(v325)); - svst1w_u64(pred_full, (unsigned *)(v558), svreinterpret_u64_s16(v341)); - svst1w_u64(pred_full, (unsigned *)(v567), svreinterpret_u64_s16(v349)); - svst1w_u64(pred_full, (unsigned *)(v576), svreinterpret_u64_s16(v357)); - svst1w_u64(pred_full, (unsigned *)(v585), svreinterpret_u64_s16(v365)); - svst1w_u64(pred_full, (unsigned *)(v594), svreinterpret_u64_s16(v373)); - svst1w_u64(pred_full, (unsigned *)(v603), svreinterpret_u64_s16(v381)); - svst1w_u64(pred_full, (unsigned *)(v621), svreinterpret_u64_s16(v397)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v336 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v150 = 1.0000000000000000e+00F; - float v151 = -1.0000000000000000e+00F; - float v181 = -1.4999999999999998e+00F; - float v182 = 1.4999999999999998e+00F; - float v213 = 8.6602540378443871e-01F; - float32x2_t v216 = (float32x2_t){v4, v4}; - float v222 = -8.6602540378443871e-01F; - const int32_t *v691 = &v5[istride]; - int32_t *v755 = &v6[ostride]; - float32x2_t v152 = (float32x2_t){v150, v151}; - float32x2_t v178 = (float32x2_t){v181, v181}; - float32x2_t v183 = (float32x2_t){v181, v182}; - float32x2_t v215 = 
(float32x2_t){v213, v222}; - float32x2_t v223 = (float32x2_t){v222, v222}; - const int32_t *v628 = &v5[0]; - int32_t *v719 = &v6[0]; - int16x4_t v840 = vld1_s16((const int16_t *)v691); - float32x4_t v109 = vcvtq_n_f32_s32(vmovl_s16(v840), 15); - float32x2_t v154 = vmul_f32(v216, v152); - float32x4_t v179 = vcombine_f32(v178, v178); - float32x2_t v185 = vmul_f32(v216, v183); - float32x2_t v217 = vmul_f32(v216, v215); - float32x4_t v224 = vcombine_f32(v223, v223); - const int32_t *v609 = &v5[istride * 4]; - const int32_t *v618 = &v5[istride * 8]; - const int32_t *v637 = &v5[istride * 7]; - const int32_t *v646 = &v5[istride * 11]; - const int32_t *v655 = &v5[istride * 3]; - const int32_t *v664 = &v5[istride * 10]; - const int32_t *v673 = &v5[istride * 2]; - const int32_t *v682 = &v5[istride * 6]; - const int32_t *v700 = &v5[istride * 5]; - const int32_t *v709 = &v5[istride * 9]; - int32_t *v728 = &v6[ostride * 4]; - int32_t *v737 = &v6[ostride * 8]; - int32_t *v746 = &v6[ostride * 9]; - int32_t *v764 = &v6[ostride * 5]; - int32_t *v773 = &v6[ostride * 6]; - int32_t *v782 = &v6[ostride * 10]; - int32_t *v791 = &v6[ostride * 2]; - int32_t *v800 = &v6[ostride * 3]; - int32_t *v809 = &v6[ostride * 7]; - int32_t *v818 = &v6[ostride * 11]; - int16x4_t v826 = vld1_s16((const int16_t *)v628); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v826), 15); - float32x4_t v156 = vcombine_f32(v154, v154); - float32x4_t v187 = vcombine_f32(v185, v185); - float32x4_t v219 = vcombine_f32(v217, v217); - int16x4_t v822 = vld1_s16((const int16_t *)v609); - int16x4_t v824 = vld1_s16((const int16_t *)v618); - int16x4_t v828 = vld1_s16((const int16_t *)v637); - int16x4_t v830 = vld1_s16((const int16_t *)v646); - int16x4_t v832 = vld1_s16((const int16_t *)v655); - int16x4_t v834 = vld1_s16((const int16_t *)v664); - int16x4_t v836 = vld1_s16((const int16_t *)v673); - int16x4_t v838 = vld1_s16((const int16_t *)v682); - int16x4_t v842 = vld1_s16((const int16_t *)v700); - int16x4_t v844 = 
vld1_s16((const int16_t *)v709); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v822), 15); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v824), 15); - float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v828), 15); - float32x4_t v63 = vcvtq_n_f32_s32(vmovl_s16(v830), 15); - float32x4_t v73 = vcvtq_n_f32_s32(vmovl_s16(v832), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v834), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v836), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v838), 15); - float32x4_t v117 = vcvtq_n_f32_s32(vmovl_s16(v842), 15); - float32x4_t v127 = vcvtq_n_f32_s32(vmovl_s16(v844), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v64 = vaddq_f32(v55, v63); - float32x4_t v65 = vsubq_f32(v55, v63); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v118 = vaddq_f32(v109, v117); - float32x4_t v119 = vsubq_f32(v109, v117); - float32x4_t v47 = vaddq_f32(v37, v46); - float32x4_t v74 = vaddq_f32(v64, v73); - float32x4_t v101 = vaddq_f32(v91, v100); - float32x4_t v128 = vaddq_f32(v118, v127); - float32x4_t v160 = vaddq_f32(v37, v91); - float32x4_t v161 = vsubq_f32(v37, v91); - float32x4_t v162 = vaddq_f32(v64, v118); - float32x4_t v163 = vsubq_f32(v64, v118); - float32x4_t v191 = vaddq_f32(v38, v92); - float32x4_t v192 = vsubq_f32(v38, v92); - float32x4_t v193 = vaddq_f32(v65, v119); - float32x4_t v194 = vsubq_f32(v65, v119); - float32x4_t v129 = vaddq_f32(v47, v101); - float32x4_t v130 = vsubq_f32(v47, v101); - float32x4_t v131 = vaddq_f32(v74, v128); - float32x4_t v132 = vsubq_f32(v74, v128); - float32x4_t v164 = vaddq_f32(v160, v162); - float32x4_t v165 = vsubq_f32(v160, v162); - float32x4_t v180 = vmulq_f32(v161, v179); - float32x4_t v186 = vrev64q_f32(v163); - float32x4_t v195 = vaddq_f32(v191, v193); - float32x4_t v196 = vsubq_f32(v191, v193); - float32x4_t v218 = vrev64q_f32(v192); - float32x4_t v225 = vmulq_f32(v194, v224); - float32x4_t v133 = 
vaddq_f32(v129, v131); - float32x4_t v134 = vsubq_f32(v129, v131); - float32x4_t v155 = vrev64q_f32(v132); - float32x4_t v170 = vmulq_f32(v164, v179); - float32x4_t v175 = vmulq_f32(v165, v179); - float32x4_t v188 = vmulq_f32(v186, v187); - float32x4_t v202 = vrev64q_f32(v195); - float32x4_t v210 = vrev64q_f32(v196); - float32x4_t v220 = vmulq_f32(v218, v219); - float32x4_t v157 = vmulq_f32(v155, v156); - float32x4_t v189 = vaddq_f32(v180, v188); - float32x4_t v190 = vsubq_f32(v180, v188); - float32x4_t v204 = vmulq_f32(v202, v219); - float32x4_t v212 = vmulq_f32(v210, v219); - float32x4_t v226 = vaddq_f32(v220, v225); - float32x4_t v227 = vsubq_f32(v220, v225); - float32x4_t v228 = vaddq_f32(v133, v170); - int16x4_t v233 = vqmovn_s32(vcvtq_n_s32_f32(v133, 15)); - float32x4_t v282 = vaddq_f32(v134, v175); - int16x4_t v287 = vqmovn_s32(vcvtq_n_s32_f32(v134, 15)); - float32x4_t v158 = vaddq_f32(v130, v157); - float32x4_t v159 = vsubq_f32(v130, v157); - float32x4_t v229 = vaddq_f32(v228, v204); - float32x4_t v230 = vsubq_f32(v228, v204); - float32x4_t v283 = vaddq_f32(v282, v212); - float32x4_t v284 = vsubq_f32(v282, v212); - vst1_s16((int16_t *)v719, v233); - vst1_s16((int16_t *)v773, v287); - int16x4_t v241 = vqmovn_s32(vcvtq_n_s32_f32(v230, 15)); - int16x4_t v249 = vqmovn_s32(vcvtq_n_s32_f32(v229, 15)); - float32x4_t v255 = vaddq_f32(v159, v190); - int16x4_t v260 = vqmovn_s32(vcvtq_n_s32_f32(v159, 15)); - int16x4_t v295 = vqmovn_s32(vcvtq_n_s32_f32(v284, 15)); - int16x4_t v303 = vqmovn_s32(vcvtq_n_s32_f32(v283, 15)); - float32x4_t v309 = vaddq_f32(v158, v189); - int16x4_t v314 = vqmovn_s32(vcvtq_n_s32_f32(v158, 15)); - float32x4_t v256 = vaddq_f32(v255, v227); - float32x4_t v257 = vsubq_f32(v255, v227); - float32x4_t v310 = vaddq_f32(v309, v226); - float32x4_t v311 = vsubq_f32(v309, v226); - vst1_s16((int16_t *)v728, v241); - vst1_s16((int16_t *)v737, v249); - vst1_s16((int16_t *)v746, v260); - vst1_s16((int16_t *)v782, v295); - vst1_s16((int16_t *)v791, v303); - 
vst1_s16((int16_t *)v800, v314); - int16x4_t v268 = vqmovn_s32(vcvtq_n_s32_f32(v257, 15)); - int16x4_t v276 = vqmovn_s32(vcvtq_n_s32_f32(v256, 15)); - int16x4_t v322 = vqmovn_s32(vcvtq_n_s32_f32(v311, 15)); - int16x4_t v330 = vqmovn_s32(vcvtq_n_s32_f32(v310, 15)); - vst1_s16((int16_t *)v755, v268); - vst1_s16((int16_t *)v764, v276); - vst1_s16((int16_t *)v809, v322); - vst1_s16((int16_t *)v818, v330); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v336 * 2; j < howmany; j += 1) { - int16x4_t v411 = vld1s_s16(&v5[istride]); - float v446 = 1.0000000000000000e+00F; - float v447 = -1.0000000000000000e+00F; - float v473 = -1.4999999999999998e+00F; - float v474 = 1.4999999999999998e+00F; - float v502 = 8.6602540378443871e-01F; - float32x2_t v505 = (float32x2_t){v4, v4}; - float v510 = -8.6602540378443871e-01F; - int16x4_t v362 = vld1s_s16(&v5[0]); - float32x2_t v412 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v411)), 15); - float32x2_t v448 = (float32x2_t){v446, v447}; - float32x2_t v471 = (float32x2_t){v473, v473}; - float32x2_t v475 = (float32x2_t){v473, v474}; - float32x2_t v504 = (float32x2_t){v502, v510}; - float32x2_t v511 = (float32x2_t){v510, v510}; - int16x4_t v348 = vld1s_s16(&v5[istride * 4]); - int16x4_t v354 = vld1s_s16(&v5[istride * 8]); - float32x2_t v363 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v362)), 15); - int16x4_t v369 = vld1s_s16(&v5[istride * 7]); - int16x4_t v375 = vld1s_s16(&v5[istride * 11]); - int16x4_t v383 = vld1s_s16(&v5[istride * 3]); - int16x4_t v390 = vld1s_s16(&v5[istride * 10]); - int16x4_t v396 = vld1s_s16(&v5[istride * 2]); - int16x4_t v404 = vld1s_s16(&v5[istride * 6]); - int16x4_t v417 = vld1s_s16(&v5[istride * 5]); - int16x4_t v425 = vld1s_s16(&v5[istride * 9]); - float32x2_t v450 = vmul_f32(v505, v448); - float32x2_t v477 = vmul_f32(v505, v475); - float32x2_t v506 = vmul_f32(v505, v504); - float32x2_t v349 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v348)), 15); - float32x2_t v355 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v354)), 15); - 
float32x2_t v370 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v369)), 15); - float32x2_t v376 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v375)), 15); - float32x2_t v384 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v383)), 15); - float32x2_t v391 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v390)), 15); - float32x2_t v397 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v396)), 15); - float32x2_t v405 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v404)), 15); - float32x2_t v418 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v417)), 15); - float32x2_t v426 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v425)), 15); - float32x2_t v356 = vadd_f32(v349, v355); - float32x2_t v357 = vsub_f32(v349, v355); - float32x2_t v377 = vadd_f32(v370, v376); - float32x2_t v378 = vsub_f32(v370, v376); - float32x2_t v398 = vadd_f32(v391, v397); - float32x2_t v399 = vsub_f32(v391, v397); - float32x2_t v419 = vadd_f32(v412, v418); - float32x2_t v420 = vsub_f32(v412, v418); - float32x2_t v364 = vadd_f32(v356, v363); - float32x2_t v385 = vadd_f32(v377, v384); - float32x2_t v406 = vadd_f32(v398, v405); - float32x2_t v427 = vadd_f32(v419, v426); - float32x2_t v455 = vadd_f32(v356, v398); - float32x2_t v456 = vsub_f32(v356, v398); - float32x2_t v457 = vadd_f32(v377, v419); - float32x2_t v458 = vsub_f32(v377, v419); - float32x2_t v482 = vadd_f32(v357, v399); - float32x2_t v483 = vsub_f32(v357, v399); - float32x2_t v484 = vadd_f32(v378, v420); - float32x2_t v485 = vsub_f32(v378, v420); - float32x2_t v428 = vadd_f32(v364, v406); - float32x2_t v429 = vsub_f32(v364, v406); - float32x2_t v430 = vadd_f32(v385, v427); - float32x2_t v431 = vsub_f32(v385, v427); - float32x2_t v459 = vadd_f32(v455, v457); - float32x2_t v460 = vsub_f32(v455, v457); - float32x2_t v472 = vmul_f32(v456, v471); - float32x2_t v478 = vrev64_f32(v458); - float32x2_t v486 = vadd_f32(v482, v484); - float32x2_t v487 = vsub_f32(v482, v484); - float32x2_t v507 = vrev64_f32(v483); - float32x2_t v512 = vmul_f32(v485, v511); - float32x2_t v432 = vadd_f32(v428, v430); - float32x2_t 
v433 = vsub_f32(v428, v430); - float32x2_t v451 = vrev64_f32(v431); - float32x2_t v464 = vmul_f32(v459, v471); - float32x2_t v468 = vmul_f32(v460, v471); - float32x2_t v479 = vmul_f32(v478, v477); - float32x2_t v493 = vrev64_f32(v486); - float32x2_t v500 = vrev64_f32(v487); - float32x2_t v508 = vmul_f32(v507, v506); - float32x2_t v452 = vmul_f32(v451, v450); - float32x2_t v480 = vadd_f32(v472, v479); - float32x2_t v481 = vsub_f32(v472, v479); - float32x2_t v494 = vmul_f32(v493, v506); - float32x2_t v501 = vmul_f32(v500, v506); - float32x2_t v513 = vadd_f32(v508, v512); - float32x2_t v514 = vsub_f32(v508, v512); - float32x2_t v515 = vadd_f32(v432, v464); - int16x4_t v520 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v432, 15), (int32x2_t){0, 0})); - float32x2_t v557 = vadd_f32(v433, v468); - int16x4_t v562 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v433, 15), (int32x2_t){0, 0})); - float32x2_t v453 = vadd_f32(v429, v452); - float32x2_t v454 = vsub_f32(v429, v452); - float32x2_t v516 = vadd_f32(v515, v494); - float32x2_t v517 = vsub_f32(v515, v494); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v520), 0); - float32x2_t v558 = vadd_f32(v557, v501); - float32x2_t v559 = vsub_f32(v557, v501); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v562), 0); - int16x4_t v526 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v517, 15), (int32x2_t){0, 0})); - int16x4_t v532 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v516, 15), (int32x2_t){0, 0})); - float32x2_t v536 = vadd_f32(v454, v481); - int16x4_t v541 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v454, 15), (int32x2_t){0, 0})); - int16x4_t v568 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v559, 15), (int32x2_t){0, 0})); - int16x4_t v574 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v558, 15), (int32x2_t){0, 0})); - float32x2_t v578 = vadd_f32(v453, v480); - int16x4_t v583 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v453, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v526), 0); - v6[ostride * 8] = 
vget_lane_s32(vreinterpret_s32_s16(v532), 0); - float32x2_t v537 = vadd_f32(v536, v514); - float32x2_t v538 = vsub_f32(v536, v514); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v541), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v568), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v574), 0); - float32x2_t v579 = vadd_f32(v578, v513); - float32x2_t v580 = vsub_f32(v578, v513); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v583), 0); - int16x4_t v547 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v538, 15), (int32x2_t){0, 0})); - int16x4_t v553 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v537, 15), (int32x2_t){0, 0})); - int16x4_t v589 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v580, 15), (int32x2_t){0, 0})); - int16x4_t v595 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v579, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v547), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v553), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v589), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v595), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu12(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v148 = -1.0000000000000000e+00F; - float v173 = -1.4999999999999998e+00F; - float v178 = 1.4999999999999998e+00F; - float v214 = -8.6602540378443871e-01F; - const int32_t *v416 = &v5[v0]; - int32_t *v492 = &v6[v2]; - int64_t v19 = v0 * 4; - int64_t v27 = v0 * 8; - int64_t v46 = v0 * 7; - int64_t v54 
= v0 * 11; - int64_t v64 = v0 * 3; - int64_t v73 = v0 * 10; - int64_t v81 = v0 * 2; - int64_t v91 = v0 * 6; - int64_t v108 = v0 * 5; - int64_t v118 = v0 * 9; - float v151 = v4 * v148; - float v181 = v4 * v178; - float v210 = v4 * v214; - int64_t v232 = v2 * 4; - int64_t v240 = v2 * 8; - int64_t v251 = v2 * 9; - int64_t v267 = v2 * 5; - int64_t v278 = v2 * 6; - int64_t v286 = v2 * 10; - int64_t v294 = v2 * 2; - int64_t v305 = v2 * 3; - int64_t v313 = v2 * 7; - int64_t v321 = v2 * 11; - const int32_t *v353 = &v5[0]; - svfloat32_t v443 = svdup_n_f32(v173); - svfloat32_t v448 = svdup_n_f32(v214); - int32_t *v456 = &v6[0]; - svfloat32_t v106 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v416[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v334 = &v5[v19]; - const int32_t *v343 = &v5[v27]; - const int32_t *v362 = &v5[v46]; - const int32_t *v371 = &v5[v54]; - const int32_t *v380 = &v5[v64]; - const int32_t *v389 = &v5[v73]; - const int32_t *v398 = &v5[v81]; - const int32_t *v407 = &v5[v91]; - const int32_t *v425 = &v5[v108]; - const int32_t *v434 = &v5[v118]; - svfloat32_t v440 = svdup_n_f32(v151); - svfloat32_t v444 = svdup_n_f32(v181); - svfloat32_t v447 = svdup_n_f32(v210); - int32_t *v465 = &v6[v232]; - int32_t *v474 = &v6[v240]; - int32_t *v483 = &v6[v251]; - int32_t *v501 = &v6[v267]; - int32_t *v510 = &v6[v278]; - int32_t *v519 = &v6[v286]; - int32_t *v528 = &v6[v294]; - int32_t *v537 = &v6[v305]; - int32_t *v546 = &v6[v313]; - int32_t *v555 = &v6[v321]; - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v353[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v334[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v343[0])), - 1.F / (1ULL << 
15ULL)); - svfloat32_t v52 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v362[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v60 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v371[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v70 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v380[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v389[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v398[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v407[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v114 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v425[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v124 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v434[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v60)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v52), "w"(v60)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v106), "w"(v114)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v114)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t v71; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v70)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v88), "w"(v97)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v124)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v34), "w"(v88)); - svfloat32_t v157; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v34), "w"(v88)); - svfloat32_t v158; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v61), "w"(v115)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v61), "w"(v115)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v35), "w"(v89)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v35), "w"(v89)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v62), "w"(v116)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v62), "w"(v116)); - svfloat32_t v126; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v44), "w"(v98)); - svfloat32_t v127; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v44), "w"(v98)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v71), "w"(v125)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v71), "w"(v125)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v156), "w"(v158)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v156), "w"(v158)); - svfloat32_t zero183; - asm volatile("mov %0.s, #0" : "=w"(zero183)); - svfloat32_t v183 = svcmla_f32_x(pred_full, zero183, v444, v159, 90); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v186), "w"(v188)); - svfloat32_t v191; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v186), "w"(v188)); - svfloat32_t zero212; - asm volatile("mov %0.s, #0" : "=w"(zero212)); - svfloat32_t v212 = svcmla_f32_x(pred_full, zero212, v447, v187, 90); - svfloat32_t v130; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v130) : "w"(v126), "w"(v128)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v126), "w"(v128)); - svfloat32_t zero153; - asm volatile("mov %0.s, #0" : "=w"(zero153)); - svfloat32_t v153 = svcmla_f32_x(pred_full, zero153, v440, v129, 90); - svfloat32_t v184 = svmla_f32_x(pred_full, v183, v157, v443); - svfloat32_t v185 = svnmls_f32_x(pred_full, v183, v157, v443); - svfloat32_t zero198; - asm volatile("mov %0.s, #0" : "=w"(zero198)); - svfloat32_t v198 = svcmla_f32_x(pred_full, zero198, v447, v190, 90); - svfloat32_t zero205; - asm volatile("mov %0.s, #0" : "=w"(zero205)); - svfloat32_t v205 = svcmla_f32_x(pred_full, zero205, v447, v191, 90); - svfloat32_t v218 = svmla_f32_x(pred_full, v212, v189, v448); - svfloat32_t v219 = svmls_f32_x(pred_full, v212, v189, v448); - svfloat32_t v154; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v127), "w"(v153)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v127), "w"(v153)); - svfloat32_t v220 = svmla_f32_x(pred_full, v130, v160, v443); - svint16_t v225 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v130, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v274 = svmla_f32_x(pred_full, v131, v161, v443); - svint16_t v279 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v131, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v220), "w"(v198)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v220), "w"(v198)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v155), "w"(v185)); - svint16_t v252 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v155, (float)(1ULL << 31ULL)))), - 
svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v275; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v275) : "w"(v274), "w"(v205)); - svfloat32_t v276; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v274), "w"(v205)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v154), "w"(v184)); - svint16_t v306 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v154, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v456), svreinterpret_u64_s16(v225)); - svst1w_u64(pred_full, (unsigned *)(v510), svreinterpret_u64_s16(v279)); - svint16_t v233 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v222, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v241 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v221, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v247), "w"(v219)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v247), "w"(v219)); - svint16_t v287 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v276, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v295 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v275, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v301), "w"(v218)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v301), "w"(v218)); 
- svst1w_u64(pred_full, (unsigned *)(v483), svreinterpret_u64_s16(v252)); - svst1w_u64(pred_full, (unsigned *)(v537), svreinterpret_u64_s16(v306)); - svint16_t v260 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v249, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v268 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v248, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v314 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v303, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v322 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v302, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v465), svreinterpret_u64_s16(v233)); - svst1w_u64(pred_full, (unsigned *)(v474), svreinterpret_u64_s16(v241)); - svst1w_u64(pred_full, (unsigned *)(v519), svreinterpret_u64_s16(v287)); - svst1w_u64(pred_full, (unsigned *)(v528), svreinterpret_u64_s16(v295)); - svst1w_u64(pred_full, (unsigned *)(v492), svreinterpret_u64_s16(v260)); - svst1w_u64(pred_full, (unsigned *)(v501), svreinterpret_u64_s16(v268)); - svst1w_u64(pred_full, (unsigned *)(v546), svreinterpret_u64_s16(v314)); - svst1w_u64(pred_full, (unsigned *)(v555), svreinterpret_u64_s16(v322)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - 
int64_t v12 = howmany - 1; - int64_t v464 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v174 = 1.0833333333333333e+00F; - float v179 = -3.0046260628866578e-01F; - float v183 = 7.4927933062613905e-01F; - float v184 = -7.4927933062613905e-01F; - float v191 = 4.0100212832186721e-01F; - float v192 = -4.0100212832186721e-01F; - float v199 = 5.7514072947400308e-01F; - float v200 = -5.7514072947400308e-01F; - float v208 = 5.2422663952658211e-01F; - float v213 = 5.1652078062348972e-01F; - float v218 = 7.7058589030924258e-03F; - float v223 = 4.2763404682656941e-01F; - float v228 = 1.5180597207438440e-01F; - float v233 = 5.7944001890096386e-01F; - float v237 = 1.1543953381323635e+00F; - float v238 = -1.1543953381323635e+00F; - float v245 = 9.0655220171271012e-01F; - float v246 = -9.0655220171271012e-01F; - float v253 = 8.1857027294591811e-01F; - float v254 = -8.1857027294591811e-01F; - float v261 = 1.1971367726043427e+00F; - float v262 = -1.1971367726043427e+00F; - float v269 = 8.6131170741789742e-01F; - float v270 = -8.6131170741789742e-01F; - float v277 = 1.1091548438375507e+00F; - float v278 = -1.1091548438375507e+00F; - float v285 = 4.2741434471979367e-02F; - float v286 = -4.2741434471979367e-02F; - float v293 = -4.5240494294812715e-02F; - float v294 = 4.5240494294812715e-02F; - float v301 = 2.9058457089163264e-01F; - float v302 = -2.9058457089163264e-01F; - float32x2_t v304 = (float32x2_t){v4, v4}; - const int32_t *v852 = &v5[istride]; - int32_t *v1079 = &v6[ostride]; - float32x2_t v175 = (float32x2_t){v174, v174}; - float32x2_t v180 = (float32x2_t){v179, v179}; - float32x2_t v185 = (float32x2_t){v183, v184}; - float32x2_t v193 = (float32x2_t){v191, v192}; - float32x2_t v201 = (float32x2_t){v199, v200}; - float32x2_t v209 = (float32x2_t){v208, v208}; - float32x2_t v214 = (float32x2_t){v213, v213}; - float32x2_t v219 = (float32x2_t){v218, v218}; - float32x2_t v224 = (float32x2_t){v223, v223}; - float32x2_t v229 = (float32x2_t){v228, v228}; - float32x2_t 
v234 = (float32x2_t){v233, v233}; - float32x2_t v239 = (float32x2_t){v237, v238}; - float32x2_t v247 = (float32x2_t){v245, v246}; - float32x2_t v255 = (float32x2_t){v253, v254}; - float32x2_t v263 = (float32x2_t){v261, v262}; - float32x2_t v271 = (float32x2_t){v269, v270}; - float32x2_t v279 = (float32x2_t){v277, v278}; - float32x2_t v287 = (float32x2_t){v285, v286}; - float32x2_t v295 = (float32x2_t){v293, v294}; - float32x2_t v303 = (float32x2_t){v301, v302}; - const int32_t *v961 = &v5[0]; - int32_t *v971 = &v6[0]; - int16x4_t v1083 = vld1_s16((const int16_t *)v852); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1083), 15); - float32x4_t v176 = vcombine_f32(v175, v175); - float32x4_t v181 = vcombine_f32(v180, v180); - float32x2_t v187 = vmul_f32(v304, v185); - float32x2_t v195 = vmul_f32(v304, v193); - float32x2_t v203 = vmul_f32(v304, v201); - float32x4_t v210 = vcombine_f32(v209, v209); - float32x4_t v215 = vcombine_f32(v214, v214); - float32x4_t v220 = vcombine_f32(v219, v219); - float32x4_t v225 = vcombine_f32(v224, v224); - float32x4_t v230 = vcombine_f32(v229, v229); - float32x4_t v235 = vcombine_f32(v234, v234); - float32x2_t v241 = vmul_f32(v304, v239); - float32x2_t v249 = vmul_f32(v304, v247); - float32x2_t v257 = vmul_f32(v304, v255); - float32x2_t v265 = vmul_f32(v304, v263); - float32x2_t v273 = vmul_f32(v304, v271); - float32x2_t v281 = vmul_f32(v304, v279); - float32x2_t v289 = vmul_f32(v304, v287); - float32x2_t v297 = vmul_f32(v304, v295); - float32x2_t v305 = vmul_f32(v304, v303); - const int32_t *v861 = &v5[istride * 12]; - const int32_t *v870 = &v5[istride * 2]; - const int32_t *v879 = &v5[istride * 11]; - const int32_t *v888 = &v5[istride * 3]; - const int32_t *v897 = &v5[istride * 10]; - const int32_t *v906 = &v5[istride * 4]; - const int32_t *v915 = &v5[istride * 9]; - const int32_t *v924 = &v5[istride * 5]; - const int32_t *v933 = &v5[istride * 8]; - const int32_t *v942 = &v5[istride * 6]; - const int32_t *v951 = &v5[istride * 7]; - 
int32_t *v980 = &v6[ostride * 12]; - int32_t *v989 = &v6[ostride * 11]; - int32_t *v998 = &v6[ostride * 10]; - int32_t *v1007 = &v6[ostride * 9]; - int32_t *v1016 = &v6[ostride * 8]; - int32_t *v1025 = &v6[ostride * 7]; - int32_t *v1034 = &v6[ostride * 6]; - int32_t *v1043 = &v6[ostride * 5]; - int32_t *v1052 = &v6[ostride * 4]; - int32_t *v1061 = &v6[ostride * 3]; - int32_t *v1070 = &v6[ostride * 2]; - int16x4_t v1107 = vld1_s16((const int16_t *)v961); - float32x4_t v159 = vcvtq_n_f32_s32(vmovl_s16(v1107), 15); - float32x4_t v189 = vcombine_f32(v187, v187); - float32x4_t v197 = vcombine_f32(v195, v195); - float32x4_t v205 = vcombine_f32(v203, v203); - float32x4_t v243 = vcombine_f32(v241, v241); - float32x4_t v251 = vcombine_f32(v249, v249); - float32x4_t v259 = vcombine_f32(v257, v257); - float32x4_t v267 = vcombine_f32(v265, v265); - float32x4_t v275 = vcombine_f32(v273, v273); - float32x4_t v283 = vcombine_f32(v281, v281); - float32x4_t v291 = vcombine_f32(v289, v289); - float32x4_t v299 = vcombine_f32(v297, v297); - float32x4_t v307 = vcombine_f32(v305, v305); - int16x4_t v1085 = vld1_s16((const int16_t *)v861); - int16x4_t v1087 = vld1_s16((const int16_t *)v870); - int16x4_t v1089 = vld1_s16((const int16_t *)v879); - int16x4_t v1091 = vld1_s16((const int16_t *)v888); - int16x4_t v1093 = vld1_s16((const int16_t *)v897); - int16x4_t v1095 = vld1_s16((const int16_t *)v906); - int16x4_t v1097 = vld1_s16((const int16_t *)v915); - int16x4_t v1099 = vld1_s16((const int16_t *)v924); - int16x4_t v1101 = vld1_s16((const int16_t *)v933); - int16x4_t v1103 = vld1_s16((const int16_t *)v942); - int16x4_t v1105 = vld1_s16((const int16_t *)v951); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1085), 15); - float32x4_t v45 = vcvtq_n_f32_s32(vmovl_s16(v1087), 15); - float32x4_t v53 = vcvtq_n_f32_s32(vmovl_s16(v1089), 15); - float32x4_t v62 = vcvtq_n_f32_s32(vmovl_s16(v1091), 15); - float32x4_t v70 = vcvtq_n_f32_s32(vmovl_s16(v1093), 15); - float32x4_t v79 = 
vcvtq_n_f32_s32(vmovl_s16(v1095), 15); - float32x4_t v87 = vcvtq_n_f32_s32(vmovl_s16(v1097), 15); - float32x4_t v96 = vcvtq_n_f32_s32(vmovl_s16(v1099), 15); - float32x4_t v104 = vcvtq_n_f32_s32(vmovl_s16(v1101), 15); - float32x4_t v113 = vcvtq_n_f32_s32(vmovl_s16(v1103), 15); - float32x4_t v121 = vcvtq_n_f32_s32(vmovl_s16(v1105), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v54 = vaddq_f32(v45, v53); - float32x4_t v71 = vaddq_f32(v62, v70); - float32x4_t v88 = vaddq_f32(v79, v87); - float32x4_t v105 = vaddq_f32(v96, v104); - float32x4_t v122 = vaddq_f32(v113, v121); - float32x4_t v123 = vsubq_f32(v28, v36); - float32x4_t v124 = vsubq_f32(v45, v53); - float32x4_t v125 = vsubq_f32(v62, v70); - float32x4_t v126 = vsubq_f32(v79, v87); - float32x4_t v127 = vsubq_f32(v96, v104); - float32x4_t v128 = vsubq_f32(v113, v121); - float32x4_t v129 = vaddq_f32(v54, v105); - float32x4_t v131 = vaddq_f32(v37, v71); - float32x4_t v134 = vaddq_f32(v124, v127); - float32x4_t v136 = vaddq_f32(v123, v125); - float32x4_t v138 = vsubq_f32(v54, v122); - float32x4_t v139 = vsubq_f32(v71, v88); - float32x4_t v140 = vsubq_f32(v37, v88); - float32x4_t v141 = vsubq_f32(v105, v122); - float32x4_t v146 = vsubq_f32(v124, v128); - float32x4_t v147 = vsubq_f32(v123, v125); - float32x4_t v148 = vsubq_f32(v124, v127); - float32x4_t v149 = vaddq_f32(v123, v126); - float32x4_t v150 = vsubq_f32(v127, v128); - float32x4_t v151 = vaddq_f32(v125, v126); - float32x4_t v130 = vaddq_f32(v129, v122); - float32x4_t v132 = vaddq_f32(v131, v88); - float32x4_t v135 = vaddq_f32(v134, v128); - float32x4_t v137 = vsubq_f32(v136, v126); - float32x4_t v142 = vsubq_f32(v138, v139); - float32x4_t v143 = vsubq_f32(v140, v141); - float32x4_t v144 = vaddq_f32(v138, v139); - float32x4_t v145 = vaddq_f32(v140, v141); - float32x4_t v165 = vaddq_f32(v146, v147); - float32x4_t v166 = vaddq_f32(v148, v149); - float32x4_t v167 = vsubq_f32(v150, v151); - float32x4_t v242 = vrev64q_f32(v146); - float32x4_t v250 = 
vrev64q_f32(v147); - float32x4_t v266 = vrev64q_f32(v148); - float32x4_t v274 = vrev64q_f32(v149); - float32x4_t v290 = vrev64q_f32(v150); - float32x4_t v298 = vrev64q_f32(v151); - float32x4_t v133 = vaddq_f32(v130, v132); - float32x4_t v161 = vsubq_f32(v132, v130); - float32x4_t v162 = vaddq_f32(v135, v137); - float32x4_t v163 = vaddq_f32(v142, v143); - float32x4_t v164 = vsubq_f32(v144, v145); - float32x4_t v188 = vrev64q_f32(v135); - float32x4_t v196 = vrev64q_f32(v137); - float32x4_t v211 = vmulq_f32(v142, v210); - float32x4_t v216 = vmulq_f32(v143, v215); - float32x4_t v226 = vmulq_f32(v144, v225); - float32x4_t v231 = vmulq_f32(v145, v230); - float32x4_t v244 = vmulq_f32(v242, v243); - float32x4_t v252 = vmulq_f32(v250, v251); - float32x4_t v258 = vrev64q_f32(v165); - float32x4_t v268 = vmulq_f32(v266, v267); - float32x4_t v276 = vmulq_f32(v274, v275); - float32x4_t v282 = vrev64q_f32(v166); - float32x4_t v292 = vmulq_f32(v290, v291); - float32x4_t v300 = vmulq_f32(v298, v299); - float32x4_t v306 = vrev64q_f32(v167); - float32x4_t v160 = vaddq_f32(v159, v133); - float32x4_t v177 = vmulq_f32(v133, v176); - float32x4_t v182 = vmulq_f32(v161, v181); - float32x4_t v190 = vmulq_f32(v188, v189); - float32x4_t v198 = vmulq_f32(v196, v197); - float32x4_t v204 = vrev64q_f32(v162); - float32x4_t v221 = vmulq_f32(v163, v220); - float32x4_t v236 = vmulq_f32(v164, v235); - float32x4_t v260 = vmulq_f32(v258, v259); - float32x4_t v284 = vmulq_f32(v282, v283); - float32x4_t v308 = vmulq_f32(v306, v307); - float32x4_t v310 = vaddq_f32(v216, v211); - float32x4_t v206 = vmulq_f32(v204, v205); - float32x4_t v309 = vsubq_f32(v160, v177); - float32x4_t v311 = vsubq_f32(v310, v182); - float32x4_t v312 = vaddq_f32(v216, v221); - float32x4_t v314 = vsubq_f32(v221, v211); - float32x4_t v322 = vsubq_f32(v244, v260); - float32x4_t v323 = vsubq_f32(v252, v260); - float32x4_t v324 = vsubq_f32(v268, v284); - float32x4_t v325 = vsubq_f32(v276, v284); - float32x4_t v326 = vsubq_f32(v292, 
v308); - float32x4_t v327 = vaddq_f32(v300, v308); - int16x4_t v362 = vqmovn_s32(vcvtq_n_s32_f32(v160, 15)); - float32x4_t v313 = vaddq_f32(v312, v182); - float32x4_t v315 = vsubq_f32(v314, v182); - float32x4_t v316 = vaddq_f32(v309, v226); - float32x4_t v318 = vsubq_f32(v309, v231); - float32x4_t v320 = vsubq_f32(v309, v226); - float32x4_t v328 = vsubq_f32(v190, v206); - float32x4_t v329 = vsubq_f32(v198, v206); - float32x4_t v340 = vaddq_f32(v322, v326); - float32x4_t v342 = vaddq_f32(v324, v326); - float32x4_t v344 = vsubq_f32(v323, v327); - vst1_s16((int16_t *)v971, v362); - float32x4_t v317 = vaddq_f32(v316, v231); - float32x4_t v319 = vsubq_f32(v318, v236); - float32x4_t v321 = vaddq_f32(v320, v236); - float32x4_t v336 = vsubq_f32(v329, v322); - float32x4_t v338 = vsubq_f32(v327, v328); - float32x4_t v341 = vaddq_f32(v340, v329); - float32x4_t v343 = vsubq_f32(v342, v329); - float32x4_t v345 = vsubq_f32(v344, v328); - float32x4_t v346 = vaddq_f32(v328, v323); - float32x4_t v330 = vaddq_f32(v311, v317); - float32x4_t v331 = vaddq_f32(v313, v319); - float32x4_t v332 = vsubq_f32(v319, v313); - float32x4_t v333 = vaddq_f32(v315, v321); - float32x4_t v334 = vsubq_f32(v317, v311); - float32x4_t v335 = vsubq_f32(v321, v315); - float32x4_t v337 = vaddq_f32(v336, v324); - float32x4_t v339 = vsubq_f32(v338, v325); - float32x4_t v347 = vsubq_f32(v346, v325); - float32x4_t v348 = vsubq_f32(v330, v337); - float32x4_t v349 = vaddq_f32(v331, v339); - float32x4_t v350 = vsubq_f32(v332, v341); - float32x4_t v351 = vsubq_f32(v333, v343); - float32x4_t v352 = vaddq_f32(v334, v345); - float32x4_t v353 = vsubq_f32(v335, v347); - float32x4_t v354 = vaddq_f32(v335, v347); - float32x4_t v355 = vsubq_f32(v334, v345); - float32x4_t v356 = vaddq_f32(v333, v343); - float32x4_t v357 = vaddq_f32(v332, v341); - float32x4_t v358 = vsubq_f32(v331, v339); - float32x4_t v359 = vaddq_f32(v330, v337); - int16x4_t v370 = vqmovn_s32(vcvtq_n_s32_f32(v348, 15)); - int16x4_t v378 = 
vqmovn_s32(vcvtq_n_s32_f32(v349, 15)); - int16x4_t v386 = vqmovn_s32(vcvtq_n_s32_f32(v350, 15)); - int16x4_t v394 = vqmovn_s32(vcvtq_n_s32_f32(v351, 15)); - int16x4_t v402 = vqmovn_s32(vcvtq_n_s32_f32(v352, 15)); - int16x4_t v410 = vqmovn_s32(vcvtq_n_s32_f32(v353, 15)); - int16x4_t v418 = vqmovn_s32(vcvtq_n_s32_f32(v354, 15)); - int16x4_t v426 = vqmovn_s32(vcvtq_n_s32_f32(v355, 15)); - int16x4_t v434 = vqmovn_s32(vcvtq_n_s32_f32(v356, 15)); - int16x4_t v442 = vqmovn_s32(vcvtq_n_s32_f32(v357, 15)); - int16x4_t v450 = vqmovn_s32(vcvtq_n_s32_f32(v358, 15)); - int16x4_t v458 = vqmovn_s32(vcvtq_n_s32_f32(v359, 15)); - vst1_s16((int16_t *)v980, v370); - vst1_s16((int16_t *)v989, v378); - vst1_s16((int16_t *)v998, v386); - vst1_s16((int16_t *)v1007, v394); - vst1_s16((int16_t *)v1016, v402); - vst1_s16((int16_t *)v1025, v410); - vst1_s16((int16_t *)v1034, v418); - vst1_s16((int16_t *)v1043, v426); - vst1_s16((int16_t *)v1052, v434); - vst1_s16((int16_t *)v1061, v442); - vst1_s16((int16_t *)v1070, v450); - vst1_s16((int16_t *)v1079, v458); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v464 * 2; j < howmany; j += 1) { - int16x4_t v476 = vld1s_s16(&v5[istride]); - float v598 = 1.0833333333333333e+00F; - float v602 = -3.0046260628866578e-01F; - float v605 = 7.4927933062613905e-01F; - float v606 = -7.4927933062613905e-01F; - float v612 = 4.0100212832186721e-01F; - float v613 = -4.0100212832186721e-01F; - float v619 = 5.7514072947400308e-01F; - float v620 = -5.7514072947400308e-01F; - float v627 = 5.2422663952658211e-01F; - float v631 = 5.1652078062348972e-01F; - float v635 = 7.7058589030924258e-03F; - float v639 = 4.2763404682656941e-01F; - float v643 = 1.5180597207438440e-01F; - float v647 = 5.7944001890096386e-01F; - float v650 = 1.1543953381323635e+00F; - float v651 = -1.1543953381323635e+00F; - float v657 = 9.0655220171271012e-01F; - float v658 = -9.0655220171271012e-01F; - float v664 = 8.1857027294591811e-01F; - float v665 = -8.1857027294591811e-01F; - float v671 = 
1.1971367726043427e+00F; - float v672 = -1.1971367726043427e+00F; - float v678 = 8.6131170741789742e-01F; - float v679 = -8.6131170741789742e-01F; - float v685 = 1.1091548438375507e+00F; - float v686 = -1.1091548438375507e+00F; - float v692 = 4.2741434471979367e-02F; - float v693 = -4.2741434471979367e-02F; - float v699 = -4.5240494294812715e-02F; - float v700 = 4.5240494294812715e-02F; - float v706 = 2.9058457089163264e-01F; - float v707 = -2.9058457089163264e-01F; - float32x2_t v709 = (float32x2_t){v4, v4}; - float32x2_t v477 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v476)), 15); - int16x4_t v583 = vld1s_s16(&v5[0]); - float32x2_t v599 = (float32x2_t){v598, v598}; - float32x2_t v603 = (float32x2_t){v602, v602}; - float32x2_t v607 = (float32x2_t){v605, v606}; - float32x2_t v614 = (float32x2_t){v612, v613}; - float32x2_t v621 = (float32x2_t){v619, v620}; - float32x2_t v628 = (float32x2_t){v627, v627}; - float32x2_t v632 = (float32x2_t){v631, v631}; - float32x2_t v636 = (float32x2_t){v635, v635}; - float32x2_t v640 = (float32x2_t){v639, v639}; - float32x2_t v644 = (float32x2_t){v643, v643}; - float32x2_t v648 = (float32x2_t){v647, v647}; - float32x2_t v652 = (float32x2_t){v650, v651}; - float32x2_t v659 = (float32x2_t){v657, v658}; - float32x2_t v666 = (float32x2_t){v664, v665}; - float32x2_t v673 = (float32x2_t){v671, v672}; - float32x2_t v680 = (float32x2_t){v678, v679}; - float32x2_t v687 = (float32x2_t){v685, v686}; - float32x2_t v694 = (float32x2_t){v692, v693}; - float32x2_t v701 = (float32x2_t){v699, v700}; - float32x2_t v708 = (float32x2_t){v706, v707}; - int16x4_t v482 = vld1s_s16(&v5[istride * 12]); - int16x4_t v489 = vld1s_s16(&v5[istride * 2]); - int16x4_t v495 = vld1s_s16(&v5[istride * 11]); - int16x4_t v502 = vld1s_s16(&v5[istride * 3]); - int16x4_t v508 = vld1s_s16(&v5[istride * 10]); - int16x4_t v515 = vld1s_s16(&v5[istride * 4]); - int16x4_t v521 = vld1s_s16(&v5[istride * 9]); - int16x4_t v528 = vld1s_s16(&v5[istride * 5]); - int16x4_t v534 = 
vld1s_s16(&v5[istride * 8]); - int16x4_t v541 = vld1s_s16(&v5[istride * 6]); - int16x4_t v547 = vld1s_s16(&v5[istride * 7]); - float32x2_t v584 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v583)), 15); - float32x2_t v609 = vmul_f32(v709, v607); - float32x2_t v616 = vmul_f32(v709, v614); - float32x2_t v623 = vmul_f32(v709, v621); - float32x2_t v654 = vmul_f32(v709, v652); - float32x2_t v661 = vmul_f32(v709, v659); - float32x2_t v668 = vmul_f32(v709, v666); - float32x2_t v675 = vmul_f32(v709, v673); - float32x2_t v682 = vmul_f32(v709, v680); - float32x2_t v689 = vmul_f32(v709, v687); - float32x2_t v696 = vmul_f32(v709, v694); - float32x2_t v703 = vmul_f32(v709, v701); - float32x2_t v710 = vmul_f32(v709, v708); - float32x2_t v483 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v482)), 15); - float32x2_t v490 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v489)), 15); - float32x2_t v496 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v495)), 15); - float32x2_t v503 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v502)), 15); - float32x2_t v509 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v508)), 15); - float32x2_t v516 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v515)), 15); - float32x2_t v522 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v521)), 15); - float32x2_t v529 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v528)), 15); - float32x2_t v535 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v534)), 15); - float32x2_t v542 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v541)), 15); - float32x2_t v548 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v547)), 15); - float32x2_t v484 = vadd_f32(v477, v483); - float32x2_t v497 = vadd_f32(v490, v496); - float32x2_t v510 = vadd_f32(v503, v509); - float32x2_t v523 = vadd_f32(v516, v522); - float32x2_t v536 = vadd_f32(v529, v535); - float32x2_t v549 = vadd_f32(v542, v548); - float32x2_t v550 = vsub_f32(v477, v483); - float32x2_t v551 = vsub_f32(v490, v496); - float32x2_t v552 = vsub_f32(v503, v509); - float32x2_t v553 = vsub_f32(v516, v522); - float32x2_t v554 = vsub_f32(v529, v535); - float32x2_t v555 = 
vsub_f32(v542, v548); - float32x2_t v556 = vadd_f32(v497, v536); - float32x2_t v558 = vadd_f32(v484, v510); - float32x2_t v561 = vadd_f32(v551, v554); - float32x2_t v563 = vadd_f32(v550, v552); - float32x2_t v565 = vsub_f32(v497, v549); - float32x2_t v566 = vsub_f32(v510, v523); - float32x2_t v567 = vsub_f32(v484, v523); - float32x2_t v568 = vsub_f32(v536, v549); - float32x2_t v573 = vsub_f32(v551, v555); - float32x2_t v574 = vsub_f32(v550, v552); - float32x2_t v575 = vsub_f32(v551, v554); - float32x2_t v576 = vadd_f32(v550, v553); - float32x2_t v577 = vsub_f32(v554, v555); - float32x2_t v578 = vadd_f32(v552, v553); - float32x2_t v557 = vadd_f32(v556, v549); - float32x2_t v559 = vadd_f32(v558, v523); - float32x2_t v562 = vadd_f32(v561, v555); - float32x2_t v564 = vsub_f32(v563, v553); - float32x2_t v569 = vsub_f32(v565, v566); - float32x2_t v570 = vsub_f32(v567, v568); - float32x2_t v571 = vadd_f32(v565, v566); - float32x2_t v572 = vadd_f32(v567, v568); - float32x2_t v590 = vadd_f32(v573, v574); - float32x2_t v591 = vadd_f32(v575, v576); - float32x2_t v592 = vsub_f32(v577, v578); - float32x2_t v655 = vrev64_f32(v573); - float32x2_t v662 = vrev64_f32(v574); - float32x2_t v676 = vrev64_f32(v575); - float32x2_t v683 = vrev64_f32(v576); - float32x2_t v697 = vrev64_f32(v577); - float32x2_t v704 = vrev64_f32(v578); - float32x2_t v560 = vadd_f32(v557, v559); - float32x2_t v586 = vsub_f32(v559, v557); - float32x2_t v587 = vadd_f32(v562, v564); - float32x2_t v588 = vadd_f32(v569, v570); - float32x2_t v589 = vsub_f32(v571, v572); - float32x2_t v610 = vrev64_f32(v562); - float32x2_t v617 = vrev64_f32(v564); - float32x2_t v629 = vmul_f32(v569, v628); - float32x2_t v633 = vmul_f32(v570, v632); - float32x2_t v641 = vmul_f32(v571, v640); - float32x2_t v645 = vmul_f32(v572, v644); - float32x2_t v656 = vmul_f32(v655, v654); - float32x2_t v663 = vmul_f32(v662, v661); - float32x2_t v669 = vrev64_f32(v590); - float32x2_t v677 = vmul_f32(v676, v675); - float32x2_t v684 = vmul_f32(v683, 
v682); - float32x2_t v690 = vrev64_f32(v591); - float32x2_t v698 = vmul_f32(v697, v696); - float32x2_t v705 = vmul_f32(v704, v703); - float32x2_t v711 = vrev64_f32(v592); - float32x2_t v585 = vadd_f32(v584, v560); - float32x2_t v600 = vmul_f32(v560, v599); - float32x2_t v604 = vmul_f32(v586, v603); - float32x2_t v611 = vmul_f32(v610, v609); - float32x2_t v618 = vmul_f32(v617, v616); - float32x2_t v624 = vrev64_f32(v587); - float32x2_t v637 = vmul_f32(v588, v636); - float32x2_t v649 = vmul_f32(v589, v648); - float32x2_t v670 = vmul_f32(v669, v668); - float32x2_t v691 = vmul_f32(v690, v689); - float32x2_t v712 = vmul_f32(v711, v710); - float32x2_t v714 = vadd_f32(v633, v629); - float32x2_t v625 = vmul_f32(v624, v623); - float32x2_t v713 = vsub_f32(v585, v600); - float32x2_t v715 = vsub_f32(v714, v604); - float32x2_t v716 = vadd_f32(v633, v637); - float32x2_t v718 = vsub_f32(v637, v629); - float32x2_t v726 = vsub_f32(v656, v670); - float32x2_t v727 = vsub_f32(v663, v670); - float32x2_t v728 = vsub_f32(v677, v691); - float32x2_t v729 = vsub_f32(v684, v691); - float32x2_t v730 = vsub_f32(v698, v712); - float32x2_t v731 = vadd_f32(v705, v712); - int16x4_t v766 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v585, 15), (int32x2_t){0, 0})); - float32x2_t v717 = vadd_f32(v716, v604); - float32x2_t v719 = vsub_f32(v718, v604); - float32x2_t v720 = vadd_f32(v713, v641); - float32x2_t v722 = vsub_f32(v713, v645); - float32x2_t v724 = vsub_f32(v713, v641); - float32x2_t v732 = vsub_f32(v611, v625); - float32x2_t v733 = vsub_f32(v618, v625); - float32x2_t v744 = vadd_f32(v726, v730); - float32x2_t v746 = vadd_f32(v728, v730); - float32x2_t v748 = vsub_f32(v727, v731); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v766), 0); - float32x2_t v721 = vadd_f32(v720, v645); - float32x2_t v723 = vsub_f32(v722, v649); - float32x2_t v725 = vadd_f32(v724, v649); - float32x2_t v740 = vsub_f32(v733, v726); - float32x2_t v742 = vsub_f32(v731, v732); - float32x2_t v745 = vadd_f32(v744, v733); - 
float32x2_t v747 = vsub_f32(v746, v733); - float32x2_t v749 = vsub_f32(v748, v732); - float32x2_t v750 = vadd_f32(v732, v727); - float32x2_t v734 = vadd_f32(v715, v721); - float32x2_t v735 = vadd_f32(v717, v723); - float32x2_t v736 = vsub_f32(v723, v717); - float32x2_t v737 = vadd_f32(v719, v725); - float32x2_t v738 = vsub_f32(v721, v715); - float32x2_t v739 = vsub_f32(v725, v719); - float32x2_t v741 = vadd_f32(v740, v728); - float32x2_t v743 = vsub_f32(v742, v729); - float32x2_t v751 = vsub_f32(v750, v729); - float32x2_t v752 = vsub_f32(v734, v741); - float32x2_t v753 = vadd_f32(v735, v743); - float32x2_t v754 = vsub_f32(v736, v745); - float32x2_t v755 = vsub_f32(v737, v747); - float32x2_t v756 = vadd_f32(v738, v749); - float32x2_t v757 = vsub_f32(v739, v751); - float32x2_t v758 = vadd_f32(v739, v751); - float32x2_t v759 = vsub_f32(v738, v749); - float32x2_t v760 = vadd_f32(v737, v747); - float32x2_t v761 = vadd_f32(v736, v745); - float32x2_t v762 = vsub_f32(v735, v743); - float32x2_t v763 = vadd_f32(v734, v741); - int16x4_t v772 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v752, 15), (int32x2_t){0, 0})); - int16x4_t v778 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v753, 15), (int32x2_t){0, 0})); - int16x4_t v784 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v754, 15), (int32x2_t){0, 0})); - int16x4_t v790 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v755, 15), (int32x2_t){0, 0})); - int16x4_t v796 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v756, 15), (int32x2_t){0, 0})); - int16x4_t v802 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v757, 15), (int32x2_t){0, 0})); - int16x4_t v808 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v758, 15), (int32x2_t){0, 0})); - int16x4_t v814 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v759, 15), (int32x2_t){0, 0})); - int16x4_t v820 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v760, 15), (int32x2_t){0, 0})); - int16x4_t v826 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v761, 15), (int32x2_t){0, 0})); - int16x4_t v832 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v762, 15), (int32x2_t){0, 0})); - int16x4_t v838 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v763, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v772), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v778), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v784), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v790), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v796), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v802), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v808), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v814), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v820), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v826), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v832), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v838), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu13(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v171 = 1.0833333333333333e+00F; - float v176 = -3.0046260628866578e-01F; - float v181 = -7.4927933062613905e-01F; - float v188 = -4.0100212832186721e-01F; - float v195 = -5.7514072947400308e-01F; - float v202 = 5.2422663952658211e-01F; - float v207 = 5.1652078062348972e-01F; - float v212 = 7.7058589030924258e-03F; - float v217 = 4.2763404682656941e-01F; - float v222 = 1.5180597207438440e-01F; - float v227 = 5.7944001890096386e-01F; - float v232 = 
-1.1543953381323635e+00F; - float v239 = -9.0655220171271012e-01F; - float v246 = -8.1857027294591811e-01F; - float v253 = -1.1971367726043427e+00F; - float v260 = -8.6131170741789742e-01F; - float v267 = -1.1091548438375507e+00F; - float v274 = -4.2741434471979367e-02F; - float v281 = 4.5240494294812715e-02F; - float v288 = -2.9058457089163264e-01F; - const int32_t *v455 = &v5[v0]; - int32_t *v703 = &v6[v2]; - int64_t v27 = v0 * 12; - int64_t v36 = v0 * 2; - int64_t v44 = v0 * 11; - int64_t v53 = v0 * 3; - int64_t v61 = v0 * 10; - int64_t v70 = v0 * 4; - int64_t v78 = v0 * 9; - int64_t v87 = v0 * 5; - int64_t v95 = v0 * 8; - int64_t v104 = v0 * 6; - int64_t v112 = v0 * 7; - float v184 = v4 * v181; - float v191 = v4 * v188; - float v198 = v4 * v195; - float v235 = v4 * v232; - float v242 = v4 * v239; - float v249 = v4 * v246; - float v256 = v4 * v253; - float v263 = v4 * v260; - float v270 = v4 * v267; - float v277 = v4 * v274; - float v284 = v4 * v281; - float v291 = v4 * v288; - int64_t v354 = v2 * 12; - int64_t v362 = v2 * 11; - int64_t v370 = v2 * 10; - int64_t v378 = v2 * 9; - int64_t v386 = v2 * 8; - int64_t v394 = v2 * 7; - int64_t v402 = v2 * 6; - int64_t v410 = v2 * 5; - int64_t v418 = v2 * 4; - int64_t v426 = v2 * 3; - int64_t v434 = v2 * 2; - const int32_t *v564 = &v5[0]; - svfloat32_t v568 = svdup_n_f32(v171); - svfloat32_t v569 = svdup_n_f32(v176); - svfloat32_t v573 = svdup_n_f32(v202); - svfloat32_t v574 = svdup_n_f32(v207); - svfloat32_t v575 = svdup_n_f32(v212); - svfloat32_t v576 = svdup_n_f32(v217); - svfloat32_t v577 = svdup_n_f32(v222); - svfloat32_t v578 = svdup_n_f32(v227); - int32_t *v595 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v455[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v464 = &v5[v27]; - const int32_t *v473 = &v5[v36]; - const int32_t *v482 = &v5[v44]; - const int32_t *v491 = &v5[v53]; - const int32_t *v500 = &v5[v61]; - const int32_t *v509 
= &v5[v70]; - const int32_t *v518 = &v5[v78]; - const int32_t *v527 = &v5[v87]; - const int32_t *v536 = &v5[v95]; - const int32_t *v545 = &v5[v104]; - const int32_t *v554 = &v5[v112]; - svfloat32_t v570 = svdup_n_f32(v184); - svfloat32_t v571 = svdup_n_f32(v191); - svfloat32_t v572 = svdup_n_f32(v198); - svfloat32_t v579 = svdup_n_f32(v235); - svfloat32_t v580 = svdup_n_f32(v242); - svfloat32_t v581 = svdup_n_f32(v249); - svfloat32_t v582 = svdup_n_f32(v256); - svfloat32_t v583 = svdup_n_f32(v263); - svfloat32_t v584 = svdup_n_f32(v270); - svfloat32_t v585 = svdup_n_f32(v277); - svfloat32_t v586 = svdup_n_f32(v284); - svfloat32_t v587 = svdup_n_f32(v291); - int32_t *v604 = &v6[v354]; - int32_t *v613 = &v6[v362]; - int32_t *v622 = &v6[v370]; - int32_t *v631 = &v6[v378]; - int32_t *v640 = &v6[v386]; - int32_t *v649 = &v6[v394]; - int32_t *v658 = &v6[v402]; - int32_t *v667 = &v6[v410]; - int32_t *v676 = &v6[v418]; - int32_t *v685 = &v6[v426]; - int32_t *v694 = &v6[v434]; - svfloat32_t v156 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v564[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v464[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v42 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v473[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v50 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v482[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v59 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v491[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v67 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v500[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v76 = svmul_n_f32_x( - pred_full, - 
svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v509[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v84 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v518[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v93 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v527[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v101 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v536[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v110 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v545[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v118 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v554[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v51; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v51) : "w"(v42), "w"(v50)); - svfloat32_t v68; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v68) : "w"(v59), "w"(v67)); - svfloat32_t v85; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v85) : "w"(v76), "w"(v84)); - svfloat32_t v102; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v93), "w"(v101)); - svfloat32_t v119; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v119) : "w"(v110), "w"(v118)); - svfloat32_t v120; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v120) : "w"(v25), "w"(v33)); - svfloat32_t v121; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v121) : "w"(v42), "w"(v50)); - svfloat32_t v122; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v122) : "w"(v59), "w"(v67)); - svfloat32_t v123; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v123) : "w"(v76), "w"(v84)); - svfloat32_t v124; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v93), "w"(v101)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v110), "w"(v118)); - svfloat32_t v126; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v126) : "w"(v51), 
"w"(v102)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v34), "w"(v68)); - svfloat32_t v131; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v121), "w"(v124)); - svfloat32_t v133; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v133) : "w"(v120), "w"(v122)); - svfloat32_t v135; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v135) : "w"(v51), "w"(v119)); - svfloat32_t v136; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v136) : "w"(v68), "w"(v85)); - svfloat32_t v137; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v137) : "w"(v34), "w"(v85)); - svfloat32_t v138; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v138) : "w"(v102), "w"(v119)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v121), "w"(v125)); - svfloat32_t v144; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v120), "w"(v122)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v121), "w"(v124)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v120), "w"(v123)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v124), "w"(v125)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v122), "w"(v123)); - svfloat32_t v127; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v127) : "w"(v126), "w"(v119)); - svfloat32_t v129; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v128), "w"(v85)); - svfloat32_t v132; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v132) : "w"(v131), "w"(v125)); - svfloat32_t v134; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v134) : "w"(v133), "w"(v123)); - svfloat32_t v139; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v139) : "w"(v135), "w"(v136)); - svfloat32_t v140; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v137), "w"(v138)); - svfloat32_t v141; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v135), "w"(v136)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v137), "w"(v138)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v143), "w"(v144)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v163) : "w"(v145), "w"(v146)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v147), "w"(v148)); - svfloat32_t zero237; - asm volatile("mov %0.s, #0" : "=w"(zero237)); - svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v579, v143, 90); - svfloat32_t zero244; - asm volatile("mov %0.s, #0" : "=w"(zero244)); - svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v580, v144, 90); - svfloat32_t zero258; - asm volatile("mov %0.s, #0" : "=w"(zero258)); - svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v582, v145, 90); - svfloat32_t zero265; - asm volatile("mov %0.s, #0" : "=w"(zero265)); - svfloat32_t v265 = svcmla_f32_x(pred_full, zero265, v583, v146, 90); - svfloat32_t zero279; - asm volatile("mov %0.s, #0" : "=w"(zero279)); - svfloat32_t v279 = svcmla_f32_x(pred_full, zero279, v585, v147, 90); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v127), "w"(v129)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v129), "w"(v127)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v132), "w"(v134)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v139), "w"(v140)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v141), "w"(v142)); - svfloat32_t zero186; - asm volatile("mov %0.s, #0" : "=w"(zero186)); - svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v570, v132, 90); - svfloat32_t zero193; - asm volatile("mov %0.s, #0" : "=w"(zero193)); - svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v571, v134, 90); - svfloat32_t v205; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v139), "w"(v573)); - svfloat32_t zero251; - asm volatile("mov %0.s, #0" : "=w"(zero251)); - svfloat32_t v251 = svcmla_f32_x(pred_full, zero251, v581, v162, 90); - svfloat32_t zero272; - asm volatile("mov %0.s, #0" : "=w"(zero272)); - svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v584, v163, 90); - svfloat32_t zero293; - asm volatile("mov %0.s, #0" : 
"=w"(zero293)); - svfloat32_t v293 = svcmla_f32_x(pred_full, zero293, v587, v164, 90); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v156), "w"(v130)); - svfloat32_t zero200; - asm volatile("mov %0.s, #0" : "=w"(zero200)); - svfloat32_t v200 = svcmla_f32_x(pred_full, zero200, v572, v159, 90); - svfloat32_t v215; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v160), "w"(v575)); - svfloat32_t v295 = svmla_f32_x(pred_full, v205, v140, v574); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v237), "w"(v251)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v244), "w"(v251)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v258), "w"(v272)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v265), "w"(v272)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v279), "w"(v293)); - svfloat32_t v312 = svcmla_f32_x(pred_full, v293, v586, v148, 90); - svfloat32_t v294 = svmls_f32_x(pred_full, v157, v130, v568); - svfloat32_t v296 = svmls_f32_x(pred_full, v295, v158, v569); - svfloat32_t v297 = svmla_f32_x(pred_full, v215, v140, v574); - svfloat32_t v299 = svnmls_f32_x(pred_full, v205, v160, v575); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v186), "w"(v200)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v193), "w"(v200)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v307), "w"(v311)); - svfloat32_t v327; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v309), "w"(v311)); - svfloat32_t v329; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v308), "w"(v312)); - svint16_t v347 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v298 = svmla_f32_x(pred_full, v297, v158, v569); - svfloat32_t v300 = 
svmls_f32_x(pred_full, v299, v158, v569); - svfloat32_t v301 = svmla_f32_x(pred_full, v294, v141, v576); - svfloat32_t v303 = svmls_f32_x(pred_full, v294, v142, v577); - svfloat32_t v305 = svmls_f32_x(pred_full, v294, v141, v576); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v314), "w"(v307)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v312), "w"(v313)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v325), "w"(v314)); - svfloat32_t v328; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v327), "w"(v314)); - svfloat32_t v330; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v329), "w"(v313)); - svfloat32_t v331; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v313), "w"(v308)); - svst1w_u64(pred_full, (unsigned *)(v595), svreinterpret_u64_s16(v347)); - svfloat32_t v302 = svmla_f32_x(pred_full, v301, v142, v577); - svfloat32_t v304 = svmls_f32_x(pred_full, v303, v161, v578); - svfloat32_t v306 = svmla_f32_x(pred_full, v305, v161, v578); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v321), "w"(v309)); - svfloat32_t v324; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v323), "w"(v310)); - svfloat32_t v332; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v331), "w"(v310)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v296), "w"(v302)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v298), "w"(v304)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v304), "w"(v298)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v300), "w"(v306)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v302), "w"(v296)); - svfloat32_t v320; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v306), "w"(v300)); - svfloat32_t v333; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v315), "w"(v322)); - svfloat32_t v334; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v316), "w"(v324)); - 
svfloat32_t v335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v317), "w"(v326)); - svfloat32_t v336; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v318), "w"(v328)); - svfloat32_t v337; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v319), "w"(v330)); - svfloat32_t v338; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v338) : "w"(v320), "w"(v332)); - svfloat32_t v339; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v320), "w"(v332)); - svfloat32_t v340; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v319), "w"(v330)); - svfloat32_t v341; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v318), "w"(v328)); - svfloat32_t v342; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v317), "w"(v326)); - svfloat32_t v343; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v343) : "w"(v316), "w"(v324)); - svfloat32_t v344; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v344) : "w"(v315), "w"(v322)); - svint16_t v355 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v333, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v363 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v334, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v371 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v335, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v379 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v336, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v387 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v337, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v395 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v338, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v403 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v339, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v411 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v340, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v419 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v341, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v427 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v342, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v435 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v343, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v443 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v344, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v604), svreinterpret_u64_s16(v355)); - svst1w_u64(pred_full, (unsigned *)(v613), svreinterpret_u64_s16(v363)); - svst1w_u64(pred_full, (unsigned *)(v622), svreinterpret_u64_s16(v371)); - svst1w_u64(pred_full, (unsigned *)(v631), svreinterpret_u64_s16(v379)); - svst1w_u64(pred_full, 
(unsigned *)(v640), svreinterpret_u64_s16(v387)); - svst1w_u64(pred_full, (unsigned *)(v649), svreinterpret_u64_s16(v395)); - svst1w_u64(pred_full, (unsigned *)(v658), svreinterpret_u64_s16(v403)); - svst1w_u64(pred_full, (unsigned *)(v667), svreinterpret_u64_s16(v411)); - svst1w_u64(pred_full, (unsigned *)(v676), svreinterpret_u64_s16(v419)); - svst1w_u64(pred_full, (unsigned *)(v685), svreinterpret_u64_s16(v427)); - svst1w_u64(pred_full, (unsigned *)(v694), svreinterpret_u64_s16(v435)); - svst1w_u64(pred_full, (unsigned *)(v703), svreinterpret_u64_s16(v443)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu14(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v445 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v263 = -1.1666666666666665e+00F; - float v268 = 7.9015646852540022e-01F; - float v273 = 5.5854267289647742e-02F; - float v278 = 7.3430220123575241e-01F; - float v282 = 4.4095855184409838e-01F; - float v283 = -4.4095855184409838e-01F; - float v290 = 3.4087293062393137e-01F; - float v291 = -3.4087293062393137e-01F; - float v298 = -5.3396936033772524e-01F; - float v299 = 5.3396936033772524e-01F; - float v306 = 8.7484229096165667e-01F; - float v307 = -8.7484229096165667e-01F; - float32x2_t v309 = (float32x2_t){v4, v4}; - const int32_t *v895 = &v5[istride]; - int32_t *v968 = &v6[ostride]; - float32x2_t v264 = (float32x2_t){v263, v263}; - float32x2_t v269 = (float32x2_t){v268, v268}; - float32x2_t v274 = (float32x2_t){v273, v273}; - float32x2_t v279 = (float32x2_t){v278, v278}; - float32x2_t v284 = (float32x2_t){v282, v283}; - float32x2_t v292 = (float32x2_t){v290, v291}; - float32x2_t v300 = (float32x2_t){v298, v299}; - float32x2_t v308 = (float32x2_t){v306, v307}; - const int32_t 
*v814 = &v5[0]; - int32_t *v941 = &v6[0]; - int16x4_t v1080 = vld1_s16((const int16_t *)v895); - float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1080), 15); - float32x4_t v265 = vcombine_f32(v264, v264); - float32x4_t v270 = vcombine_f32(v269, v269); - float32x4_t v275 = vcombine_f32(v274, v274); - float32x4_t v280 = vcombine_f32(v279, v279); - float32x2_t v286 = vmul_f32(v309, v284); - float32x2_t v294 = vmul_f32(v309, v292); - float32x2_t v302 = vmul_f32(v309, v300); - float32x2_t v310 = vmul_f32(v309, v308); - const int32_t *v823 = &v5[istride * 7]; - const int32_t *v832 = &v5[istride * 2]; - const int32_t *v841 = &v5[istride * 9]; - const int32_t *v850 = &v5[istride * 4]; - const int32_t *v859 = &v5[istride * 11]; - const int32_t *v868 = &v5[istride * 6]; - const int32_t *v877 = &v5[istride * 13]; - const int32_t *v886 = &v5[istride * 8]; - const int32_t *v904 = &v5[istride * 10]; - const int32_t *v913 = &v5[istride * 3]; - const int32_t *v922 = &v5[istride * 12]; - const int32_t *v931 = &v5[istride * 5]; - int32_t *v950 = &v6[ostride * 7]; - int32_t *v959 = &v6[ostride * 8]; - int32_t *v977 = &v6[ostride * 2]; - int32_t *v986 = &v6[ostride * 9]; - int32_t *v995 = &v6[ostride * 10]; - int32_t *v1004 = &v6[ostride * 3]; - int32_t *v1013 = &v6[ostride * 4]; - int32_t *v1022 = &v6[ostride * 11]; - int32_t *v1031 = &v6[ostride * 12]; - int32_t *v1040 = &v6[ostride * 5]; - int32_t *v1049 = &v6[ostride * 6]; - int32_t *v1058 = &v6[ostride * 13]; - int16x4_t v1062 = vld1_s16((const int16_t *)v814); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1062), 15); - float32x4_t v288 = vcombine_f32(v286, v286); - float32x4_t v296 = vcombine_f32(v294, v294); - float32x4_t v304 = vcombine_f32(v302, v302); - float32x4_t v312 = vcombine_f32(v310, v310); - int16x4_t v1064 = vld1_s16((const int16_t *)v823); - int16x4_t v1066 = vld1_s16((const int16_t *)v832); - int16x4_t v1068 = vld1_s16((const int16_t *)v841); - int16x4_t v1070 = vld1_s16((const int16_t *)v850); - int16x4_t v1072 = 
vld1_s16((const int16_t *)v859); - int16x4_t v1074 = vld1_s16((const int16_t *)v868); - int16x4_t v1076 = vld1_s16((const int16_t *)v877); - int16x4_t v1078 = vld1_s16((const int16_t *)v886); - int16x4_t v1082 = vld1_s16((const int16_t *)v904); - int16x4_t v1084 = vld1_s16((const int16_t *)v913); - int16x4_t v1086 = vld1_s16((const int16_t *)v922); - int16x4_t v1088 = vld1_s16((const int16_t *)v931); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1064), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1066), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1068), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v1070), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v1072), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1074), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1076), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1078), 15); - float32x4_t v118 = vcvtq_n_f32_s32(vmovl_s16(v1082), 15); - float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1084), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1086), 15); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1088), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v109 = vaddq_f32(v100, v108); - float32x4_t v110 = vsubq_f32(v100, v108); - float32x4_t v127 = vaddq_f32(v118, v126); - float32x4_t v128 = vsubq_f32(v118, v126); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v147 = vaddq_f32(v55, v145); - float32x4_t v148 = vsubq_f32(v55, v145); - float32x4_t v149 = vaddq_f32(v109, v91); - float32x4_t v150 = vsubq_f32(v109, v91); - float32x4_t v151 = vaddq_f32(v73, v127); - float32x4_t v152 = vsubq_f32(v73, v127); - float32x4_t v240 = 
vaddq_f32(v56, v146); - float32x4_t v241 = vsubq_f32(v56, v146); - float32x4_t v242 = vaddq_f32(v110, v92); - float32x4_t v243 = vsubq_f32(v110, v92); - float32x4_t v244 = vaddq_f32(v74, v128); - float32x4_t v245 = vsubq_f32(v74, v128); - float32x4_t v153 = vaddq_f32(v147, v149); - float32x4_t v156 = vsubq_f32(v147, v149); - float32x4_t v157 = vsubq_f32(v149, v151); - float32x4_t v158 = vsubq_f32(v151, v147); - float32x4_t v159 = vaddq_f32(v148, v150); - float32x4_t v161 = vsubq_f32(v148, v150); - float32x4_t v162 = vsubq_f32(v150, v152); - float32x4_t v163 = vsubq_f32(v152, v148); - float32x4_t v246 = vaddq_f32(v240, v242); - float32x4_t v249 = vsubq_f32(v240, v242); - float32x4_t v250 = vsubq_f32(v242, v244); - float32x4_t v251 = vsubq_f32(v244, v240); - float32x4_t v252 = vaddq_f32(v241, v243); - float32x4_t v254 = vsubq_f32(v241, v243); - float32x4_t v255 = vsubq_f32(v243, v245); - float32x4_t v256 = vsubq_f32(v245, v241); - float32x4_t v154 = vaddq_f32(v153, v151); - float32x4_t v160 = vaddq_f32(v159, v152); - float32x4_t v178 = vmulq_f32(v156, v270); - float32x4_t v183 = vmulq_f32(v157, v275); - float32x4_t v188 = vmulq_f32(v158, v280); - float32x4_t v202 = vrev64q_f32(v161); - float32x4_t v210 = vrev64q_f32(v162); - float32x4_t v218 = vrev64q_f32(v163); - float32x4_t v247 = vaddq_f32(v246, v244); - float32x4_t v253 = vaddq_f32(v252, v245); - float32x4_t v271 = vmulq_f32(v249, v270); - float32x4_t v276 = vmulq_f32(v250, v275); - float32x4_t v281 = vmulq_f32(v251, v280); - float32x4_t v295 = vrev64q_f32(v254); - float32x4_t v303 = vrev64q_f32(v255); - float32x4_t v311 = vrev64q_f32(v256); - float32x4_t v155 = vaddq_f32(v154, v37); - float32x4_t v173 = vmulq_f32(v154, v265); - float32x4_t v194 = vrev64q_f32(v160); - float32x4_t v204 = vmulq_f32(v202, v296); - float32x4_t v212 = vmulq_f32(v210, v304); - float32x4_t v220 = vmulq_f32(v218, v312); - float32x4_t v248 = vaddq_f32(v247, v38); - float32x4_t v266 = vmulq_f32(v247, v265); - float32x4_t v287 = 
vrev64q_f32(v253); - float32x4_t v297 = vmulq_f32(v295, v296); - float32x4_t v305 = vmulq_f32(v303, v304); - float32x4_t v313 = vmulq_f32(v311, v312); - float32x4_t v196 = vmulq_f32(v194, v288); - float32x4_t v221 = vaddq_f32(v155, v173); - float32x4_t v289 = vmulq_f32(v287, v288); - float32x4_t v314 = vaddq_f32(v248, v266); - int16x4_t v335 = vqmovn_s32(vcvtq_n_s32_f32(v155, 15)); - int16x4_t v343 = vqmovn_s32(vcvtq_n_s32_f32(v248, 15)); - float32x4_t v222 = vaddq_f32(v221, v178); - float32x4_t v224 = vsubq_f32(v221, v178); - float32x4_t v226 = vsubq_f32(v221, v183); - float32x4_t v228 = vaddq_f32(v196, v204); - float32x4_t v230 = vsubq_f32(v196, v204); - float32x4_t v232 = vsubq_f32(v196, v212); - float32x4_t v315 = vaddq_f32(v314, v271); - float32x4_t v317 = vsubq_f32(v314, v271); - float32x4_t v319 = vsubq_f32(v314, v276); - float32x4_t v321 = vaddq_f32(v289, v297); - float32x4_t v323 = vsubq_f32(v289, v297); - float32x4_t v325 = vsubq_f32(v289, v305); - vst1_s16((int16_t *)v941, v335); - vst1_s16((int16_t *)v950, v343); - float32x4_t v223 = vaddq_f32(v222, v183); - float32x4_t v225 = vsubq_f32(v224, v188); - float32x4_t v227 = vaddq_f32(v226, v188); - float32x4_t v229 = vaddq_f32(v228, v212); - float32x4_t v231 = vsubq_f32(v230, v220); - float32x4_t v233 = vaddq_f32(v232, v220); - float32x4_t v316 = vaddq_f32(v315, v276); - float32x4_t v318 = vsubq_f32(v317, v281); - float32x4_t v320 = vaddq_f32(v319, v281); - float32x4_t v322 = vaddq_f32(v321, v305); - float32x4_t v324 = vsubq_f32(v323, v313); - float32x4_t v326 = vaddq_f32(v325, v313); - float32x4_t v234 = vaddq_f32(v223, v229); - float32x4_t v235 = vsubq_f32(v223, v229); - float32x4_t v236 = vaddq_f32(v225, v231); - float32x4_t v237 = vsubq_f32(v225, v231); - float32x4_t v238 = vaddq_f32(v227, v233); - float32x4_t v239 = vsubq_f32(v227, v233); - float32x4_t v327 = vaddq_f32(v316, v322); - float32x4_t v328 = vsubq_f32(v316, v322); - float32x4_t v329 = vaddq_f32(v318, v324); - float32x4_t v330 = 
vsubq_f32(v318, v324); - float32x4_t v331 = vaddq_f32(v320, v326); - float32x4_t v332 = vsubq_f32(v320, v326); - int16x4_t v351 = vqmovn_s32(vcvtq_n_s32_f32(v235, 15)); - int16x4_t v359 = vqmovn_s32(vcvtq_n_s32_f32(v328, 15)); - int16x4_t v367 = vqmovn_s32(vcvtq_n_s32_f32(v237, 15)); - int16x4_t v375 = vqmovn_s32(vcvtq_n_s32_f32(v330, 15)); - int16x4_t v383 = vqmovn_s32(vcvtq_n_s32_f32(v238, 15)); - int16x4_t v391 = vqmovn_s32(vcvtq_n_s32_f32(v331, 15)); - int16x4_t v399 = vqmovn_s32(vcvtq_n_s32_f32(v239, 15)); - int16x4_t v407 = vqmovn_s32(vcvtq_n_s32_f32(v332, 15)); - int16x4_t v415 = vqmovn_s32(vcvtq_n_s32_f32(v236, 15)); - int16x4_t v423 = vqmovn_s32(vcvtq_n_s32_f32(v329, 15)); - int16x4_t v431 = vqmovn_s32(vcvtq_n_s32_f32(v234, 15)); - int16x4_t v439 = vqmovn_s32(vcvtq_n_s32_f32(v327, 15)); - vst1_s16((int16_t *)v959, v351); - vst1_s16((int16_t *)v968, v359); - vst1_s16((int16_t *)v977, v367); - vst1_s16((int16_t *)v986, v375); - vst1_s16((int16_t *)v995, v383); - vst1_s16((int16_t *)v1004, v391); - vst1_s16((int16_t *)v1013, v399); - vst1_s16((int16_t *)v1022, v407); - vst1_s16((int16_t *)v1031, v415); - vst1_s16((int16_t *)v1040, v423); - vst1_s16((int16_t *)v1049, v431); - vst1_s16((int16_t *)v1058, v439); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v445 * 2; j < howmany; j += 1) { - int16x4_t v519 = vld1s_s16(&v5[istride]); - float v657 = -1.1666666666666665e+00F; - float v661 = 7.9015646852540022e-01F; - float v665 = 5.5854267289647742e-02F; - float v669 = 7.3430220123575241e-01F; - float v672 = 4.4095855184409838e-01F; - float v673 = -4.4095855184409838e-01F; - float v679 = 3.4087293062393137e-01F; - float v680 = -3.4087293062393137e-01F; - float v686 = -5.3396936033772524e-01F; - float v687 = 5.3396936033772524e-01F; - float v693 = 8.7484229096165667e-01F; - float v694 = -8.7484229096165667e-01F; - float32x2_t v696 = (float32x2_t){v4, v4}; - int16x4_t v457 = vld1s_s16(&v5[0]); - float32x2_t v520 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v519)), 15); 
- float32x2_t v658 = (float32x2_t){v657, v657}; - float32x2_t v662 = (float32x2_t){v661, v661}; - float32x2_t v666 = (float32x2_t){v665, v665}; - float32x2_t v670 = (float32x2_t){v669, v669}; - float32x2_t v674 = (float32x2_t){v672, v673}; - float32x2_t v681 = (float32x2_t){v679, v680}; - float32x2_t v688 = (float32x2_t){v686, v687}; - float32x2_t v695 = (float32x2_t){v693, v694}; - float32x2_t v458 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v457)), 15); - int16x4_t v463 = vld1s_s16(&v5[istride * 7]); - int16x4_t v471 = vld1s_s16(&v5[istride * 2]); - int16x4_t v477 = vld1s_s16(&v5[istride * 9]); - int16x4_t v485 = vld1s_s16(&v5[istride * 4]); - int16x4_t v491 = vld1s_s16(&v5[istride * 11]); - int16x4_t v499 = vld1s_s16(&v5[istride * 6]); - int16x4_t v505 = vld1s_s16(&v5[istride * 13]); - int16x4_t v513 = vld1s_s16(&v5[istride * 8]); - int16x4_t v527 = vld1s_s16(&v5[istride * 10]); - int16x4_t v533 = vld1s_s16(&v5[istride * 3]); - int16x4_t v541 = vld1s_s16(&v5[istride * 12]); - int16x4_t v547 = vld1s_s16(&v5[istride * 5]); - float32x2_t v676 = vmul_f32(v696, v674); - float32x2_t v683 = vmul_f32(v696, v681); - float32x2_t v690 = vmul_f32(v696, v688); - float32x2_t v697 = vmul_f32(v696, v695); - float32x2_t v464 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v463)), 15); - float32x2_t v472 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v471)), 15); - float32x2_t v478 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v477)), 15); - float32x2_t v486 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v485)), 15); - float32x2_t v492 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v491)), 15); - float32x2_t v500 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v499)), 15); - float32x2_t v506 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v505)), 15); - float32x2_t v514 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v513)), 15); - float32x2_t v528 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v527)), 15); - float32x2_t v534 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v533)), 15); - float32x2_t v542 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v541)), 15); 
- float32x2_t v548 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v547)), 15); - float32x2_t v465 = vadd_f32(v458, v464); - float32x2_t v466 = vsub_f32(v458, v464); - float32x2_t v479 = vadd_f32(v472, v478); - float32x2_t v480 = vsub_f32(v472, v478); - float32x2_t v493 = vadd_f32(v486, v492); - float32x2_t v494 = vsub_f32(v486, v492); - float32x2_t v507 = vadd_f32(v500, v506); - float32x2_t v508 = vsub_f32(v500, v506); - float32x2_t v521 = vadd_f32(v514, v520); - float32x2_t v522 = vsub_f32(v514, v520); - float32x2_t v535 = vadd_f32(v528, v534); - float32x2_t v536 = vsub_f32(v528, v534); - float32x2_t v549 = vadd_f32(v542, v548); - float32x2_t v550 = vsub_f32(v542, v548); - float32x2_t v551 = vadd_f32(v479, v549); - float32x2_t v552 = vsub_f32(v479, v549); - float32x2_t v553 = vadd_f32(v521, v507); - float32x2_t v554 = vsub_f32(v521, v507); - float32x2_t v555 = vadd_f32(v493, v535); - float32x2_t v556 = vsub_f32(v493, v535); - float32x2_t v635 = vadd_f32(v480, v550); - float32x2_t v636 = vsub_f32(v480, v550); - float32x2_t v637 = vadd_f32(v522, v508); - float32x2_t v638 = vsub_f32(v522, v508); - float32x2_t v639 = vadd_f32(v494, v536); - float32x2_t v640 = vsub_f32(v494, v536); - float32x2_t v557 = vadd_f32(v551, v553); - float32x2_t v560 = vsub_f32(v551, v553); - float32x2_t v561 = vsub_f32(v553, v555); - float32x2_t v562 = vsub_f32(v555, v551); - float32x2_t v563 = vadd_f32(v552, v554); - float32x2_t v565 = vsub_f32(v552, v554); - float32x2_t v566 = vsub_f32(v554, v556); - float32x2_t v567 = vsub_f32(v556, v552); - float32x2_t v641 = vadd_f32(v635, v637); - float32x2_t v644 = vsub_f32(v635, v637); - float32x2_t v645 = vsub_f32(v637, v639); - float32x2_t v646 = vsub_f32(v639, v635); - float32x2_t v647 = vadd_f32(v636, v638); - float32x2_t v649 = vsub_f32(v636, v638); - float32x2_t v650 = vsub_f32(v638, v640); - float32x2_t v651 = vsub_f32(v640, v636); - float32x2_t v558 = vadd_f32(v557, v555); - float32x2_t v564 = vadd_f32(v563, v556); - float32x2_t v579 = vmul_f32(v560, 
v662); - float32x2_t v583 = vmul_f32(v561, v666); - float32x2_t v587 = vmul_f32(v562, v670); - float32x2_t v600 = vrev64_f32(v565); - float32x2_t v607 = vrev64_f32(v566); - float32x2_t v614 = vrev64_f32(v567); - float32x2_t v642 = vadd_f32(v641, v639); - float32x2_t v648 = vadd_f32(v647, v640); - float32x2_t v663 = vmul_f32(v644, v662); - float32x2_t v667 = vmul_f32(v645, v666); - float32x2_t v671 = vmul_f32(v646, v670); - float32x2_t v684 = vrev64_f32(v649); - float32x2_t v691 = vrev64_f32(v650); - float32x2_t v698 = vrev64_f32(v651); - float32x2_t v559 = vadd_f32(v558, v465); - float32x2_t v575 = vmul_f32(v558, v658); - float32x2_t v593 = vrev64_f32(v564); - float32x2_t v601 = vmul_f32(v600, v683); - float32x2_t v608 = vmul_f32(v607, v690); - float32x2_t v615 = vmul_f32(v614, v697); - float32x2_t v643 = vadd_f32(v642, v466); - float32x2_t v659 = vmul_f32(v642, v658); - float32x2_t v677 = vrev64_f32(v648); - float32x2_t v685 = vmul_f32(v684, v683); - float32x2_t v692 = vmul_f32(v691, v690); - float32x2_t v699 = vmul_f32(v698, v697); - float32x2_t v594 = vmul_f32(v593, v676); - float32x2_t v616 = vadd_f32(v559, v575); - float32x2_t v678 = vmul_f32(v677, v676); - float32x2_t v700 = vadd_f32(v643, v659); - int16x4_t v721 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v559, 15), (int32x2_t){0, 0})); - int16x4_t v727 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v643, 15), (int32x2_t){0, 0})); - float32x2_t v617 = vadd_f32(v616, v579); - float32x2_t v619 = vsub_f32(v616, v579); - float32x2_t v621 = vsub_f32(v616, v583); - float32x2_t v623 = vadd_f32(v594, v601); - float32x2_t v625 = vsub_f32(v594, v601); - float32x2_t v627 = vsub_f32(v594, v608); - float32x2_t v701 = vadd_f32(v700, v663); - float32x2_t v703 = vsub_f32(v700, v663); - float32x2_t v705 = vsub_f32(v700, v667); - float32x2_t v707 = vadd_f32(v678, v685); - float32x2_t v709 = vsub_f32(v678, v685); - float32x2_t v711 = vsub_f32(v678, v692); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v721), 0); - v6[ostride * 7] = 
vget_lane_s32(vreinterpret_s32_s16(v727), 0); - float32x2_t v618 = vadd_f32(v617, v583); - float32x2_t v620 = vsub_f32(v619, v587); - float32x2_t v622 = vadd_f32(v621, v587); - float32x2_t v624 = vadd_f32(v623, v608); - float32x2_t v626 = vsub_f32(v625, v615); - float32x2_t v628 = vadd_f32(v627, v615); - float32x2_t v702 = vadd_f32(v701, v667); - float32x2_t v704 = vsub_f32(v703, v671); - float32x2_t v706 = vadd_f32(v705, v671); - float32x2_t v708 = vadd_f32(v707, v692); - float32x2_t v710 = vsub_f32(v709, v699); - float32x2_t v712 = vadd_f32(v711, v699); - float32x2_t v629 = vadd_f32(v618, v624); - float32x2_t v630 = vsub_f32(v618, v624); - float32x2_t v631 = vadd_f32(v620, v626); - float32x2_t v632 = vsub_f32(v620, v626); - float32x2_t v633 = vadd_f32(v622, v628); - float32x2_t v634 = vsub_f32(v622, v628); - float32x2_t v713 = vadd_f32(v702, v708); - float32x2_t v714 = vsub_f32(v702, v708); - float32x2_t v715 = vadd_f32(v704, v710); - float32x2_t v716 = vsub_f32(v704, v710); - float32x2_t v717 = vadd_f32(v706, v712); - float32x2_t v718 = vsub_f32(v706, v712); - int16x4_t v733 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v630, 15), (int32x2_t){0, 0})); - int16x4_t v739 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v714, 15), (int32x2_t){0, 0})); - int16x4_t v745 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v632, 15), (int32x2_t){0, 0})); - int16x4_t v751 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v716, 15), (int32x2_t){0, 0})); - int16x4_t v757 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v633, 15), (int32x2_t){0, 0})); - int16x4_t v763 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v717, 15), (int32x2_t){0, 0})); - int16x4_t v769 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v634, 15), (int32x2_t){0, 0})); - int16x4_t v775 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v718, 15), (int32x2_t){0, 0})); - int16x4_t v781 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v631, 15), (int32x2_t){0, 0})); - int16x4_t v787 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v715, 15), (int32x2_t){0, 0})); - 
int16x4_t v793 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v629, 15), (int32x2_t){0, 0})); - int16x4_t v799 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v713, 15), (int32x2_t){0, 0})); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v733), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v739), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v745), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v751), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v757), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v763), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v769), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v775), 0); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v781), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v787), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v793), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v799), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu14(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v256 = -1.1666666666666665e+00F; - float v261 = 7.9015646852540022e-01F; - float v266 = 5.5854267289647742e-02F; - float v271 = 7.3430220123575241e-01F; - float v276 = -4.4095855184409838e-01F; - float v283 = -3.4087293062393137e-01F; - float v290 = 5.3396936033772524e-01F; - float v297 = -8.7484229096165667e-01F; - const int32_t *v522 = &v5[v0]; - int32_t *v613 = &v6[v2]; - int64_t v27 = v0 * 7; - int64_t v37 = v0 * 2; - int64_t v45 
= v0 * 9; - int64_t v55 = v0 * 4; - int64_t v63 = v0 * 11; - int64_t v73 = v0 * 6; - int64_t v81 = v0 * 13; - int64_t v91 = v0 * 8; - int64_t v109 = v0 * 10; - int64_t v117 = v0 * 3; - int64_t v127 = v0 * 12; - int64_t v135 = v0 * 5; - float v279 = v4 * v276; - float v286 = v4 * v283; - float v293 = v4 * v290; - float v300 = v4 * v297; - int64_t v331 = v2 * 7; - int64_t v339 = v2 * 8; - int64_t v355 = v2 * 2; - int64_t v363 = v2 * 9; - int64_t v371 = v2 * 10; - int64_t v379 = v2 * 3; - int64_t v387 = v2 * 4; - int64_t v395 = v2 * 11; - int64_t v403 = v2 * 12; - int64_t v411 = v2 * 5; - int64_t v419 = v2 * 6; - int64_t v427 = v2 * 13; - const int32_t *v441 = &v5[0]; - svfloat32_t v571 = svdup_n_f32(v256); - svfloat32_t v572 = svdup_n_f32(v261); - svfloat32_t v573 = svdup_n_f32(v266); - svfloat32_t v574 = svdup_n_f32(v271); - int32_t *v586 = &v6[0]; - svfloat32_t v105 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v522[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v450 = &v5[v27]; - const int32_t *v459 = &v5[v37]; - const int32_t *v468 = &v5[v45]; - const int32_t *v477 = &v5[v55]; - const int32_t *v486 = &v5[v63]; - const int32_t *v495 = &v5[v73]; - const int32_t *v504 = &v5[v81]; - const int32_t *v513 = &v5[v91]; - const int32_t *v531 = &v5[v109]; - const int32_t *v540 = &v5[v117]; - const int32_t *v549 = &v5[v127]; - const int32_t *v558 = &v5[v135]; - svfloat32_t v575 = svdup_n_f32(v279); - svfloat32_t v576 = svdup_n_f32(v286); - svfloat32_t v577 = svdup_n_f32(v293); - svfloat32_t v578 = svdup_n_f32(v300); - int32_t *v595 = &v6[v331]; - int32_t *v604 = &v6[v339]; - int32_t *v622 = &v6[v355]; - int32_t *v631 = &v6[v363]; - int32_t *v640 = &v6[v371]; - int32_t *v649 = &v6[v379]; - int32_t *v658 = &v6[v387]; - int32_t *v667 = &v6[v395]; - int32_t *v676 = &v6[v403]; - int32_t *v685 = &v6[v411]; - int32_t *v694 = &v6[v419]; - int32_t *v703 = &v6[v427]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - 
svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v441[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v450[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v459[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v468[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v477[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v486[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v495[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v504[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v513[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v115 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v531[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v123 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v540[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v549[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v558[0])), - 1.F / (1ULL << 15ULL)); - 
svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v144) : "w"(v52), "w"(v142)); - svfloat32_t v145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v145) : "w"(v52), "w"(v142)); - svfloat32_t v146; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v146) : "w"(v106), "w"(v88)); - svfloat32_t v147; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v147) : "w"(v106), "w"(v88)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v70), "w"(v124)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v70), "w"(v124)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v53), "w"(v143)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v53), "w"(v143)); - svfloat32_t v235; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v107), "w"(v89)); - svfloat32_t v236; 
- asm("fsub %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v107), "w"(v89)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v71), "w"(v125)); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v71), "w"(v125)); - svfloat32_t v150; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v144), "w"(v146)); - svfloat32_t v153; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v144), "w"(v146)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v146), "w"(v148)); - svfloat32_t v155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v148), "w"(v144)); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v145), "w"(v147)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v145), "w"(v147)); - svfloat32_t v159; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v147), "w"(v149)); - svfloat32_t v160; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v149), "w"(v145)); - svfloat32_t v239; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v233), "w"(v235)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v233), "w"(v235)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v235), "w"(v237)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v237), "w"(v233)); - svfloat32_t v245; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v234), "w"(v236)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v234), "w"(v236)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v236), "w"(v238)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v238), "w"(v234)); - svfloat32_t v151; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v150), "w"(v148)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v156), "w"(v149)); - svfloat32_t zero199; - asm volatile("mov %0.s, #0" : "=w"(zero199)); - svfloat32_t v199 = svcmla_f32_x(pred_full, zero199, v576, v158, 90); - svfloat32_t zero206; - asm 
volatile("mov %0.s, #0" : "=w"(zero206)); - svfloat32_t v206 = svcmla_f32_x(pred_full, zero206, v577, v159, 90); - svfloat32_t zero213; - asm volatile("mov %0.s, #0" : "=w"(zero213)); - svfloat32_t v213 = svcmla_f32_x(pred_full, zero213, v578, v160, 90); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v239), "w"(v237)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v245), "w"(v238)); - svfloat32_t zero288; - asm volatile("mov %0.s, #0" : "=w"(zero288)); - svfloat32_t v288 = svcmla_f32_x(pred_full, zero288, v576, v247, 90); - svfloat32_t zero295; - asm volatile("mov %0.s, #0" : "=w"(zero295)); - svfloat32_t v295 = svcmla_f32_x(pred_full, zero295, v577, v248, 90); - svfloat32_t zero302; - asm volatile("mov %0.s, #0" : "=w"(zero302)); - svfloat32_t v302 = svcmla_f32_x(pred_full, zero302, v578, v249, 90); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v151), "w"(v34)); - svfloat32_t zero192; - asm volatile("mov %0.s, #0" : "=w"(zero192)); - svfloat32_t v192 = svcmla_f32_x(pred_full, zero192, v575, v157, 90); - svfloat32_t v241; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v240), "w"(v35)); - svfloat32_t zero281; - asm volatile("mov %0.s, #0" : "=w"(zero281)); - svfloat32_t v281 = svcmla_f32_x(pred_full, zero281, v575, v246, 90); - svfloat32_t v214 = svmla_f32_x(pred_full, v152, v151, v571); - svfloat32_t v221; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v192), "w"(v199)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v192), "w"(v199)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v192), "w"(v206)); - svfloat32_t v303 = svmla_f32_x(pred_full, v241, v240, v571); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v281), "w"(v288)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v281), "w"(v288)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v281), "w"(v295)); - svint16_t v324 = svtbl_s16( 
- svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v152, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v332 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v241, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v215 = svmla_f32_x(pred_full, v214, v153, v572); - svfloat32_t v217 = svmls_f32_x(pred_full, v214, v153, v572); - svfloat32_t v219 = svmls_f32_x(pred_full, v214, v154, v573); - svfloat32_t v222; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v221), "w"(v206)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v223), "w"(v213)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v225), "w"(v213)); - svfloat32_t v304 = svmla_f32_x(pred_full, v303, v242, v572); - svfloat32_t v306 = svmls_f32_x(pred_full, v303, v242, v572); - svfloat32_t v308 = svmls_f32_x(pred_full, v303, v243, v573); - svfloat32_t v311; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v310), "w"(v295)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v302)); - svfloat32_t v315; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v314), "w"(v302)); - svst1w_u64(pred_full, (unsigned *)(v586), svreinterpret_u64_s16(v324)); - svst1w_u64(pred_full, (unsigned *)(v595), svreinterpret_u64_s16(v332)); - svfloat32_t v216 = svmla_f32_x(pred_full, v215, v154, v573); - svfloat32_t v218 = svmls_f32_x(pred_full, v217, v155, v574); - svfloat32_t v220 = svmla_f32_x(pred_full, v219, v155, v574); - svfloat32_t v305 = svmla_f32_x(pred_full, v304, v243, v573); - svfloat32_t v307 = svmls_f32_x(pred_full, v306, v244, v574); - svfloat32_t v309 = svmla_f32_x(pred_full, v308, v244, v574); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v216), "w"(v222)); - svfloat32_t v228; - asm("fsub %0.s, %1.s, %2.s" 
: "=w"(v228) : "w"(v216), "w"(v222)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v218), "w"(v224)); - svfloat32_t v230; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v218), "w"(v224)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v220), "w"(v226)); - svfloat32_t v232; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v220), "w"(v226)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v305), "w"(v311)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v305), "w"(v311)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v307), "w"(v313)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v307), "w"(v313)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v309), "w"(v315)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v309), "w"(v315)); - svint16_t v340 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v228, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v348 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v317, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v356 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v230, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v364 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v319, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v372 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v231, (float)(1ULL << 31ULL)))), - 
svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v380 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v320, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v388 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v232, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v396 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v321, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v404 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v229, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v412 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v318, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v420 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v227, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v428 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v316, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v604), svreinterpret_u64_s16(v340)); - svst1w_u64(pred_full, (unsigned *)(v613), svreinterpret_u64_s16(v348)); - svst1w_u64(pred_full, (unsigned *)(v622), svreinterpret_u64_s16(v356)); - svst1w_u64(pred_full, (unsigned *)(v631), svreinterpret_u64_s16(v364)); - 
svst1w_u64(pred_full, (unsigned *)(v640), svreinterpret_u64_s16(v372)); - svst1w_u64(pred_full, (unsigned *)(v649), svreinterpret_u64_s16(v380)); - svst1w_u64(pred_full, (unsigned *)(v658), svreinterpret_u64_s16(v388)); - svst1w_u64(pred_full, (unsigned *)(v667), svreinterpret_u64_s16(v396)); - svst1w_u64(pred_full, (unsigned *)(v676), svreinterpret_u64_s16(v404)); - svst1w_u64(pred_full, (unsigned *)(v685), svreinterpret_u64_s16(v412)); - svst1w_u64(pred_full, (unsigned *)(v694), svreinterpret_u64_s16(v420)); - svst1w_u64(pred_full, (unsigned *)(v703), svreinterpret_u64_s16(v428)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v459 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v170 = -1.2500000000000000e+00F; - float v175 = 5.5901699437494745e-01F; - float v179 = 1.5388417685876268e+00F; - float v180 = -1.5388417685876268e+00F; - float v187 = 5.8778525229247325e-01F; - float v188 = -5.8778525229247325e-01F; - float v195 = 3.6327126400268028e-01F; - float v196 = -3.6327126400268028e-01F; - float v221 = -1.4999999999999998e+00F; - float v226 = 1.8749999999999998e+00F; - float v231 = -8.3852549156242107e-01F; - float v235 = -2.3082626528814396e+00F; - float v236 = 2.3082626528814396e+00F; - float v243 = -8.8167787843870971e-01F; - float v244 = 8.8167787843870971e-01F; - float v251 = -5.4490689600402031e-01F; - float v252 = 5.4490689600402031e-01F; - float v276 = 8.6602540378443871e-01F; - float v277 = -8.6602540378443871e-01F; - float v284 = -1.0825317547305484e+00F; - float v285 = 1.0825317547305484e+00F; - float v292 = 4.8412291827592718e-01F; - float v293 = -4.8412291827592718e-01F; - float32x2_t v295 = (float32x2_t){v4, v4}; - 
float v301 = -1.3326760640014592e+00F; - float v306 = -5.0903696045512736e-01F; - float v311 = -3.1460214309120460e-01F; - const int32_t *v901 = &v5[istride]; - int32_t *v1010 = &v6[ostride]; - float32x2_t v171 = (float32x2_t){v170, v170}; - float32x2_t v176 = (float32x2_t){v175, v175}; - float32x2_t v181 = (float32x2_t){v179, v180}; - float32x2_t v189 = (float32x2_t){v187, v188}; - float32x2_t v197 = (float32x2_t){v195, v196}; - float32x2_t v222 = (float32x2_t){v221, v221}; - float32x2_t v227 = (float32x2_t){v226, v226}; - float32x2_t v232 = (float32x2_t){v231, v231}; - float32x2_t v237 = (float32x2_t){v235, v236}; - float32x2_t v245 = (float32x2_t){v243, v244}; - float32x2_t v253 = (float32x2_t){v251, v252}; - float32x2_t v278 = (float32x2_t){v276, v277}; - float32x2_t v286 = (float32x2_t){v284, v285}; - float32x2_t v294 = (float32x2_t){v292, v293}; - float32x2_t v302 = (float32x2_t){v301, v301}; - float32x2_t v307 = (float32x2_t){v306, v306}; - float32x2_t v312 = (float32x2_t){v311, v311}; - const int32_t *v856 = &v5[0]; - int32_t *v974 = &v6[0]; - int16x4_t v1118 = vld1_s16((const int16_t *)v901); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1118), 15); - float32x4_t v172 = vcombine_f32(v171, v171); - float32x4_t v177 = vcombine_f32(v176, v176); - float32x2_t v183 = vmul_f32(v295, v181); - float32x2_t v191 = vmul_f32(v295, v189); - float32x2_t v199 = vmul_f32(v295, v197); - float32x4_t v223 = vcombine_f32(v222, v222); - float32x4_t v228 = vcombine_f32(v227, v227); - float32x4_t v233 = vcombine_f32(v232, v232); - float32x2_t v239 = vmul_f32(v295, v237); - float32x2_t v247 = vmul_f32(v295, v245); - float32x2_t v255 = vmul_f32(v295, v253); - float32x2_t v280 = vmul_f32(v295, v278); - float32x2_t v288 = vmul_f32(v295, v286); - float32x2_t v296 = vmul_f32(v295, v294); - float32x4_t v303 = vcombine_f32(v302, v302); - float32x4_t v308 = vcombine_f32(v307, v307); - float32x4_t v313 = vcombine_f32(v312, v312); - const int32_t *v837 = &v5[istride * 5]; - const int32_t 
*v846 = &v5[istride * 10]; - const int32_t *v865 = &v5[istride * 8]; - const int32_t *v874 = &v5[istride * 13]; - const int32_t *v883 = &v5[istride * 3]; - const int32_t *v892 = &v5[istride * 11]; - const int32_t *v910 = &v5[istride * 6]; - const int32_t *v919 = &v5[istride * 14]; - const int32_t *v928 = &v5[istride * 4]; - const int32_t *v937 = &v5[istride * 9]; - const int32_t *v946 = &v5[istride * 2]; - const int32_t *v955 = &v5[istride * 7]; - const int32_t *v964 = &v5[istride * 12]; - int32_t *v983 = &v6[ostride * 10]; - int32_t *v992 = &v6[ostride * 5]; - int32_t *v1001 = &v6[ostride * 6]; - int32_t *v1019 = &v6[ostride * 11]; - int32_t *v1028 = &v6[ostride * 12]; - int32_t *v1037 = &v6[ostride * 7]; - int32_t *v1046 = &v6[ostride * 2]; - int32_t *v1055 = &v6[ostride * 3]; - int32_t *v1064 = &v6[ostride * 13]; - int32_t *v1073 = &v6[ostride * 8]; - int32_t *v1082 = &v6[ostride * 9]; - int32_t *v1091 = &v6[ostride * 4]; - int32_t *v1100 = &v6[ostride * 14]; - int16x4_t v1108 = vld1_s16((const int16_t *)v856); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1108), 15); - float32x4_t v185 = vcombine_f32(v183, v183); - float32x4_t v193 = vcombine_f32(v191, v191); - float32x4_t v201 = vcombine_f32(v199, v199); - float32x4_t v241 = vcombine_f32(v239, v239); - float32x4_t v249 = vcombine_f32(v247, v247); - float32x4_t v257 = vcombine_f32(v255, v255); - float32x4_t v282 = vcombine_f32(v280, v280); - float32x4_t v290 = vcombine_f32(v288, v288); - float32x4_t v298 = vcombine_f32(v296, v296); - int16x4_t v1104 = vld1_s16((const int16_t *)v837); - int16x4_t v1106 = vld1_s16((const int16_t *)v846); - int16x4_t v1110 = vld1_s16((const int16_t *)v865); - int16x4_t v1112 = vld1_s16((const int16_t *)v874); - int16x4_t v1114 = vld1_s16((const int16_t *)v883); - int16x4_t v1116 = vld1_s16((const int16_t *)v892); - int16x4_t v1120 = vld1_s16((const int16_t *)v910); - int16x4_t v1122 = vld1_s16((const int16_t *)v919); - int16x4_t v1124 = vld1_s16((const int16_t *)v928); - 
int16x4_t v1126 = vld1_s16((const int16_t *)v937); - int16x4_t v1128 = vld1_s16((const int16_t *)v946); - int16x4_t v1130 = vld1_s16((const int16_t *)v955); - int16x4_t v1132 = vld1_s16((const int16_t *)v964); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1104), 15); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1106), 15); - float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1110), 15); - float32x4_t v63 = vcvtq_n_f32_s32(vmovl_s16(v1112), 15); - float32x4_t v73 = vcvtq_n_f32_s32(vmovl_s16(v1114), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1116), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1120), 15); - float32x4_t v109 = vcvtq_n_f32_s32(vmovl_s16(v1122), 15); - float32x4_t v117 = vcvtq_n_f32_s32(vmovl_s16(v1124), 15); - float32x4_t v127 = vcvtq_n_f32_s32(vmovl_s16(v1126), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1128), 15); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1130), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1132), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v64 = vaddq_f32(v55, v63); - float32x4_t v65 = vsubq_f32(v55, v63); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v118 = vaddq_f32(v109, v117); - float32x4_t v119 = vsubq_f32(v109, v117); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v47 = vaddq_f32(v37, v46); - float32x4_t v74 = vaddq_f32(v64, v73); - float32x4_t v101 = vaddq_f32(v91, v100); - float32x4_t v128 = vaddq_f32(v118, v127); - float32x4_t v155 = vaddq_f32(v145, v154); - float32x4_t v212 = vaddq_f32(v64, v145); - float32x4_t v213 = vsubq_f32(v64, v145); - float32x4_t v214 = vaddq_f32(v118, v91); - float32x4_t v215 = vsubq_f32(v118, v91); - float32x4_t v268 = vaddq_f32(v65, v146); - float32x4_t v269 = vsubq_f32(v65, v146); - float32x4_t v270 = vaddq_f32(v119, v92); - float32x4_t v271 = vsubq_f32(v119, v92); - float32x4_t v156 = vaddq_f32(v74, 
v155); - float32x4_t v157 = vsubq_f32(v74, v155); - float32x4_t v158 = vaddq_f32(v128, v101); - float32x4_t v159 = vsubq_f32(v128, v101); - float32x4_t v216 = vaddq_f32(v212, v214); - float32x4_t v217 = vsubq_f32(v212, v214); - float32x4_t v218 = vaddq_f32(v213, v215); - float32x4_t v240 = vrev64q_f32(v213); - float32x4_t v256 = vrev64q_f32(v215); - float32x4_t v272 = vaddq_f32(v268, v270); - float32x4_t v273 = vsubq_f32(v268, v270); - float32x4_t v274 = vaddq_f32(v269, v271); - float32x4_t v304 = vmulq_f32(v269, v303); - float32x4_t v314 = vmulq_f32(v271, v313); - float32x4_t v160 = vaddq_f32(v156, v158); - float32x4_t v161 = vsubq_f32(v156, v158); - float32x4_t v162 = vaddq_f32(v157, v159); - float32x4_t v184 = vrev64q_f32(v157); - float32x4_t v200 = vrev64q_f32(v159); - float32x4_t v219 = vaddq_f32(v216, v37); - float32x4_t v229 = vmulq_f32(v216, v228); - float32x4_t v234 = vmulq_f32(v217, v233); - float32x4_t v242 = vmulq_f32(v240, v241); - float32x4_t v248 = vrev64q_f32(v218); - float32x4_t v258 = vmulq_f32(v256, v257); - float32x4_t v275 = vaddq_f32(v272, v38); - float32x4_t v289 = vrev64q_f32(v272); - float32x4_t v297 = vrev64q_f32(v273); - float32x4_t v309 = vmulq_f32(v274, v308); - float32x4_t v163 = vaddq_f32(v160, v47); - float32x4_t v173 = vmulq_f32(v160, v172); - float32x4_t v178 = vmulq_f32(v161, v177); - float32x4_t v186 = vmulq_f32(v184, v185); - float32x4_t v192 = vrev64q_f32(v162); - float32x4_t v202 = vmulq_f32(v200, v201); - float32x4_t v224 = vmulq_f32(v219, v223); - float32x4_t v250 = vmulq_f32(v248, v249); - float32x4_t v281 = vrev64q_f32(v275); - float32x4_t v291 = vmulq_f32(v289, v290); - float32x4_t v299 = vmulq_f32(v297, v298); - float32x4_t v318 = vsubq_f32(v304, v309); - float32x4_t v319 = vaddq_f32(v309, v314); - float32x4_t v194 = vmulq_f32(v192, v193); - float32x4_t v203 = vaddq_f32(v163, v173); - float32x4_t v259 = vaddq_f32(v224, v229); - float32x4_t v262 = vsubq_f32(v242, v250); - float32x4_t v263 = vaddq_f32(v250, v258); - 
float32x4_t v283 = vmulq_f32(v281, v282); - float32x4_t v324 = vaddq_f32(v163, v224); - int16x4_t v329 = vqmovn_s32(vcvtq_n_s32_f32(v163, 15)); - float32x4_t v204 = vaddq_f32(v203, v178); - float32x4_t v205 = vsubq_f32(v203, v178); - float32x4_t v206 = vsubq_f32(v186, v194); - float32x4_t v207 = vaddq_f32(v194, v202); - float32x4_t v260 = vaddq_f32(v259, v234); - float32x4_t v261 = vsubq_f32(v259, v234); - float32x4_t v315 = vaddq_f32(v283, v291); - float32x4_t v325 = vaddq_f32(v324, v283); - float32x4_t v326 = vsubq_f32(v324, v283); - vst1_s16((int16_t *)v974, v329); - float32x4_t v208 = vaddq_f32(v204, v206); - float32x4_t v209 = vsubq_f32(v204, v206); - float32x4_t v210 = vaddq_f32(v205, v207); - float32x4_t v211 = vsubq_f32(v205, v207); - float32x4_t v264 = vaddq_f32(v260, v262); - float32x4_t v265 = vsubq_f32(v260, v262); - float32x4_t v266 = vaddq_f32(v261, v263); - float32x4_t v267 = vsubq_f32(v261, v263); - float32x4_t v316 = vaddq_f32(v315, v299); - float32x4_t v317 = vsubq_f32(v315, v299); - int16x4_t v337 = vqmovn_s32(vcvtq_n_s32_f32(v326, 15)); - int16x4_t v345 = vqmovn_s32(vcvtq_n_s32_f32(v325, 15)); - float32x4_t v320 = vaddq_f32(v316, v318); - float32x4_t v321 = vsubq_f32(v316, v318); - float32x4_t v322 = vaddq_f32(v317, v319); - float32x4_t v323 = vsubq_f32(v317, v319); - float32x4_t v351 = vaddq_f32(v209, v265); - int16x4_t v356 = vqmovn_s32(vcvtq_n_s32_f32(v209, 15)); - float32x4_t v378 = vaddq_f32(v211, v267); - int16x4_t v383 = vqmovn_s32(vcvtq_n_s32_f32(v211, 15)); - float32x4_t v405 = vaddq_f32(v210, v266); - int16x4_t v410 = vqmovn_s32(vcvtq_n_s32_f32(v210, 15)); - float32x4_t v432 = vaddq_f32(v208, v264); - int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v208, 15)); - vst1_s16((int16_t *)v983, v337); - vst1_s16((int16_t *)v992, v345); - float32x4_t v352 = vaddq_f32(v351, v321); - float32x4_t v353 = vsubq_f32(v351, v321); - float32x4_t v379 = vaddq_f32(v378, v323); - float32x4_t v380 = vsubq_f32(v378, v323); - float32x4_t v406 = vaddq_f32(v405, 
v322); - float32x4_t v407 = vsubq_f32(v405, v322); - float32x4_t v433 = vaddq_f32(v432, v320); - float32x4_t v434 = vsubq_f32(v432, v320); - vst1_s16((int16_t *)v1001, v356); - vst1_s16((int16_t *)v1028, v383); - vst1_s16((int16_t *)v1055, v410); - vst1_s16((int16_t *)v1082, v437); - int16x4_t v364 = vqmovn_s32(vcvtq_n_s32_f32(v353, 15)); - int16x4_t v372 = vqmovn_s32(vcvtq_n_s32_f32(v352, 15)); - int16x4_t v391 = vqmovn_s32(vcvtq_n_s32_f32(v380, 15)); - int16x4_t v399 = vqmovn_s32(vcvtq_n_s32_f32(v379, 15)); - int16x4_t v418 = vqmovn_s32(vcvtq_n_s32_f32(v407, 15)); - int16x4_t v426 = vqmovn_s32(vcvtq_n_s32_f32(v406, 15)); - int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v434, 15)); - int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v433, 15)); - vst1_s16((int16_t *)v1010, v364); - vst1_s16((int16_t *)v1019, v372); - vst1_s16((int16_t *)v1037, v391); - vst1_s16((int16_t *)v1046, v399); - vst1_s16((int16_t *)v1064, v418); - vst1_s16((int16_t *)v1073, v426); - vst1_s16((int16_t *)v1091, v445); - vst1_s16((int16_t *)v1100, v453); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v459 * 2; j < howmany; j += 1) { - int16x4_t v519 = vld1s_s16(&v5[istride]); - float v585 = -1.2500000000000000e+00F; - float v589 = 5.5901699437494745e-01F; - float v592 = 1.5388417685876268e+00F; - float v593 = -1.5388417685876268e+00F; - float v599 = 5.8778525229247325e-01F; - float v600 = -5.8778525229247325e-01F; - float v606 = 3.6327126400268028e-01F; - float v607 = -3.6327126400268028e-01F; - float v631 = -1.4999999999999998e+00F; - float v635 = 1.8749999999999998e+00F; - float v639 = -8.3852549156242107e-01F; - float v642 = -2.3082626528814396e+00F; - float v643 = 2.3082626528814396e+00F; - float v649 = -8.8167787843870971e-01F; - float v650 = 8.8167787843870971e-01F; - float v656 = -5.4490689600402031e-01F; - float v657 = 5.4490689600402031e-01F; - float v680 = 8.6602540378443871e-01F; - float v681 = -8.6602540378443871e-01F; - float v687 = -1.0825317547305484e+00F; - float v688 = 
1.0825317547305484e+00F; - float v694 = 4.8412291827592718e-01F; - float v695 = -4.8412291827592718e-01F; - float32x2_t v697 = (float32x2_t){v4, v4}; - float v702 = -1.3326760640014592e+00F; - float v706 = -5.0903696045512736e-01F; - float v710 = -3.1460214309120460e-01F; - int16x4_t v485 = vld1s_s16(&v5[0]); - float32x2_t v520 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v519)), 15); - float32x2_t v586 = (float32x2_t){v585, v585}; - float32x2_t v590 = (float32x2_t){v589, v589}; - float32x2_t v594 = (float32x2_t){v592, v593}; - float32x2_t v601 = (float32x2_t){v599, v600}; - float32x2_t v608 = (float32x2_t){v606, v607}; - float32x2_t v632 = (float32x2_t){v631, v631}; - float32x2_t v636 = (float32x2_t){v635, v635}; - float32x2_t v640 = (float32x2_t){v639, v639}; - float32x2_t v644 = (float32x2_t){v642, v643}; - float32x2_t v651 = (float32x2_t){v649, v650}; - float32x2_t v658 = (float32x2_t){v656, v657}; - float32x2_t v682 = (float32x2_t){v680, v681}; - float32x2_t v689 = (float32x2_t){v687, v688}; - float32x2_t v696 = (float32x2_t){v694, v695}; - float32x2_t v703 = (float32x2_t){v702, v702}; - float32x2_t v707 = (float32x2_t){v706, v706}; - float32x2_t v711 = (float32x2_t){v710, v710}; - int16x4_t v471 = vld1s_s16(&v5[istride * 5]); - int16x4_t v477 = vld1s_s16(&v5[istride * 10]); - float32x2_t v486 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v485)), 15); - int16x4_t v492 = vld1s_s16(&v5[istride * 8]); - int16x4_t v498 = vld1s_s16(&v5[istride * 13]); - int16x4_t v506 = vld1s_s16(&v5[istride * 3]); - int16x4_t v513 = vld1s_s16(&v5[istride * 11]); - int16x4_t v527 = vld1s_s16(&v5[istride * 6]); - int16x4_t v534 = vld1s_s16(&v5[istride * 14]); - int16x4_t v540 = vld1s_s16(&v5[istride * 4]); - int16x4_t v548 = vld1s_s16(&v5[istride * 9]); - int16x4_t v555 = vld1s_s16(&v5[istride * 2]); - int16x4_t v561 = vld1s_s16(&v5[istride * 7]); - int16x4_t v569 = vld1s_s16(&v5[istride * 12]); - float32x2_t v596 = vmul_f32(v697, v594); - float32x2_t v603 = vmul_f32(v697, v601); - float32x2_t 
v610 = vmul_f32(v697, v608); - float32x2_t v646 = vmul_f32(v697, v644); - float32x2_t v653 = vmul_f32(v697, v651); - float32x2_t v660 = vmul_f32(v697, v658); - float32x2_t v684 = vmul_f32(v697, v682); - float32x2_t v691 = vmul_f32(v697, v689); - float32x2_t v698 = vmul_f32(v697, v696); - float32x2_t v472 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v471)), 15); - float32x2_t v478 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v477)), 15); - float32x2_t v493 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v492)), 15); - float32x2_t v499 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v498)), 15); - float32x2_t v507 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v506)), 15); - float32x2_t v514 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v513)), 15); - float32x2_t v528 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v527)), 15); - float32x2_t v535 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v534)), 15); - float32x2_t v541 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v540)), 15); - float32x2_t v549 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v548)), 15); - float32x2_t v556 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v555)), 15); - float32x2_t v562 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v561)), 15); - float32x2_t v570 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v569)), 15); - float32x2_t v479 = vadd_f32(v472, v478); - float32x2_t v480 = vsub_f32(v472, v478); - float32x2_t v500 = vadd_f32(v493, v499); - float32x2_t v501 = vsub_f32(v493, v499); - float32x2_t v521 = vadd_f32(v514, v520); - float32x2_t v522 = vsub_f32(v514, v520); - float32x2_t v542 = vadd_f32(v535, v541); - float32x2_t v543 = vsub_f32(v535, v541); - float32x2_t v563 = vadd_f32(v556, v562); - float32x2_t v564 = vsub_f32(v556, v562); - float32x2_t v487 = vadd_f32(v479, v486); - float32x2_t v508 = vadd_f32(v500, v507); - float32x2_t v529 = vadd_f32(v521, v528); - float32x2_t v550 = vadd_f32(v542, v549); - float32x2_t v571 = vadd_f32(v563, v570); - float32x2_t v622 = vadd_f32(v500, v563); - float32x2_t v623 = vsub_f32(v500, v563); - float32x2_t v624 = vadd_f32(v542, v521); - 
float32x2_t v625 = vsub_f32(v542, v521); - float32x2_t v672 = vadd_f32(v501, v564); - float32x2_t v673 = vsub_f32(v501, v564); - float32x2_t v674 = vadd_f32(v543, v522); - float32x2_t v675 = vsub_f32(v543, v522); - float32x2_t v572 = vadd_f32(v508, v571); - float32x2_t v573 = vsub_f32(v508, v571); - float32x2_t v574 = vadd_f32(v550, v529); - float32x2_t v575 = vsub_f32(v550, v529); - float32x2_t v626 = vadd_f32(v622, v624); - float32x2_t v627 = vsub_f32(v622, v624); - float32x2_t v628 = vadd_f32(v623, v625); - float32x2_t v647 = vrev64_f32(v623); - float32x2_t v661 = vrev64_f32(v625); - float32x2_t v676 = vadd_f32(v672, v674); - float32x2_t v677 = vsub_f32(v672, v674); - float32x2_t v678 = vadd_f32(v673, v675); - float32x2_t v704 = vmul_f32(v673, v703); - float32x2_t v712 = vmul_f32(v675, v711); - float32x2_t v576 = vadd_f32(v572, v574); - float32x2_t v577 = vsub_f32(v572, v574); - float32x2_t v578 = vadd_f32(v573, v575); - float32x2_t v597 = vrev64_f32(v573); - float32x2_t v611 = vrev64_f32(v575); - float32x2_t v629 = vadd_f32(v626, v479); - float32x2_t v637 = vmul_f32(v626, v636); - float32x2_t v641 = vmul_f32(v627, v640); - float32x2_t v648 = vmul_f32(v647, v646); - float32x2_t v654 = vrev64_f32(v628); - float32x2_t v662 = vmul_f32(v661, v660); - float32x2_t v679 = vadd_f32(v676, v480); - float32x2_t v692 = vrev64_f32(v676); - float32x2_t v699 = vrev64_f32(v677); - float32x2_t v708 = vmul_f32(v678, v707); - float32x2_t v579 = vadd_f32(v576, v487); - float32x2_t v587 = vmul_f32(v576, v586); - float32x2_t v591 = vmul_f32(v577, v590); - float32x2_t v598 = vmul_f32(v597, v596); - float32x2_t v604 = vrev64_f32(v578); - float32x2_t v612 = vmul_f32(v611, v610); - float32x2_t v633 = vmul_f32(v629, v632); - float32x2_t v655 = vmul_f32(v654, v653); - float32x2_t v685 = vrev64_f32(v679); - float32x2_t v693 = vmul_f32(v692, v691); - float32x2_t v700 = vmul_f32(v699, v698); - float32x2_t v716 = vsub_f32(v704, v708); - float32x2_t v717 = vadd_f32(v708, v712); - float32x2_t 
v605 = vmul_f32(v604, v603); - float32x2_t v613 = vadd_f32(v579, v587); - float32x2_t v663 = vadd_f32(v633, v637); - float32x2_t v666 = vsub_f32(v648, v655); - float32x2_t v667 = vadd_f32(v655, v662); - float32x2_t v686 = vmul_f32(v685, v684); - float32x2_t v722 = vadd_f32(v579, v633); - int16x4_t v727 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v579, 15), (int32x2_t){0, 0})); - float32x2_t v614 = vadd_f32(v613, v591); - float32x2_t v615 = vsub_f32(v613, v591); - float32x2_t v616 = vsub_f32(v598, v605); - float32x2_t v617 = vadd_f32(v605, v612); - float32x2_t v664 = vadd_f32(v663, v641); - float32x2_t v665 = vsub_f32(v663, v641); - float32x2_t v713 = vadd_f32(v686, v693); - float32x2_t v723 = vadd_f32(v722, v686); - float32x2_t v724 = vsub_f32(v722, v686); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v727), 0); - float32x2_t v618 = vadd_f32(v614, v616); - float32x2_t v619 = vsub_f32(v614, v616); - float32x2_t v620 = vadd_f32(v615, v617); - float32x2_t v621 = vsub_f32(v615, v617); - float32x2_t v668 = vadd_f32(v664, v666); - float32x2_t v669 = vsub_f32(v664, v666); - float32x2_t v670 = vadd_f32(v665, v667); - float32x2_t v671 = vsub_f32(v665, v667); - float32x2_t v714 = vadd_f32(v713, v700); - float32x2_t v715 = vsub_f32(v713, v700); - int16x4_t v733 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v724, 15), (int32x2_t){0, 0})); - int16x4_t v739 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v723, 15), (int32x2_t){0, 0})); - float32x2_t v718 = vadd_f32(v714, v716); - float32x2_t v719 = vsub_f32(v714, v716); - float32x2_t v720 = vadd_f32(v715, v717); - float32x2_t v721 = vsub_f32(v715, v717); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v733), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v739), 0); - float32x2_t v743 = vadd_f32(v619, v669); - int16x4_t v748 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v619, 15), (int32x2_t){0, 0})); - float32x2_t v764 = vadd_f32(v621, v671); - int16x4_t v769 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v621, 15), 
(int32x2_t){0, 0})); - float32x2_t v785 = vadd_f32(v620, v670); - int16x4_t v790 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v620, 15), (int32x2_t){0, 0})); - float32x2_t v806 = vadd_f32(v618, v668); - int16x4_t v811 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v618, 15), (int32x2_t){0, 0})); - float32x2_t v744 = vadd_f32(v743, v719); - float32x2_t v745 = vsub_f32(v743, v719); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v748), 0); - float32x2_t v765 = vadd_f32(v764, v721); - float32x2_t v766 = vsub_f32(v764, v721); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v769), 0); - float32x2_t v786 = vadd_f32(v785, v720); - float32x2_t v787 = vsub_f32(v785, v720); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v790), 0); - float32x2_t v807 = vadd_f32(v806, v718); - float32x2_t v808 = vsub_f32(v806, v718); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v811), 0); - int16x4_t v754 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v745, 15), (int32x2_t){0, 0})); - int16x4_t v760 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v744, 15), (int32x2_t){0, 0})); - int16x4_t v775 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v766, 15), (int32x2_t){0, 0})); - int16x4_t v781 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v765, 15), (int32x2_t){0, 0})); - int16x4_t v796 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v787, 15), (int32x2_t){0, 0})); - int16x4_t v802 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v786, 15), (int32x2_t){0, 0})); - int16x4_t v817 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v808, 15), (int32x2_t){0, 0})); - int16x4_t v823 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v807, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v754), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v760), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v775), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v781), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v796), 0); - v6[ostride * 8] = 
vget_lane_s32(vreinterpret_s32_s16(v802), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v817), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v823), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu15(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v167 = -1.2500000000000000e+00F; - float v172 = 5.5901699437494745e-01F; - float v177 = -1.5388417685876268e+00F; - float v184 = -5.8778525229247325e-01F; - float v191 = -3.6327126400268028e-01F; - float v215 = -1.4999999999999998e+00F; - float v220 = 1.8749999999999998e+00F; - float v225 = -8.3852549156242107e-01F; - float v230 = 2.3082626528814396e+00F; - float v237 = 8.8167787843870971e-01F; - float v244 = 5.4490689600402031e-01F; - float v268 = -8.6602540378443871e-01F; - float v275 = 1.0825317547305484e+00F; - float v282 = -4.8412291827592718e-01F; - float v289 = -1.3326760640014592e+00F; - float v294 = -5.0903696045512736e-01F; - float v299 = -3.1460214309120460e-01F; - const int32_t *v517 = &v5[v0]; - int32_t *v644 = &v6[v2]; - int64_t v19 = v0 * 5; - int64_t v27 = v0 * 10; - int64_t v46 = v0 * 8; - int64_t v54 = v0 * 13; - int64_t v64 = v0 * 3; - int64_t v73 = v0 * 11; - int64_t v91 = v0 * 6; - int64_t v100 = v0 * 14; - int64_t v108 = v0 * 4; - int64_t v118 = v0 * 9; - int64_t v127 = v0 * 2; - int64_t v135 = v0 * 7; - int64_t v145 = v0 * 12; - float v180 = v4 * v177; - float v187 = v4 * v184; - float v194 = v4 * v191; - float v233 = v4 * v230; - float v240 = v4 * v237; - float v247 = v4 * v244; - float 
v271 = v4 * v268; - float v278 = v4 * v275; - float v285 = v4 * v282; - int64_t v324 = v2 * 10; - int64_t v332 = v2 * 5; - int64_t v343 = v2 * 6; - int64_t v359 = v2 * 11; - int64_t v370 = v2 * 12; - int64_t v378 = v2 * 7; - int64_t v386 = v2 * 2; - int64_t v397 = v2 * 3; - int64_t v405 = v2 * 13; - int64_t v413 = v2 * 8; - int64_t v424 = v2 * 9; - int64_t v432 = v2 * 4; - int64_t v440 = v2 * 14; - const int32_t *v472 = &v5[0]; - svfloat32_t v584 = svdup_n_f32(v167); - svfloat32_t v585 = svdup_n_f32(v172); - svfloat32_t v589 = svdup_n_f32(v215); - svfloat32_t v590 = svdup_n_f32(v220); - svfloat32_t v591 = svdup_n_f32(v225); - svfloat32_t v598 = svdup_n_f32(v289); - svfloat32_t v599 = svdup_n_f32(v294); - svfloat32_t v600 = svdup_n_f32(v299); - int32_t *v608 = &v6[0]; - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v517[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v453 = &v5[v19]; - const int32_t *v462 = &v5[v27]; - const int32_t *v481 = &v5[v46]; - const int32_t *v490 = &v5[v54]; - const int32_t *v499 = &v5[v64]; - const int32_t *v508 = &v5[v73]; - const int32_t *v526 = &v5[v91]; - const int32_t *v535 = &v5[v100]; - const int32_t *v544 = &v5[v108]; - const int32_t *v553 = &v5[v118]; - const int32_t *v562 = &v5[v127]; - const int32_t *v571 = &v5[v135]; - const int32_t *v580 = &v5[v145]; - svfloat32_t v586 = svdup_n_f32(v180); - svfloat32_t v587 = svdup_n_f32(v187); - svfloat32_t v588 = svdup_n_f32(v194); - svfloat32_t v592 = svdup_n_f32(v233); - svfloat32_t v593 = svdup_n_f32(v240); - svfloat32_t v594 = svdup_n_f32(v247); - svfloat32_t v595 = svdup_n_f32(v271); - svfloat32_t v596 = svdup_n_f32(v278); - svfloat32_t v597 = svdup_n_f32(v285); - int32_t *v617 = &v6[v324]; - int32_t *v626 = &v6[v332]; - int32_t *v635 = &v6[v343]; - int32_t *v653 = &v6[v359]; - int32_t *v662 = &v6[v370]; - int32_t *v671 = &v6[v378]; - int32_t *v680 = &v6[v386]; - int32_t *v689 = &v6[v397]; - int32_t *v698 = 
&v6[v405]; - int32_t *v707 = &v6[v413]; - int32_t *v716 = &v6[v424]; - int32_t *v725 = &v6[v432]; - int32_t *v734 = &v6[v440]; - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v472[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v453[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v462[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v52 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v481[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v60 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v490[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v70 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v499[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v508[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v526[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v106 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v535[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v114 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v544[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v124 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v553[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v562[0])), - 1.F / 
(1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v571[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v580[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v60)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v52), "w"(v60)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v106), "w"(v114)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v114)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t v71; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v70)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v88), "w"(v97)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v124)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v142), "w"(v151)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v61), "w"(v142)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v61), "w"(v142)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v115), "w"(v88)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v115), "w"(v88)); - svfloat32_t v259; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v259) : "w"(v62), "w"(v143)); - svfloat32_t v260; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v62), "w"(v143)); - svfloat32_t v261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v116), "w"(v89)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v116), "w"(v89)); - svfloat32_t v153; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v153) : "w"(v71), "w"(v152)); - svfloat32_t v154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v154) : "w"(v71), "w"(v152)); - svfloat32_t v155; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v155) : "w"(v125), "w"(v98)); - svfloat32_t v156; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v125), "w"(v98)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v206), "w"(v208)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v206), "w"(v208)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v207), "w"(v209)); - svfloat32_t zero235; - asm volatile("mov %0.s, #0" : "=w"(zero235)); - svfloat32_t v235 = svcmla_f32_x(pred_full, zero235, v592, v207, 90); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v259), "w"(v261)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v259), "w"(v261)); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v260), "w"(v262)); - svfloat32_t v302; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v262), "w"(v600)); - svfloat32_t v157; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v157) : "w"(v153), "w"(v155)); - svfloat32_t v158; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v158) : "w"(v153), "w"(v155)); - svfloat32_t v159; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v159) : "w"(v154), "w"(v156)); - svfloat32_t zero182; - asm volatile("mov %0.s, #0" : "=w"(zero182)); - svfloat32_t v182 = svcmla_f32_x(pred_full, zero182, v586, v154, 90); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v210), "w"(v34)); - svfloat32_t v223; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v210), 
"w"(v590)); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); - svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v593, v212, 90); - svfloat32_t v266; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v263), "w"(v35)); - svfloat32_t zero287; - asm volatile("mov %0.s, #0" : "=w"(zero287)); - svfloat32_t v287 = svcmla_f32_x(pred_full, zero287, v597, v264, 90); - svfloat32_t v297; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v265), "w"(v599)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v157), "w"(v44)); - svfloat32_t zero189; - asm volatile("mov %0.s, #0" : "=w"(zero189)); - svfloat32_t v189 = svcmla_f32_x(pred_full, zero189, v587, v159, 90); - svfloat32_t v253; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v253) : "w"(v235), "w"(v242)); - svfloat32_t v254 = svcmla_f32_x(pred_full, v242, v594, v209, 90); - svfloat32_t zero273; - asm volatile("mov %0.s, #0" : "=w"(zero273)); - svfloat32_t v273 = svcmla_f32_x(pred_full, zero273, v595, v266, 90); - svfloat32_t v306 = svnmls_f32_x(pred_full, v297, v260, v598); - svfloat32_t v307 = svmla_f32_x(pred_full, v302, v265, v599); - svfloat32_t v197 = svmla_f32_x(pred_full, v160, v157, v584); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : "w"(v182), "w"(v189)); - svfloat32_t v201 = svcmla_f32_x(pred_full, v189, v588, v156, 90); - svfloat32_t v250 = svmla_f32_x(pred_full, v223, v213, v589); - svfloat32_t v303 = svcmla_f32_x(pred_full, v273, v596, v263, 90); - svfloat32_t v312 = svmla_f32_x(pred_full, v160, v213, v589); - svint16_t v317 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v160, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v198 = svmla_f32_x(pred_full, v197, v158, v585); - svfloat32_t v199 = svmls_f32_x(pred_full, v197, v158, v585); - svfloat32_t v251 = svmla_f32_x(pred_full, v250, v211, v591); - svfloat32_t v252 = 
svmls_f32_x(pred_full, v250, v211, v591); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v303), "w"(v287)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v303), "w"(v287)); - svfloat32_t v313; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v312), "w"(v273)); - svfloat32_t v314; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v312), "w"(v273)); - svst1w_u64(pred_full, (unsigned *)(v608), svreinterpret_u64_s16(v317)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v198), "w"(v200)); - svfloat32_t v203; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v198), "w"(v200)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v199), "w"(v201)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v199), "w"(v201)); - svfloat32_t v255; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v251), "w"(v253)); - svfloat32_t v256; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v251), "w"(v253)); - svfloat32_t v257; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v252), "w"(v254)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v252), "w"(v254)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v304), "w"(v306)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v304), "w"(v306)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v305), "w"(v307)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v305), "w"(v307)); - svint16_t v325 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v314, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v333 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t 
v339; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v203), "w"(v256)); - svint16_t v344 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v203, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v366; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v205), "w"(v258)); - svint16_t v371 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v205, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v204), "w"(v257)); - svint16_t v398 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v204, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v202), "w"(v255)); - svint16_t v425 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v202, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v617), svreinterpret_u64_s16(v325)); - svst1w_u64(pred_full, (unsigned *)(v626), svreinterpret_u64_s16(v333)); - svfloat32_t v340; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v340) : "w"(v339), "w"(v309)); - svfloat32_t v341; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v341) : "w"(v339), "w"(v309)); - svfloat32_t v367; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v366), "w"(v311)); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v366), "w"(v311)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v310)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v393), "w"(v310)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : 
"w"(v420), "w"(v308)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v420), "w"(v308)); - svst1w_u64(pred_full, (unsigned *)(v635), svreinterpret_u64_s16(v344)); - svst1w_u64(pred_full, (unsigned *)(v662), svreinterpret_u64_s16(v371)); - svst1w_u64(pred_full, (unsigned *)(v689), svreinterpret_u64_s16(v398)); - svst1w_u64(pred_full, (unsigned *)(v716), svreinterpret_u64_s16(v425)); - svint16_t v352 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v341, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v360 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v340, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v379 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v368, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v387 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v367, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v406 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v395, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v414 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v394, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v433 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v422, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); - svint16_t v441 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v421, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v644), svreinterpret_u64_s16(v352)); - svst1w_u64(pred_full, (unsigned *)(v653), svreinterpret_u64_s16(v360)); - svst1w_u64(pred_full, (unsigned *)(v671), svreinterpret_u64_s16(v379)); - svst1w_u64(pred_full, (unsigned *)(v680), svreinterpret_u64_s16(v387)); - svst1w_u64(pred_full, (unsigned *)(v698), svreinterpret_u64_s16(v406)); - svst1w_u64(pred_full, (unsigned *)(v707), svreinterpret_u64_s16(v414)); - svst1w_u64(pred_full, (unsigned *)(v725), svreinterpret_u64_s16(v433)); - svst1w_u64(pred_full, (unsigned *)(v734), svreinterpret_u64_s16(v441)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v475 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v243 = 1.0000000000000000e+00F; - float v244 = -1.0000000000000000e+00F; - float v252 = -7.0710678118654746e-01F; - float v260 = 7.0710678118654757e-01F; - float v264 = 9.2387953251128674e-01F; - float v265 = -9.2387953251128674e-01F; - float v273 = 5.4119610014619690e-01F; - float v281 = -1.3065629648763766e+00F; - float32x2_t v283 = (float32x2_t){v4, v4}; - float v289 = 3.8268343236508984e-01F; - float v294 = 1.3065629648763766e+00F; - float v299 = -5.4119610014619690e-01F; - const int32_t *v938 = &v5[istride]; - int32_t *v1020 = &v6[ostride]; - float32x2_t v245 = (float32x2_t){v243, v244}; - float32x2_t v253 = (float32x2_t){v260, v252}; - float32x2_t v261 = (float32x2_t){v260, v260}; - float32x2_t v266 
= (float32x2_t){v264, v265}; - float32x2_t v274 = (float32x2_t){v299, v273}; - float32x2_t v282 = (float32x2_t){v294, v281}; - float32x2_t v290 = (float32x2_t){v289, v289}; - float32x2_t v295 = (float32x2_t){v294, v294}; - float32x2_t v300 = (float32x2_t){v299, v299}; - const int32_t *v866 = &v5[0]; - int32_t *v1011 = &v6[0]; - int16x4_t v1166 = vld1_s16((const int16_t *)v938); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1166), 15); - float32x2_t v247 = vmul_f32(v283, v245); - float32x2_t v255 = vmul_f32(v283, v253); - float32x4_t v262 = vcombine_f32(v261, v261); - float32x2_t v268 = vmul_f32(v283, v266); - float32x2_t v276 = vmul_f32(v283, v274); - float32x2_t v284 = vmul_f32(v283, v282); - float32x4_t v291 = vcombine_f32(v290, v290); - float32x4_t v296 = vcombine_f32(v295, v295); - float32x4_t v301 = vcombine_f32(v300, v300); - const int32_t *v875 = &v5[istride * 8]; - const int32_t *v884 = &v5[istride * 4]; - const int32_t *v893 = &v5[istride * 12]; - const int32_t *v902 = &v5[istride * 2]; - const int32_t *v911 = &v5[istride * 10]; - const int32_t *v920 = &v5[istride * 6]; - const int32_t *v929 = &v5[istride * 14]; - const int32_t *v947 = &v5[istride * 9]; - const int32_t *v956 = &v5[istride * 5]; - const int32_t *v965 = &v5[istride * 13]; - const int32_t *v974 = &v5[istride * 3]; - const int32_t *v983 = &v5[istride * 11]; - const int32_t *v992 = &v5[istride * 7]; - const int32_t *v1001 = &v5[istride * 15]; - int32_t *v1029 = &v6[ostride * 2]; - int32_t *v1038 = &v6[ostride * 3]; - int32_t *v1047 = &v6[ostride * 4]; - int32_t *v1056 = &v6[ostride * 5]; - int32_t *v1065 = &v6[ostride * 6]; - int32_t *v1074 = &v6[ostride * 7]; - int32_t *v1083 = &v6[ostride * 8]; - int32_t *v1092 = &v6[ostride * 9]; - int32_t *v1101 = &v6[ostride * 10]; - int32_t *v1110 = &v6[ostride * 11]; - int32_t *v1119 = &v6[ostride * 12]; - int32_t *v1128 = &v6[ostride * 13]; - int32_t *v1137 = &v6[ostride * 14]; - int32_t *v1146 = &v6[ostride * 15]; - int16x4_t v1150 = vld1_s16((const 
int16_t *)v866); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1150), 15); - float32x4_t v249 = vcombine_f32(v247, v247); - float32x4_t v257 = vcombine_f32(v255, v255); - float32x4_t v270 = vcombine_f32(v268, v268); - float32x4_t v278 = vcombine_f32(v276, v276); - float32x4_t v286 = vcombine_f32(v284, v284); - int16x4_t v1152 = vld1_s16((const int16_t *)v875); - int16x4_t v1154 = vld1_s16((const int16_t *)v884); - int16x4_t v1156 = vld1_s16((const int16_t *)v893); - int16x4_t v1158 = vld1_s16((const int16_t *)v902); - int16x4_t v1160 = vld1_s16((const int16_t *)v911); - int16x4_t v1162 = vld1_s16((const int16_t *)v920); - int16x4_t v1164 = vld1_s16((const int16_t *)v929); - int16x4_t v1168 = vld1_s16((const int16_t *)v947); - int16x4_t v1170 = vld1_s16((const int16_t *)v956); - int16x4_t v1172 = vld1_s16((const int16_t *)v965); - int16x4_t v1174 = vld1_s16((const int16_t *)v974); - int16x4_t v1176 = vld1_s16((const int16_t *)v983); - int16x4_t v1178 = vld1_s16((const int16_t *)v992); - int16x4_t v1180 = vld1_s16((const int16_t *)v1001); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1152), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1154), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1156), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v1158), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v1160), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1162), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1164), 15); - float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1168), 15); - float32x4_t v118 = vcvtq_n_f32_s32(vmovl_s16(v1170), 15); - float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1172), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1174), 15); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1176), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1178), 15); - float32x4_t v162 = vcvtq_n_f32_s32(vmovl_s16(v1180), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = 
vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v109 = vaddq_f32(v100, v108); - float32x4_t v110 = vsubq_f32(v100, v108); - float32x4_t v127 = vaddq_f32(v118, v126); - float32x4_t v128 = vsubq_f32(v118, v126); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v163 = vaddq_f32(v154, v162); - float32x4_t v164 = vsubq_f32(v154, v162); - float32x4_t v165 = vaddq_f32(v37, v55); - float32x4_t v166 = vsubq_f32(v37, v55); - float32x4_t v167 = vaddq_f32(v73, v91); - float32x4_t v168 = vsubq_f32(v73, v91); - float32x4_t v169 = vaddq_f32(v109, v127); - float32x4_t v170 = vsubq_f32(v109, v127); - float32x4_t v171 = vaddq_f32(v145, v163); - float32x4_t v172 = vsubq_f32(v145, v163); - float32x4_t v181 = vaddq_f32(v74, v92); - float32x4_t v182 = vsubq_f32(v74, v92); - float32x4_t v183 = vaddq_f32(v110, v164); - float32x4_t v184 = vsubq_f32(v110, v164); - float32x4_t v185 = vaddq_f32(v128, v146); - float32x4_t v186 = vsubq_f32(v128, v146); - float32x4_t v248 = vrev64q_f32(v56); - float32x4_t v173 = vaddq_f32(v165, v167); - float32x4_t v174 = vsubq_f32(v165, v167); - float32x4_t v175 = vaddq_f32(v169, v171); - float32x4_t v176 = vsubq_f32(v169, v171); - float32x4_t v179 = vaddq_f32(v170, v172); - float32x4_t v180 = vsubq_f32(v170, v172); - float32x4_t v187 = vaddq_f32(v183, v185); - float32x4_t v188 = vaddq_f32(v184, v186); - float32x4_t v222 = vrev64q_f32(v168); - float32x4_t v250 = vmulq_f32(v248, v249); - float32x4_t v256 = vrev64q_f32(v181); - float32x4_t v263 = vmulq_f32(v182, v262); - float32x4_t v277 = vrev64q_f32(v183); - float32x4_t v285 = vrev64q_f32(v185); - float32x4_t v297 = vmulq_f32(v184, v296); - float32x4_t v302 = vmulq_f32(v186, v301); - float32x4_t v177 = vaddq_f32(v173, v175); - float32x4_t v178 = vsubq_f32(v173, v175); - 
float32x4_t v209 = vrev64q_f32(v176); - float32x4_t v224 = vmulq_f32(v222, v249); - float32x4_t v230 = vrev64q_f32(v179); - float32x4_t v237 = vmulq_f32(v180, v262); - float32x4_t v258 = vmulq_f32(v256, v257); - float32x4_t v269 = vrev64q_f32(v187); - float32x4_t v279 = vmulq_f32(v277, v278); - float32x4_t v287 = vmulq_f32(v285, v286); - float32x4_t v292 = vmulq_f32(v188, v291); - float32x4_t v313 = vaddq_f32(v38, v263); - float32x4_t v314 = vsubq_f32(v38, v263); - float32x4_t v211 = vmulq_f32(v209, v249); - float32x4_t v232 = vmulq_f32(v230, v257); - float32x4_t v271 = vmulq_f32(v269, v270); - float32x4_t v305 = vaddq_f32(v166, v237); - float32x4_t v307 = vsubq_f32(v166, v237); - float32x4_t v315 = vaddq_f32(v250, v258); - float32x4_t v316 = vsubq_f32(v250, v258); - float32x4_t v319 = vsubq_f32(v297, v292); - float32x4_t v320 = vsubq_f32(v302, v292); - float32x4_t v321 = vsubq_f32(v292, v297); - float32x4_t v322 = vsubq_f32(v292, v302); - int16x4_t v349 = vqmovn_s32(vcvtq_n_s32_f32(v177, 15)); - int16x4_t v413 = vqmovn_s32(vcvtq_n_s32_f32(v178, 15)); - float32x4_t v303 = vaddq_f32(v174, v211); - float32x4_t v304 = vsubq_f32(v174, v211); - float32x4_t v306 = vaddq_f32(v224, v232); - float32x4_t v308 = vsubq_f32(v232, v224); - float32x4_t v317 = vaddq_f32(v271, v279); - float32x4_t v318 = vsubq_f32(v271, v287); - float32x4_t v323 = vaddq_f32(v313, v319); - float32x4_t v324 = vsubq_f32(v313, v319); - float32x4_t v325 = vaddq_f32(v313, v321); - float32x4_t v326 = vsubq_f32(v313, v321); - float32x4_t v327 = vaddq_f32(v314, v316); - float32x4_t v328 = vsubq_f32(v314, v316); - float32x4_t v329 = vaddq_f32(v314, v322); - float32x4_t v330 = vsubq_f32(v314, v322); - vst1_s16((int16_t *)v1011, v349); - vst1_s16((int16_t *)v1083, v413); - float32x4_t v309 = vaddq_f32(v305, v306); - float32x4_t v310 = vaddq_f32(v307, v308); - float32x4_t v311 = vsubq_f32(v307, v308); - float32x4_t v312 = vsubq_f32(v305, v306); - float32x4_t v333 = vaddq_f32(v317, v315); - float32x4_t v334 = 
vsubq_f32(v317, v315); - float32x4_t v335 = vaddq_f32(v318, v320); - float32x4_t v336 = vsubq_f32(v318, v320); - float32x4_t v337 = vaddq_f32(v318, v316); - float32x4_t v338 = vsubq_f32(v318, v316); - int16x4_t v381 = vqmovn_s32(vcvtq_n_s32_f32(v304, 15)); - int16x4_t v445 = vqmovn_s32(vcvtq_n_s32_f32(v303, 15)); - float32x4_t v339 = vaddq_f32(v323, v333); - float32x4_t v340 = vaddq_f32(v324, v334); - float32x4_t v341 = vsubq_f32(v325, v334); - float32x4_t v342 = vsubq_f32(v326, v333); - float32x4_t v343 = vaddq_f32(v327, v335); - float32x4_t v344 = vaddq_f32(v328, v336); - float32x4_t v345 = vsubq_f32(v329, v338); - float32x4_t v346 = vsubq_f32(v330, v337); - int16x4_t v365 = vqmovn_s32(vcvtq_n_s32_f32(v312, 15)); - int16x4_t v397 = vqmovn_s32(vcvtq_n_s32_f32(v311, 15)); - int16x4_t v429 = vqmovn_s32(vcvtq_n_s32_f32(v310, 15)); - int16x4_t v461 = vqmovn_s32(vcvtq_n_s32_f32(v309, 15)); - vst1_s16((int16_t *)v1047, v381); - vst1_s16((int16_t *)v1119, v445); - int16x4_t v357 = vqmovn_s32(vcvtq_n_s32_f32(v342, 15)); - int16x4_t v373 = vqmovn_s32(vcvtq_n_s32_f32(v345, 15)); - int16x4_t v389 = vqmovn_s32(vcvtq_n_s32_f32(v346, 15)); - int16x4_t v405 = vqmovn_s32(vcvtq_n_s32_f32(v341, 15)); - int16x4_t v421 = vqmovn_s32(vcvtq_n_s32_f32(v340, 15)); - int16x4_t v437 = vqmovn_s32(vcvtq_n_s32_f32(v343, 15)); - int16x4_t v453 = vqmovn_s32(vcvtq_n_s32_f32(v344, 15)); - int16x4_t v469 = vqmovn_s32(vcvtq_n_s32_f32(v339, 15)); - vst1_s16((int16_t *)v1029, v365); - vst1_s16((int16_t *)v1065, v397); - vst1_s16((int16_t *)v1101, v429); - vst1_s16((int16_t *)v1137, v461); - vst1_s16((int16_t *)v1020, v357); - vst1_s16((int16_t *)v1038, v373); - vst1_s16((int16_t *)v1056, v389); - vst1_s16((int16_t *)v1074, v405); - vst1_s16((int16_t *)v1092, v421); - vst1_s16((int16_t *)v1110, v437); - vst1_s16((int16_t *)v1128, v453); - vst1_s16((int16_t *)v1146, v469); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v475 * 2; j < howmany; j += 1) { - int16x4_t v543 = vld1s_s16(&v5[istride]); - 
float v664 = 1.0000000000000000e+00F; - float v665 = -1.0000000000000000e+00F; - float v672 = -7.0710678118654746e-01F; - float v679 = 7.0710678118654757e-01F; - float v682 = 9.2387953251128674e-01F; - float v683 = -9.2387953251128674e-01F; - float v690 = 5.4119610014619690e-01F; - float v697 = -1.3065629648763766e+00F; - float32x2_t v699 = (float32x2_t){v4, v4}; - float v704 = 3.8268343236508984e-01F; - float v708 = 1.3065629648763766e+00F; - float v712 = -5.4119610014619690e-01F; - int16x4_t v487 = vld1s_s16(&v5[0]); - float32x2_t v544 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v543)), 15); - float32x2_t v666 = (float32x2_t){v664, v665}; - float32x2_t v673 = (float32x2_t){v679, v672}; - float32x2_t v680 = (float32x2_t){v679, v679}; - float32x2_t v684 = (float32x2_t){v682, v683}; - float32x2_t v691 = (float32x2_t){v712, v690}; - float32x2_t v698 = (float32x2_t){v708, v697}; - float32x2_t v705 = (float32x2_t){v704, v704}; - float32x2_t v709 = (float32x2_t){v708, v708}; - float32x2_t v713 = (float32x2_t){v712, v712}; - float32x2_t v488 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v487)), 15); - int16x4_t v493 = vld1s_s16(&v5[istride * 8]); - int16x4_t v501 = vld1s_s16(&v5[istride * 4]); - int16x4_t v507 = vld1s_s16(&v5[istride * 12]); - int16x4_t v515 = vld1s_s16(&v5[istride * 2]); - int16x4_t v521 = vld1s_s16(&v5[istride * 10]); - int16x4_t v529 = vld1s_s16(&v5[istride * 6]); - int16x4_t v535 = vld1s_s16(&v5[istride * 14]); - int16x4_t v549 = vld1s_s16(&v5[istride * 9]); - int16x4_t v557 = vld1s_s16(&v5[istride * 5]); - int16x4_t v563 = vld1s_s16(&v5[istride * 13]); - int16x4_t v571 = vld1s_s16(&v5[istride * 3]); - int16x4_t v577 = vld1s_s16(&v5[istride * 11]); - int16x4_t v585 = vld1s_s16(&v5[istride * 7]); - int16x4_t v591 = vld1s_s16(&v5[istride * 15]); - float32x2_t v668 = vmul_f32(v699, v666); - float32x2_t v675 = vmul_f32(v699, v673); - float32x2_t v686 = vmul_f32(v699, v684); - float32x2_t v693 = vmul_f32(v699, v691); - float32x2_t v700 = vmul_f32(v699, v698); - 
float32x2_t v494 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v493)), 15); - float32x2_t v502 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v501)), 15); - float32x2_t v508 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v507)), 15); - float32x2_t v516 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v515)), 15); - float32x2_t v522 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v521)), 15); - float32x2_t v530 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v529)), 15); - float32x2_t v536 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v535)), 15); - float32x2_t v550 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v549)), 15); - float32x2_t v558 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v557)), 15); - float32x2_t v564 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v563)), 15); - float32x2_t v572 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v571)), 15); - float32x2_t v578 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v577)), 15); - float32x2_t v586 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v585)), 15); - float32x2_t v592 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v591)), 15); - float32x2_t v495 = vadd_f32(v488, v494); - float32x2_t v496 = vsub_f32(v488, v494); - float32x2_t v509 = vadd_f32(v502, v508); - float32x2_t v510 = vsub_f32(v502, v508); - float32x2_t v523 = vadd_f32(v516, v522); - float32x2_t v524 = vsub_f32(v516, v522); - float32x2_t v537 = vadd_f32(v530, v536); - float32x2_t v538 = vsub_f32(v530, v536); - float32x2_t v551 = vadd_f32(v544, v550); - float32x2_t v552 = vsub_f32(v544, v550); - float32x2_t v565 = vadd_f32(v558, v564); - float32x2_t v566 = vsub_f32(v558, v564); - float32x2_t v579 = vadd_f32(v572, v578); - float32x2_t v580 = vsub_f32(v572, v578); - float32x2_t v593 = vadd_f32(v586, v592); - float32x2_t v594 = vsub_f32(v586, v592); - float32x2_t v595 = vadd_f32(v495, v509); - float32x2_t v596 = vsub_f32(v495, v509); - float32x2_t v597 = vadd_f32(v523, v537); - float32x2_t v598 = vsub_f32(v523, v537); - float32x2_t v599 = vadd_f32(v551, v565); - float32x2_t v600 = vsub_f32(v551, v565); - float32x2_t v601 = vadd_f32(v579, v593); - 
float32x2_t v602 = vsub_f32(v579, v593); - float32x2_t v611 = vadd_f32(v524, v538); - float32x2_t v612 = vsub_f32(v524, v538); - float32x2_t v613 = vadd_f32(v552, v594); - float32x2_t v614 = vsub_f32(v552, v594); - float32x2_t v615 = vadd_f32(v566, v580); - float32x2_t v616 = vsub_f32(v566, v580); - float32x2_t v669 = vrev64_f32(v510); - float32x2_t v603 = vadd_f32(v595, v597); - float32x2_t v604 = vsub_f32(v595, v597); - float32x2_t v605 = vadd_f32(v599, v601); - float32x2_t v606 = vsub_f32(v599, v601); - float32x2_t v609 = vadd_f32(v600, v602); - float32x2_t v610 = vsub_f32(v600, v602); - float32x2_t v617 = vadd_f32(v613, v615); - float32x2_t v618 = vadd_f32(v614, v616); - float32x2_t v647 = vrev64_f32(v598); - float32x2_t v670 = vmul_f32(v669, v668); - float32x2_t v676 = vrev64_f32(v611); - float32x2_t v681 = vmul_f32(v612, v680); - float32x2_t v694 = vrev64_f32(v613); - float32x2_t v701 = vrev64_f32(v615); - float32x2_t v710 = vmul_f32(v614, v709); - float32x2_t v714 = vmul_f32(v616, v713); - float32x2_t v607 = vadd_f32(v603, v605); - float32x2_t v608 = vsub_f32(v603, v605); - float32x2_t v636 = vrev64_f32(v606); - float32x2_t v648 = vmul_f32(v647, v668); - float32x2_t v654 = vrev64_f32(v609); - float32x2_t v659 = vmul_f32(v610, v680); - float32x2_t v677 = vmul_f32(v676, v675); - float32x2_t v687 = vrev64_f32(v617); - float32x2_t v695 = vmul_f32(v694, v693); - float32x2_t v702 = vmul_f32(v701, v700); - float32x2_t v706 = vmul_f32(v618, v705); - float32x2_t v725 = vadd_f32(v496, v681); - float32x2_t v726 = vsub_f32(v496, v681); - float32x2_t v637 = vmul_f32(v636, v668); - float32x2_t v655 = vmul_f32(v654, v675); - float32x2_t v688 = vmul_f32(v687, v686); - float32x2_t v717 = vadd_f32(v596, v659); - float32x2_t v719 = vsub_f32(v596, v659); - float32x2_t v727 = vadd_f32(v670, v677); - float32x2_t v728 = vsub_f32(v670, v677); - float32x2_t v731 = vsub_f32(v710, v706); - float32x2_t v732 = vsub_f32(v714, v706); - float32x2_t v733 = vsub_f32(v706, v710); - 
float32x2_t v734 = vsub_f32(v706, v714); - int16x4_t v761 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v607, 15), (int32x2_t){0, 0})); - int16x4_t v809 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v608, 15), (int32x2_t){0, 0})); - float32x2_t v715 = vadd_f32(v604, v637); - float32x2_t v716 = vsub_f32(v604, v637); - float32x2_t v718 = vadd_f32(v648, v655); - float32x2_t v720 = vsub_f32(v655, v648); - float32x2_t v729 = vadd_f32(v688, v695); - float32x2_t v730 = vsub_f32(v688, v702); - float32x2_t v735 = vadd_f32(v725, v731); - float32x2_t v736 = vsub_f32(v725, v731); - float32x2_t v737 = vadd_f32(v725, v733); - float32x2_t v738 = vsub_f32(v725, v733); - float32x2_t v739 = vadd_f32(v726, v728); - float32x2_t v740 = vsub_f32(v726, v728); - float32x2_t v741 = vadd_f32(v726, v734); - float32x2_t v742 = vsub_f32(v726, v734); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v761), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v809), 0); - float32x2_t v721 = vadd_f32(v717, v718); - float32x2_t v722 = vadd_f32(v719, v720); - float32x2_t v723 = vsub_f32(v719, v720); - float32x2_t v724 = vsub_f32(v717, v718); - float32x2_t v745 = vadd_f32(v729, v727); - float32x2_t v746 = vsub_f32(v729, v727); - float32x2_t v747 = vadd_f32(v730, v732); - float32x2_t v748 = vsub_f32(v730, v732); - float32x2_t v749 = vadd_f32(v730, v728); - float32x2_t v750 = vsub_f32(v730, v728); - int16x4_t v785 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v716, 15), (int32x2_t){0, 0})); - int16x4_t v833 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v715, 15), (int32x2_t){0, 0})); - float32x2_t v751 = vadd_f32(v735, v745); - float32x2_t v752 = vadd_f32(v736, v746); - float32x2_t v753 = vsub_f32(v737, v746); - float32x2_t v754 = vsub_f32(v738, v745); - float32x2_t v755 = vadd_f32(v739, v747); - float32x2_t v756 = vadd_f32(v740, v748); - float32x2_t v757 = vsub_f32(v741, v750); - float32x2_t v758 = vsub_f32(v742, v749); - int16x4_t v773 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v724, 15), (int32x2_t){0, 0})); 
- v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v785), 0); - int16x4_t v797 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v723, 15), (int32x2_t){0, 0})); - int16x4_t v821 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v722, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v833), 0); - int16x4_t v845 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v721, 15), (int32x2_t){0, 0})); - int16x4_t v767 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v754, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v773), 0); - int16x4_t v779 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v757, 15), (int32x2_t){0, 0})); - int16x4_t v791 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v758, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v797), 0); - int16x4_t v803 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v753, 15), (int32x2_t){0, 0})); - int16x4_t v815 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v752, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v821), 0); - int16x4_t v827 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v755, 15), (int32x2_t){0, 0})); - int16x4_t v839 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v756, 15), (int32x2_t){0, 0})); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v845), 0); - int16x4_t v851 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v751, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v767), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v779), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v791), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v803), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v815), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v827), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v839), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v851), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef 
ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu16(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v238 = -1.0000000000000000e+00F; - float v245 = -7.0710678118654746e-01F; - float v252 = 7.0710678118654757e-01F; - float v257 = -9.2387953251128674e-01F; - float v264 = 5.4119610014619690e-01F; - float v271 = -1.3065629648763766e+00F; - float v278 = 3.8268343236508984e-01F; - float v283 = 1.3065629648763766e+00F; - float v288 = -5.4119610014619690e-01F; - const int32_t *v543 = &v5[v0]; - int32_t *v643 = &v6[v2]; - int64_t v27 = v0 * 8; - int64_t v37 = v0 * 4; - int64_t v45 = v0 * 12; - int64_t v55 = v0 * 2; - int64_t v63 = v0 * 10; - int64_t v73 = v0 * 6; - int64_t v81 = v0 * 14; - int64_t v99 = v0 * 9; - int64_t v109 = v0 * 5; - int64_t v117 = v0 * 13; - int64_t v127 = v0 * 3; - int64_t v135 = v0 * 11; - int64_t v145 = v0 * 7; - int64_t v153 = v0 * 15; - float v241 = v4 * v238; - float v248 = v4 * v245; - float v260 = v4 * v257; - float v267 = v4 * v264; - float v274 = v4 * v271; - int64_t v353 = v2 * 2; - int64_t v361 = v2 * 3; - int64_t v369 = v2 * 4; - int64_t v377 = v2 * 5; - int64_t v385 = v2 * 6; - int64_t v393 = v2 * 7; - int64_t v401 = v2 * 8; - int64_t v409 = v2 * 9; - int64_t v417 = v2 * 10; - int64_t v425 = v2 * 11; - int64_t v433 = v2 * 12; - int64_t v441 = v2 * 13; - int64_t v449 = v2 * 14; - int64_t v457 = v2 * 15; - const int32_t *v471 = &v5[0]; - svfloat32_t v620 = svdup_n_f32(v252); - svfloat32_t v624 = svdup_n_f32(v278); - svfloat32_t v625 = svdup_n_f32(v283); - svfloat32_t v626 = svdup_n_f32(v288); - int32_t *v634 = 
&v6[0]; - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v543[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v480 = &v5[v27]; - const int32_t *v489 = &v5[v37]; - const int32_t *v498 = &v5[v45]; - const int32_t *v507 = &v5[v55]; - const int32_t *v516 = &v5[v63]; - const int32_t *v525 = &v5[v73]; - const int32_t *v534 = &v5[v81]; - const int32_t *v552 = &v5[v99]; - const int32_t *v561 = &v5[v109]; - const int32_t *v570 = &v5[v117]; - const int32_t *v579 = &v5[v127]; - const int32_t *v588 = &v5[v135]; - const int32_t *v597 = &v5[v145]; - const int32_t *v606 = &v5[v153]; - svfloat32_t v618 = svdup_n_f32(v241); - svfloat32_t v619 = svdup_n_f32(v248); - svfloat32_t v621 = svdup_n_f32(v260); - svfloat32_t v622 = svdup_n_f32(v267); - svfloat32_t v623 = svdup_n_f32(v274); - int32_t *v652 = &v6[v353]; - int32_t *v661 = &v6[v361]; - int32_t *v670 = &v6[v369]; - int32_t *v679 = &v6[v377]; - int32_t *v688 = &v6[v385]; - int32_t *v697 = &v6[v393]; - int32_t *v706 = &v6[v401]; - int32_t *v715 = &v6[v409]; - int32_t *v724 = &v6[v417]; - int32_t *v733 = &v6[v425]; - int32_t *v742 = &v6[v433]; - int32_t *v751 = &v6[v441]; - int32_t *v760 = &v6[v449]; - int32_t *v769 = &v6[v457]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v471[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v480[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v489[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v498[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t 
*)&v507[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v516[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v525[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v534[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v105 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v552[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v115 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v561[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v123 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v570[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v579[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v588[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v597[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v159 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v606[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, 
%2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v34), "w"(v52)); - svfloat32_t v163; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v34), "w"(v52)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v70), "w"(v88)); - svfloat32_t v165; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v70), "w"(v88)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v106), "w"(v124)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v106), "w"(v124)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v142), "w"(v160)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v142), "w"(v160)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v71), "w"(v89)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v71), "w"(v89)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v107), "w"(v161)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v181) : "w"(v107), "w"(v161)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v125), "w"(v143)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v125), "w"(v143)); - svfloat32_t zero243; - asm volatile("mov %0.s, #0" : "=w"(zero243)); - svfloat32_t v243 = svcmla_f32_x(pred_full, zero243, v618, v53, 90); - svfloat32_t v170; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v162), "w"(v164)); - svfloat32_t v171; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v162), "w"(v164)); - svfloat32_t v172; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v166), "w"(v168)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v166), "w"(v168)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v167), "w"(v169)); - svfloat32_t v177; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v167), "w"(v169)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v180), "w"(v182)); - svfloat32_t v185; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v181), "w"(v183)); - svfloat32_t zero219; - asm volatile("mov %0.s, #0" : "=w"(zero219)); - svfloat32_t v219 = svcmla_f32_x(pred_full, zero219, v618, v165, 90); - svfloat32_t zero250; - asm volatile("mov %0.s, #0" : "=w"(zero250)); - svfloat32_t v250 = svcmla_f32_x(pred_full, zero250, v619, v178, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); - svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v623, v182, 90); - svfloat32_t v286; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v181), "w"(v625)); - svfloat32_t v291; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v183), "w"(v626)); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v170), "w"(v172)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v170), "w"(v172)); - svfloat32_t zero207; - asm volatile("mov %0.s, #0" : "=w"(zero207)); - svfloat32_t v207 = svcmla_f32_x(pred_full, zero207, v618, v173, 90); - svfloat32_t 
zero226; - asm volatile("mov %0.s, #0" : "=w"(zero226)); - svfloat32_t v226 = svcmla_f32_x(pred_full, zero226, v619, v176, 90); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); - svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v621, v184, 90); - svfloat32_t v281; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v185), "w"(v624)); - svfloat32_t v302 = svmla_f32_x(pred_full, v35, v179, v620); - svfloat32_t v303 = svmls_f32_x(pred_full, v35, v179, v620); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v243), "w"(v250)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v243), "w"(v250)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v171), "w"(v207)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v171), "w"(v207)); - svfloat32_t v294 = svmla_f32_x(pred_full, v163, v177, v620); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v219), "w"(v226)); - svfloat32_t v296 = svmls_f32_x(pred_full, v163, v177, v620); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v226), "w"(v219)); - svfloat32_t v306 = svcmla_f32_x(pred_full, v262, v622, v180, 90); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v262), "w"(v276)); - svfloat32_t v308 = svnmls_f32_x(pred_full, v281, v181, v625); - svfloat32_t v309 = svnmls_f32_x(pred_full, v281, v183, v626); - svfloat32_t v310 = svnmls_f32_x(pred_full, v286, v185, v624); - svfloat32_t v311 = svnmls_f32_x(pred_full, v291, v185, v624); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v303), "w"(v305)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v303), "w"(v305)); - svint16_t v338 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v174, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v402 = svtbl_s16( - 
svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v175, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v294), "w"(v295)); - svfloat32_t v299; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v296), "w"(v297)); - svfloat32_t v300; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v296), "w"(v297)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v294), "w"(v295)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v302), "w"(v308)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v302), "w"(v308)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v302), "w"(v310)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v302), "w"(v310)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v303), "w"(v311)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v303), "w"(v311)); - svfloat32_t v322; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v306), "w"(v304)); - svfloat32_t v323; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v306), "w"(v304)); - svfloat32_t v324; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v324) : "w"(v307), "w"(v309)); - svfloat32_t v325; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v307), "w"(v309)); - svfloat32_t v326; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v307), "w"(v305)); - svfloat32_t v327; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v327) : "w"(v307), "w"(v305)); - svint16_t v370 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v293, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v434 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v292, (float)(1ULL << 31ULL)))), - 
svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v634), svreinterpret_u64_s16(v338)); - svst1w_u64(pred_full, (unsigned *)(v706), svreinterpret_u64_s16(v402)); - svfloat32_t v328; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v328) : "w"(v312), "w"(v322)); - svfloat32_t v329; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v329) : "w"(v313), "w"(v323)); - svfloat32_t v330; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v330) : "w"(v314), "w"(v323)); - svfloat32_t v331; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v315), "w"(v322)); - svfloat32_t v332; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v332) : "w"(v316), "w"(v324)); - svfloat32_t v333; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v333) : "w"(v317), "w"(v325)); - svfloat32_t v334; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v318), "w"(v327)); - svfloat32_t v335; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v335) : "w"(v319), "w"(v326)); - svint16_t v354 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v301, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v386 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v300, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v418 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v299, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v450 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v298, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v670), svreinterpret_u64_s16(v370)); - svst1w_u64(pred_full, (unsigned *)(v742), 
svreinterpret_u64_s16(v434)); - svint16_t v346 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v331, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v362 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v334, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v378 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v335, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v394 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v330, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v410 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v329, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v426 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v332, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v442 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v333, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v458 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v328, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v652), svreinterpret_u64_s16(v354)); - svst1w_u64(pred_full, (unsigned *)(v688), 
svreinterpret_u64_s16(v386)); - svst1w_u64(pred_full, (unsigned *)(v724), svreinterpret_u64_s16(v418)); - svst1w_u64(pred_full, (unsigned *)(v760), svreinterpret_u64_s16(v450)); - svst1w_u64(pred_full, (unsigned *)(v643), svreinterpret_u64_s16(v346)); - svst1w_u64(pred_full, (unsigned *)(v661), svreinterpret_u64_s16(v362)); - svst1w_u64(pred_full, (unsigned *)(v679), svreinterpret_u64_s16(v378)); - svst1w_u64(pred_full, (unsigned *)(v697), svreinterpret_u64_s16(v394)); - svst1w_u64(pred_full, (unsigned *)(v715), svreinterpret_u64_s16(v410)); - svst1w_u64(pred_full, (unsigned *)(v733), svreinterpret_u64_s16(v426)); - svst1w_u64(pred_full, (unsigned *)(v751), svreinterpret_u64_s16(v442)); - svst1w_u64(pred_full, (unsigned *)(v769), svreinterpret_u64_s16(v458)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v693 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v230 = -4.2602849117736000e-02F; - float v235 = 2.0497965023262180e-01F; - float v240 = 1.0451835201736759e+00F; - float v245 = 1.7645848660222969e+00F; - float v250 = -7.2340797728605655e-01F; - float v255 = -8.9055591620606403e-02F; - float v260 = -1.0625000000000000e+00F; - float v265 = 2.5769410160110379e-01F; - float v270 = 7.7980260789483757e-01F; - float v275 = 5.4389318464570580e-01F; - float v280 = 4.2010193497052700e-01F; - float v285 = 1.2810929434228073e+00F; - float v290 = 4.4088907348175338e-01F; - float v295 = 3.1717619283272508e-01F; - float v299 = -9.0138318648016680e-01F; - float v300 = 9.0138318648016680e-01F; - float v307 = -4.3248756360072310e-01F; - float v308 = 4.3248756360072310e-01F; - float v315 = 6.6693537504044498e-01F; - float v316 = 
-6.6693537504044498e-01F; - float v323 = -6.0389004312516970e-01F; - float v324 = 6.0389004312516970e-01F; - float v331 = -3.6924873198582547e-01F; - float v332 = 3.6924873198582547e-01F; - float v339 = 4.8656938755549761e-01F; - float v340 = -4.8656938755549761e-01F; - float v347 = 2.3813712136760609e-01F; - float v348 = -2.3813712136760609e-01F; - float v355 = -1.5573820617422458e+00F; - float v356 = 1.5573820617422458e+00F; - float v363 = 6.5962247018731990e-01F; - float v364 = -6.5962247018731990e-01F; - float v371 = -1.4316961569866241e-01F; - float v372 = 1.4316961569866241e-01F; - float v379 = 2.3903469959860771e-01F; - float v380 = -2.3903469959860771e-01F; - float v387 = -4.7932541949972603e-02F; - float v388 = 4.7932541949972603e-02F; - float v395 = -2.3188014856550065e+00F; - float v396 = 2.3188014856550065e+00F; - float v403 = 7.8914568419206255e-01F; - float v404 = -7.8914568419206255e-01F; - float v411 = 3.8484572871179505e+00F; - float v412 = -3.8484572871179505e+00F; - float v419 = -1.3003804568801376e+00F; - float v420 = 1.3003804568801376e+00F; - float v427 = 4.0814769046889037e+00F; - float v428 = -4.0814769046889037e+00F; - float v435 = -1.4807159909286283e+00F; - float v436 = 1.4807159909286283e+00F; - float v443 = -1.3332470363551400e-02F; - float v444 = 1.3332470363551400e-02F; - float v451 = -3.7139778690557629e-01F; - float v452 = 3.7139778690557629e-01F; - float v459 = 1.9236512863456379e-01F; - float v460 = -1.9236512863456379e-01F; - float32x2_t v462 = (float32x2_t){v4, v4}; - const int32_t *v1279 = &v5[istride]; - int32_t *v1443 = &v6[ostride]; - float32x2_t v231 = (float32x2_t){v230, v230}; - float32x2_t v236 = (float32x2_t){v235, v235}; - float32x2_t v241 = (float32x2_t){v240, v240}; - float32x2_t v246 = (float32x2_t){v245, v245}; - float32x2_t v251 = (float32x2_t){v250, v250}; - float32x2_t v256 = (float32x2_t){v255, v255}; - float32x2_t v261 = (float32x2_t){v260, v260}; - float32x2_t v266 = (float32x2_t){v265, v265}; - float32x2_t 
v271 = (float32x2_t){v270, v270}; - float32x2_t v276 = (float32x2_t){v275, v275}; - float32x2_t v281 = (float32x2_t){v280, v280}; - float32x2_t v286 = (float32x2_t){v285, v285}; - float32x2_t v291 = (float32x2_t){v290, v290}; - float32x2_t v296 = (float32x2_t){v295, v295}; - float32x2_t v301 = (float32x2_t){v299, v300}; - float32x2_t v309 = (float32x2_t){v307, v308}; - float32x2_t v317 = (float32x2_t){v315, v316}; - float32x2_t v325 = (float32x2_t){v323, v324}; - float32x2_t v333 = (float32x2_t){v331, v332}; - float32x2_t v341 = (float32x2_t){v339, v340}; - float32x2_t v349 = (float32x2_t){v347, v348}; - float32x2_t v357 = (float32x2_t){v355, v356}; - float32x2_t v365 = (float32x2_t){v363, v364}; - float32x2_t v373 = (float32x2_t){v371, v372}; - float32x2_t v381 = (float32x2_t){v379, v380}; - float32x2_t v389 = (float32x2_t){v387, v388}; - float32x2_t v397 = (float32x2_t){v395, v396}; - float32x2_t v405 = (float32x2_t){v403, v404}; - float32x2_t v413 = (float32x2_t){v411, v412}; - float32x2_t v421 = (float32x2_t){v419, v420}; - float32x2_t v429 = (float32x2_t){v427, v428}; - float32x2_t v437 = (float32x2_t){v435, v436}; - float32x2_t v445 = (float32x2_t){v443, v444}; - float32x2_t v453 = (float32x2_t){v451, v452}; - float32x2_t v461 = (float32x2_t){v459, v460}; - const int32_t *v1424 = &v5[0]; - int32_t *v1434 = &v6[0]; - int16x4_t v1582 = vld1_s16((const int16_t *)v1279); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1582), 15); - float32x4_t v232 = vcombine_f32(v231, v231); - float32x4_t v237 = vcombine_f32(v236, v236); - float32x4_t v242 = vcombine_f32(v241, v241); - float32x4_t v247 = vcombine_f32(v246, v246); - float32x4_t v252 = vcombine_f32(v251, v251); - float32x4_t v257 = vcombine_f32(v256, v256); - float32x4_t v262 = vcombine_f32(v261, v261); - float32x4_t v267 = vcombine_f32(v266, v266); - float32x4_t v272 = vcombine_f32(v271, v271); - float32x4_t v277 = vcombine_f32(v276, v276); - float32x4_t v282 = vcombine_f32(v281, v281); - float32x4_t v287 = 
vcombine_f32(v286, v286); - float32x4_t v292 = vcombine_f32(v291, v291); - float32x4_t v297 = vcombine_f32(v296, v296); - float32x2_t v303 = vmul_f32(v462, v301); - float32x2_t v311 = vmul_f32(v462, v309); - float32x2_t v319 = vmul_f32(v462, v317); - float32x2_t v327 = vmul_f32(v462, v325); - float32x2_t v335 = vmul_f32(v462, v333); - float32x2_t v343 = vmul_f32(v462, v341); - float32x2_t v351 = vmul_f32(v462, v349); - float32x2_t v359 = vmul_f32(v462, v357); - float32x2_t v367 = vmul_f32(v462, v365); - float32x2_t v375 = vmul_f32(v462, v373); - float32x2_t v383 = vmul_f32(v462, v381); - float32x2_t v391 = vmul_f32(v462, v389); - float32x2_t v399 = vmul_f32(v462, v397); - float32x2_t v407 = vmul_f32(v462, v405); - float32x2_t v415 = vmul_f32(v462, v413); - float32x2_t v423 = vmul_f32(v462, v421); - float32x2_t v431 = vmul_f32(v462, v429); - float32x2_t v439 = vmul_f32(v462, v437); - float32x2_t v447 = vmul_f32(v462, v445); - float32x2_t v455 = vmul_f32(v462, v453); - float32x2_t v463 = vmul_f32(v462, v461); - const int32_t *v1288 = &v5[istride * 16]; - const int32_t *v1297 = &v5[istride * 3]; - const int32_t *v1306 = &v5[istride * 14]; - const int32_t *v1315 = &v5[istride * 9]; - const int32_t *v1324 = &v5[istride * 8]; - const int32_t *v1333 = &v5[istride * 10]; - const int32_t *v1342 = &v5[istride * 7]; - const int32_t *v1351 = &v5[istride * 13]; - const int32_t *v1360 = &v5[istride * 4]; - const int32_t *v1369 = &v5[istride * 5]; - const int32_t *v1378 = &v5[istride * 12]; - const int32_t *v1387 = &v5[istride * 15]; - const int32_t *v1396 = &v5[istride * 2]; - const int32_t *v1405 = &v5[istride * 11]; - const int32_t *v1414 = &v5[istride * 6]; - int32_t *v1452 = &v6[ostride * 16]; - int32_t *v1461 = &v6[ostride * 2]; - int32_t *v1470 = &v6[ostride * 15]; - int32_t *v1479 = &v6[ostride * 3]; - int32_t *v1488 = &v6[ostride * 14]; - int32_t *v1497 = &v6[ostride * 4]; - int32_t *v1506 = &v6[ostride * 13]; - int32_t *v1515 = &v6[ostride * 5]; - int32_t *v1524 = 
&v6[ostride * 12]; - int32_t *v1533 = &v6[ostride * 6]; - int32_t *v1542 = &v6[ostride * 11]; - int32_t *v1551 = &v6[ostride * 7]; - int32_t *v1560 = &v6[ostride * 10]; - int32_t *v1569 = &v6[ostride * 8]; - int32_t *v1578 = &v6[ostride * 9]; - int16x4_t v1614 = vld1_s16((const int16_t *)v1424); - float32x4_t v222 = vcvtq_n_f32_s32(vmovl_s16(v1614), 15); - float32x4_t v305 = vcombine_f32(v303, v303); - float32x4_t v313 = vcombine_f32(v311, v311); - float32x4_t v321 = vcombine_f32(v319, v319); - float32x4_t v329 = vcombine_f32(v327, v327); - float32x4_t v337 = vcombine_f32(v335, v335); - float32x4_t v345 = vcombine_f32(v343, v343); - float32x4_t v353 = vcombine_f32(v351, v351); - float32x4_t v361 = vcombine_f32(v359, v359); - float32x4_t v369 = vcombine_f32(v367, v367); - float32x4_t v377 = vcombine_f32(v375, v375); - float32x4_t v385 = vcombine_f32(v383, v383); - float32x4_t v393 = vcombine_f32(v391, v391); - float32x4_t v401 = vcombine_f32(v399, v399); - float32x4_t v409 = vcombine_f32(v407, v407); - float32x4_t v417 = vcombine_f32(v415, v415); - float32x4_t v425 = vcombine_f32(v423, v423); - float32x4_t v433 = vcombine_f32(v431, v431); - float32x4_t v441 = vcombine_f32(v439, v439); - float32x4_t v449 = vcombine_f32(v447, v447); - float32x4_t v457 = vcombine_f32(v455, v455); - float32x4_t v465 = vcombine_f32(v463, v463); - int16x4_t v1584 = vld1_s16((const int16_t *)v1288); - int16x4_t v1586 = vld1_s16((const int16_t *)v1297); - int16x4_t v1588 = vld1_s16((const int16_t *)v1306); - int16x4_t v1590 = vld1_s16((const int16_t *)v1315); - int16x4_t v1592 = vld1_s16((const int16_t *)v1324); - int16x4_t v1594 = vld1_s16((const int16_t *)v1333); - int16x4_t v1596 = vld1_s16((const int16_t *)v1342); - int16x4_t v1598 = vld1_s16((const int16_t *)v1351); - int16x4_t v1600 = vld1_s16((const int16_t *)v1360); - int16x4_t v1602 = vld1_s16((const int16_t *)v1369); - int16x4_t v1604 = vld1_s16((const int16_t *)v1378); - int16x4_t v1606 = vld1_s16((const int16_t *)v1387); - 
int16x4_t v1608 = vld1_s16((const int16_t *)v1396); - int16x4_t v1610 = vld1_s16((const int16_t *)v1405); - int16x4_t v1612 = vld1_s16((const int16_t *)v1414); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1584), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1586), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1588), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v1590), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v1592), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1594), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1596), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1598), 15); - float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1600), 15); - float32x4_t v118 = vcvtq_n_f32_s32(vmovl_s16(v1602), 15); - float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1604), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1606), 15); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1608), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1610), 15); - float32x4_t v162 = vcvtq_n_f32_s32(vmovl_s16(v1612), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v109 = vaddq_f32(v100, v108); - float32x4_t v110 = vsubq_f32(v100, v108); - float32x4_t v127 = vaddq_f32(v118, v126); - float32x4_t v128 = vsubq_f32(v118, v126); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v163 = vaddq_f32(v154, v162); - float32x4_t v164 = vsubq_f32(v154, v162); - float32x4_t v165 = vaddq_f32(v37, v109); - float32x4_t v166 = vaddq_f32(v55, v127); - float32x4_t v167 = vaddq_f32(v73, v145); - float32x4_t v168 = vaddq_f32(v91, v163); - float32x4_t v171 = vsubq_f32(v37, v109); - float32x4_t v172 = vsubq_f32(v55, v127); - 
float32x4_t v173 = vsubq_f32(v73, v145); - float32x4_t v174 = vsubq_f32(v91, v163); - float32x4_t v185 = vaddq_f32(v38, v74); - float32x4_t v186 = vaddq_f32(v56, v92); - float32x4_t v187 = vsubq_f32(v38, v74); - float32x4_t v188 = vsubq_f32(v164, v128); - float32x4_t v189 = vaddq_f32(v110, v146); - float32x4_t v190 = vaddq_f32(v128, v164); - float32x4_t v191 = vsubq_f32(v110, v146); - float32x4_t v192 = vsubq_f32(v56, v92); - float32x4_t v205 = vaddq_f32(v38, v110); - float32x4_t v206 = vaddq_f32(v92, v164); - float32x4_t v408 = vrev64q_f32(v38); - float32x4_t v416 = vrev64q_f32(v110); - float32x4_t v432 = vrev64q_f32(v92); - float32x4_t v440 = vrev64q_f32(v164); - float32x4_t v169 = vaddq_f32(v165, v167); - float32x4_t v170 = vaddq_f32(v166, v168); - float32x4_t v175 = vsubq_f32(v165, v167); - float32x4_t v176 = vsubq_f32(v166, v168); - float32x4_t v179 = vaddq_f32(v172, v174); - float32x4_t v180 = vaddq_f32(v171, v173); - float32x4_t v182 = vsubq_f32(v173, v174); - float32x4_t v183 = vsubq_f32(v171, v172); - float32x4_t v193 = vaddq_f32(v185, v186); - float32x4_t v194 = vaddq_f32(v189, v190); - float32x4_t v196 = vsubq_f32(v185, v186); - float32x4_t v197 = vsubq_f32(v189, v190); - float32x4_t v199 = vaddq_f32(v187, v188); - float32x4_t v200 = vaddq_f32(v191, v192); - float32x4_t v202 = vsubq_f32(v187, v188); - float32x4_t v203 = vsubq_f32(v191, v192); - float32x4_t v233 = vmulq_f32(v171, v232); - float32x4_t v238 = vmulq_f32(v172, v237); - float32x4_t v243 = vmulq_f32(v173, v242); - float32x4_t v248 = vmulq_f32(v174, v247); - float32x4_t v400 = vrev64q_f32(v205); - float32x4_t v410 = vmulq_f32(v408, v409); - float32x4_t v418 = vmulq_f32(v416, v417); - float32x4_t v424 = vrev64q_f32(v206); - float32x4_t v434 = vmulq_f32(v432, v433); - float32x4_t v442 = vmulq_f32(v440, v441); - float32x4_t v177 = vaddq_f32(v169, v170); - float32x4_t v178 = vsubq_f32(v169, v170); - float32x4_t v181 = vsubq_f32(v180, v179); - float32x4_t v184 = vaddq_f32(v175, v176); - float32x4_t 
v195 = vaddq_f32(v193, v194); - float32x4_t v198 = vaddq_f32(v196, v197); - float32x4_t v201 = vaddq_f32(v199, v200); - float32x4_t v204 = vaddq_f32(v202, v203); - float32x4_t v207 = vsubq_f32(v200, v194); - float32x4_t v210 = vsubq_f32(v193, v199); - float32x4_t v253 = vmulq_f32(v175, v252); - float32x4_t v258 = vmulq_f32(v176, v257); - float32x4_t v273 = vmulq_f32(v179, v272); - float32x4_t v278 = vmulq_f32(v180, v277); - float32x4_t v288 = vmulq_f32(v182, v287); - float32x4_t v293 = vmulq_f32(v183, v292); - float32x4_t v304 = vrev64q_f32(v193); - float32x4_t v312 = vrev64q_f32(v194); - float32x4_t v328 = vrev64q_f32(v196); - float32x4_t v336 = vrev64q_f32(v197); - float32x4_t v352 = vrev64q_f32(v199); - float32x4_t v360 = vrev64q_f32(v200); - float32x4_t v376 = vrev64q_f32(v202); - float32x4_t v384 = vrev64q_f32(v203); - float32x4_t v402 = vmulq_f32(v400, v401); - float32x4_t v426 = vmulq_f32(v424, v425); - float32x4_t v208 = vaddq_f32(v207, v38); - float32x4_t v211 = vaddq_f32(v210, v92); - float32x4_t v223 = vaddq_f32(v222, v177); - float32x4_t v263 = vmulq_f32(v177, v262); - float32x4_t v268 = vmulq_f32(v178, v267); - float32x4_t v283 = vmulq_f32(v181, v282); - float32x4_t v298 = vmulq_f32(v184, v297); - float32x4_t v306 = vmulq_f32(v304, v305); - float32x4_t v314 = vmulq_f32(v312, v313); - float32x4_t v320 = vrev64q_f32(v195); - float32x4_t v330 = vmulq_f32(v328, v329); - float32x4_t v338 = vmulq_f32(v336, v337); - float32x4_t v344 = vrev64q_f32(v198); - float32x4_t v354 = vmulq_f32(v352, v353); - float32x4_t v362 = vmulq_f32(v360, v361); - float32x4_t v368 = vrev64q_f32(v201); - float32x4_t v378 = vmulq_f32(v376, v377); - float32x4_t v386 = vmulq_f32(v384, v385); - float32x4_t v392 = vrev64q_f32(v204); - float32x4_t v469 = vaddq_f32(v248, v288); - float32x4_t v470 = vsubq_f32(v288, v243); - float32x4_t v471 = vaddq_f32(v238, v293); - float32x4_t v472 = vsubq_f32(v233, v293); - float32x4_t v209 = vsubq_f32(v208, v206); - float32x4_t v212 = vaddq_f32(v211, 
v110); - float32x4_t v322 = vmulq_f32(v320, v321); - float32x4_t v346 = vmulq_f32(v344, v345); - float32x4_t v370 = vmulq_f32(v368, v369); - float32x4_t v394 = vmulq_f32(v392, v393); - float32x4_t v467 = vaddq_f32(v273, v283); - float32x4_t v468 = vsubq_f32(v278, v283); - float32x4_t v473 = vsubq_f32(v298, v258); - float32x4_t v474 = vaddq_f32(v298, v253); - float32x4_t v475 = vaddq_f32(v263, v223); - int16x4_t v543 = vqmovn_s32(vcvtq_n_s32_f32(v223, 15)); - float32x4_t v213 = vsubq_f32(v212, v164); - float32x4_t v448 = vrev64q_f32(v209); - float32x4_t v476 = vaddq_f32(v268, v475); - float32x4_t v477 = vsubq_f32(v475, v268); - float32x4_t v478 = vsubq_f32(v467, v469); - float32x4_t v480 = vaddq_f32(v468, v470); - float32x4_t v482 = vaddq_f32(v467, v471); - float32x4_t v484 = vaddq_f32(v468, v472); - float32x4_t v494 = vaddq_f32(v306, v322); - float32x4_t v495 = vaddq_f32(v314, v322); - float32x4_t v496 = vaddq_f32(v330, v346); - float32x4_t v497 = vaddq_f32(v338, v346); - float32x4_t v498 = vaddq_f32(v354, v370); - float32x4_t v499 = vaddq_f32(v362, v370); - float32x4_t v500 = vaddq_f32(v378, v394); - float32x4_t v501 = vaddq_f32(v386, v394); - vst1_s16((int16_t *)v1434, v543); - float32x4_t v214 = vaddq_f32(v209, v213); - float32x4_t v450 = vmulq_f32(v448, v449); - float32x4_t v456 = vrev64q_f32(v213); - float32x4_t v479 = vaddq_f32(v473, v476); - float32x4_t v481 = vaddq_f32(v474, v477); - float32x4_t v483 = vsubq_f32(v476, v473); - float32x4_t v485 = vsubq_f32(v477, v474); - float32x4_t v505 = vaddq_f32(v494, v496); - float32x4_t v506 = vsubq_f32(v494, v496); - float32x4_t v507 = vaddq_f32(v495, v497); - float32x4_t v508 = vsubq_f32(v495, v497); - float32x4_t v509 = vaddq_f32(v498, v500); - float32x4_t v510 = vsubq_f32(v500, v498); - float32x4_t v511 = vaddq_f32(v499, v501); - float32x4_t v512 = vsubq_f32(v501, v499); - float32x4_t v458 = vmulq_f32(v456, v457); - float32x4_t v464 = vrev64q_f32(v214); - float32x4_t v486 = vaddq_f32(v478, v479); - float32x4_t v487 
= vaddq_f32(v480, v481); - float32x4_t v488 = vaddq_f32(v482, v483); - float32x4_t v489 = vaddq_f32(v484, v485); - float32x4_t v490 = vsubq_f32(v479, v478); - float32x4_t v491 = vsubq_f32(v481, v480); - float32x4_t v492 = vsubq_f32(v483, v482); - float32x4_t v493 = vsubq_f32(v485, v484); - float32x4_t v522 = vaddq_f32(v507, v511); - float32x4_t v524 = vaddq_f32(v506, v512); - float32x4_t v526 = vsubq_f32(v505, v509); - float32x4_t v528 = vsubq_f32(v512, v506); - float32x4_t v530 = vaddq_f32(v505, v509); - float32x4_t v533 = vsubq_f32(v510, v508); - float32x4_t v536 = vsubq_f32(v511, v507); - float32x4_t v539 = vaddq_f32(v508, v510); - float32x4_t v466 = vmulq_f32(v464, v465); - float32x4_t v513 = vsubq_f32(v450, v458); - float32x4_t v502 = vaddq_f32(v466, v458); - float32x4_t v515 = vaddq_f32(v513, v513); - float32x4_t v540 = vsubq_f32(v539, v513); - float32x4_t v503 = vaddq_f32(v402, v502); - float32x4_t v516 = vsubq_f32(v426, v515); - float32x4_t v519 = vaddq_f32(v502, v502); - float32x4_t v537 = vaddq_f32(v536, v515); - float32x4_t v585 = vaddq_f32(v493, v540); - float32x4_t v594 = vsubq_f32(v493, v540); - float32x4_t v504 = vaddq_f32(v503, v410); - float32x4_t v514 = vaddq_f32(v503, v418); - float32x4_t v517 = vaddq_f32(v516, v434); - float32x4_t v518 = vaddq_f32(v516, v442); - float32x4_t v520 = vaddq_f32(v519, v519); - float32x4_t v521 = vaddq_f32(v513, v519); - float32x4_t v527 = vaddq_f32(v526, v519); - float32x4_t v538 = vaddq_f32(v537, v519); - int16x4_t v588 = vqmovn_s32(vcvtq_n_s32_f32(v585, 15)); - int16x4_t v597 = vqmovn_s32(vcvtq_n_s32_f32(v594, 15)); - float32x4_t v523 = vaddq_f32(v522, v514); - float32x4_t v525 = vaddq_f32(v524, v517); - float32x4_t v529 = vsubq_f32(v528, v521); - float32x4_t v531 = vaddq_f32(v530, v504); - float32x4_t v534 = vsubq_f32(v533, v518); - float32x4_t v567 = vaddq_f32(v488, v527); - float32x4_t v576 = vsubq_f32(v488, v527); - float32x4_t v675 = vaddq_f32(v492, v538); - float32x4_t v684 = vsubq_f32(v492, v538); - 
vst1_s16((int16_t *)v1479, v588); - vst1_s16((int16_t *)v1488, v597); - float32x4_t v532 = vaddq_f32(v531, v513); - float32x4_t v535 = vaddq_f32(v534, v520); - float32x4_t v549 = vaddq_f32(v486, v523); - float32x4_t v558 = vsubq_f32(v486, v523); - int16x4_t v570 = vqmovn_s32(vcvtq_n_s32_f32(v567, 15)); - int16x4_t v579 = vqmovn_s32(vcvtq_n_s32_f32(v576, 15)); - float32x4_t v621 = vaddq_f32(v489, v529); - float32x4_t v630 = vsubq_f32(v489, v529); - float32x4_t v639 = vaddq_f32(v487, v525); - float32x4_t v648 = vsubq_f32(v487, v525); - int16x4_t v678 = vqmovn_s32(vcvtq_n_s32_f32(v675, 15)); - int16x4_t v687 = vqmovn_s32(vcvtq_n_s32_f32(v684, 15)); - int16x4_t v552 = vqmovn_s32(vcvtq_n_s32_f32(v549, 15)); - int16x4_t v561 = vqmovn_s32(vcvtq_n_s32_f32(v558, 15)); - float32x4_t v603 = vaddq_f32(v490, v532); - float32x4_t v612 = vsubq_f32(v490, v532); - int16x4_t v624 = vqmovn_s32(vcvtq_n_s32_f32(v621, 15)); - int16x4_t v633 = vqmovn_s32(vcvtq_n_s32_f32(v630, 15)); - int16x4_t v642 = vqmovn_s32(vcvtq_n_s32_f32(v639, 15)); - int16x4_t v651 = vqmovn_s32(vcvtq_n_s32_f32(v648, 15)); - float32x4_t v657 = vaddq_f32(v491, v535); - float32x4_t v666 = vsubq_f32(v491, v535); - vst1_s16((int16_t *)v1461, v570); - vst1_s16((int16_t *)v1470, v579); - vst1_s16((int16_t *)v1569, v678); - vst1_s16((int16_t *)v1578, v687); - int16x4_t v606 = vqmovn_s32(vcvtq_n_s32_f32(v603, 15)); - int16x4_t v615 = vqmovn_s32(vcvtq_n_s32_f32(v612, 15)); - int16x4_t v660 = vqmovn_s32(vcvtq_n_s32_f32(v657, 15)); - int16x4_t v669 = vqmovn_s32(vcvtq_n_s32_f32(v666, 15)); - vst1_s16((int16_t *)v1443, v552); - vst1_s16((int16_t *)v1452, v561); - vst1_s16((int16_t *)v1515, v624); - vst1_s16((int16_t *)v1524, v633); - vst1_s16((int16_t *)v1533, v642); - vst1_s16((int16_t *)v1542, v651); - vst1_s16((int16_t *)v1497, v606); - vst1_s16((int16_t *)v1506, v615); - vst1_s16((int16_t *)v1551, v660); - vst1_s16((int16_t *)v1560, v669); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v693 * 2; j < howmany; j += 1) { - 
int16x4_t v705 = vld1s_s16(&v5[istride]); - float v875 = -4.2602849117736000e-02F; - float v879 = 2.0497965023262180e-01F; - float v883 = 1.0451835201736759e+00F; - float v887 = 1.7645848660222969e+00F; - float v891 = -7.2340797728605655e-01F; - float v895 = -8.9055591620606403e-02F; - float v899 = -1.0625000000000000e+00F; - float v903 = 2.5769410160110379e-01F; - float v907 = 7.7980260789483757e-01F; - float v911 = 5.4389318464570580e-01F; - float v915 = 4.2010193497052700e-01F; - float v919 = 1.2810929434228073e+00F; - float v923 = 4.4088907348175338e-01F; - float v927 = 3.1717619283272508e-01F; - float v930 = -9.0138318648016680e-01F; - float v931 = 9.0138318648016680e-01F; - float v937 = -4.3248756360072310e-01F; - float v938 = 4.3248756360072310e-01F; - float v944 = 6.6693537504044498e-01F; - float v945 = -6.6693537504044498e-01F; - float v951 = -6.0389004312516970e-01F; - float v952 = 6.0389004312516970e-01F; - float v958 = -3.6924873198582547e-01F; - float v959 = 3.6924873198582547e-01F; - float v965 = 4.8656938755549761e-01F; - float v966 = -4.8656938755549761e-01F; - float v972 = 2.3813712136760609e-01F; - float v973 = -2.3813712136760609e-01F; - float v979 = -1.5573820617422458e+00F; - float v980 = 1.5573820617422458e+00F; - float v986 = 6.5962247018731990e-01F; - float v987 = -6.5962247018731990e-01F; - float v993 = -1.4316961569866241e-01F; - float v994 = 1.4316961569866241e-01F; - float v1000 = 2.3903469959860771e-01F; - float v1001 = -2.3903469959860771e-01F; - float v1007 = -4.7932541949972603e-02F; - float v1008 = 4.7932541949972603e-02F; - float v1014 = -2.3188014856550065e+00F; - float v1015 = 2.3188014856550065e+00F; - float v1021 = 7.8914568419206255e-01F; - float v1022 = -7.8914568419206255e-01F; - float v1028 = 3.8484572871179505e+00F; - float v1029 = -3.8484572871179505e+00F; - float v1035 = -1.3003804568801376e+00F; - float v1036 = 1.3003804568801376e+00F; - float v1042 = 4.0814769046889037e+00F; - float v1043 = -4.0814769046889037e+00F; - 
float v1049 = -1.4807159909286283e+00F; - float v1050 = 1.4807159909286283e+00F; - float v1056 = -1.3332470363551400e-02F; - float v1057 = 1.3332470363551400e-02F; - float v1063 = -3.7139778690557629e-01F; - float v1064 = 3.7139778690557629e-01F; - float v1070 = 1.9236512863456379e-01F; - float v1071 = -1.9236512863456379e-01F; - float32x2_t v1073 = (float32x2_t){v4, v4}; - float32x2_t v706 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v705)), 15); - int16x4_t v867 = vld1s_s16(&v5[0]); - float32x2_t v876 = (float32x2_t){v875, v875}; - float32x2_t v880 = (float32x2_t){v879, v879}; - float32x2_t v884 = (float32x2_t){v883, v883}; - float32x2_t v888 = (float32x2_t){v887, v887}; - float32x2_t v892 = (float32x2_t){v891, v891}; - float32x2_t v896 = (float32x2_t){v895, v895}; - float32x2_t v900 = (float32x2_t){v899, v899}; - float32x2_t v904 = (float32x2_t){v903, v903}; - float32x2_t v908 = (float32x2_t){v907, v907}; - float32x2_t v912 = (float32x2_t){v911, v911}; - float32x2_t v916 = (float32x2_t){v915, v915}; - float32x2_t v920 = (float32x2_t){v919, v919}; - float32x2_t v924 = (float32x2_t){v923, v923}; - float32x2_t v928 = (float32x2_t){v927, v927}; - float32x2_t v932 = (float32x2_t){v930, v931}; - float32x2_t v939 = (float32x2_t){v937, v938}; - float32x2_t v946 = (float32x2_t){v944, v945}; - float32x2_t v953 = (float32x2_t){v951, v952}; - float32x2_t v960 = (float32x2_t){v958, v959}; - float32x2_t v967 = (float32x2_t){v965, v966}; - float32x2_t v974 = (float32x2_t){v972, v973}; - float32x2_t v981 = (float32x2_t){v979, v980}; - float32x2_t v988 = (float32x2_t){v986, v987}; - float32x2_t v995 = (float32x2_t){v993, v994}; - float32x2_t v1002 = (float32x2_t){v1000, v1001}; - float32x2_t v1009 = (float32x2_t){v1007, v1008}; - float32x2_t v1016 = (float32x2_t){v1014, v1015}; - float32x2_t v1023 = (float32x2_t){v1021, v1022}; - float32x2_t v1030 = (float32x2_t){v1028, v1029}; - float32x2_t v1037 = (float32x2_t){v1035, v1036}; - float32x2_t v1044 = (float32x2_t){v1042, v1043}; - 
float32x2_t v1051 = (float32x2_t){v1049, v1050}; - float32x2_t v1058 = (float32x2_t){v1056, v1057}; - float32x2_t v1065 = (float32x2_t){v1063, v1064}; - float32x2_t v1072 = (float32x2_t){v1070, v1071}; - int16x4_t v711 = vld1s_s16(&v5[istride * 16]); - int16x4_t v719 = vld1s_s16(&v5[istride * 3]); - int16x4_t v725 = vld1s_s16(&v5[istride * 14]); - int16x4_t v733 = vld1s_s16(&v5[istride * 9]); - int16x4_t v739 = vld1s_s16(&v5[istride * 8]); - int16x4_t v747 = vld1s_s16(&v5[istride * 10]); - int16x4_t v753 = vld1s_s16(&v5[istride * 7]); - int16x4_t v761 = vld1s_s16(&v5[istride * 13]); - int16x4_t v767 = vld1s_s16(&v5[istride * 4]); - int16x4_t v775 = vld1s_s16(&v5[istride * 5]); - int16x4_t v781 = vld1s_s16(&v5[istride * 12]); - int16x4_t v789 = vld1s_s16(&v5[istride * 15]); - int16x4_t v795 = vld1s_s16(&v5[istride * 2]); - int16x4_t v803 = vld1s_s16(&v5[istride * 11]); - int16x4_t v809 = vld1s_s16(&v5[istride * 6]); - float32x2_t v868 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v867)), 15); - float32x2_t v934 = vmul_f32(v1073, v932); - float32x2_t v941 = vmul_f32(v1073, v939); - float32x2_t v948 = vmul_f32(v1073, v946); - float32x2_t v955 = vmul_f32(v1073, v953); - float32x2_t v962 = vmul_f32(v1073, v960); - float32x2_t v969 = vmul_f32(v1073, v967); - float32x2_t v976 = vmul_f32(v1073, v974); - float32x2_t v983 = vmul_f32(v1073, v981); - float32x2_t v990 = vmul_f32(v1073, v988); - float32x2_t v997 = vmul_f32(v1073, v995); - float32x2_t v1004 = vmul_f32(v1073, v1002); - float32x2_t v1011 = vmul_f32(v1073, v1009); - float32x2_t v1018 = vmul_f32(v1073, v1016); - float32x2_t v1025 = vmul_f32(v1073, v1023); - float32x2_t v1032 = vmul_f32(v1073, v1030); - float32x2_t v1039 = vmul_f32(v1073, v1037); - float32x2_t v1046 = vmul_f32(v1073, v1044); - float32x2_t v1053 = vmul_f32(v1073, v1051); - float32x2_t v1060 = vmul_f32(v1073, v1058); - float32x2_t v1067 = vmul_f32(v1073, v1065); - float32x2_t v1074 = vmul_f32(v1073, v1072); - float32x2_t v712 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v711)), 15); - float32x2_t v720 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v719)), 15); - float32x2_t v726 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v725)), 15); - float32x2_t v734 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v733)), 15); - float32x2_t v740 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v739)), 15); - float32x2_t v748 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v747)), 15); - float32x2_t v754 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v753)), 15); - float32x2_t v762 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v761)), 15); - float32x2_t v768 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v767)), 15); - float32x2_t v776 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v775)), 15); - float32x2_t v782 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v781)), 15); - float32x2_t v790 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v789)), 15); - float32x2_t v796 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v795)), 15); - float32x2_t v804 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v803)), 15); - float32x2_t v810 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v809)), 15); - float32x2_t v713 = vadd_f32(v706, v712); - float32x2_t v714 = vsub_f32(v706, v712); - float32x2_t v727 = vadd_f32(v720, v726); - float32x2_t v728 = vsub_f32(v720, v726); - float32x2_t v741 = vadd_f32(v734, v740); - float32x2_t v742 = vsub_f32(v734, v740); - float32x2_t v755 = vadd_f32(v748, v754); - float32x2_t v756 = vsub_f32(v748, v754); - float32x2_t v769 = vadd_f32(v762, v768); - float32x2_t v770 = vsub_f32(v762, v768); - float32x2_t v783 = vadd_f32(v776, v782); - float32x2_t v784 = vsub_f32(v776, v782); - float32x2_t v797 = vadd_f32(v790, v796); - float32x2_t v798 = vsub_f32(v790, v796); - float32x2_t v811 = vadd_f32(v804, v810); - float32x2_t v812 = vsub_f32(v804, v810); - float32x2_t v813 = vadd_f32(v713, v769); - float32x2_t v814 = vadd_f32(v727, v783); - float32x2_t v815 = vadd_f32(v741, v797); - float32x2_t v816 = vadd_f32(v755, v811); - float32x2_t v819 = vsub_f32(v713, v769); - float32x2_t v820 = vsub_f32(v727, 
v783); - float32x2_t v821 = vsub_f32(v741, v797); - float32x2_t v822 = vsub_f32(v755, v811); - float32x2_t v833 = vadd_f32(v714, v742); - float32x2_t v834 = vadd_f32(v728, v756); - float32x2_t v835 = vsub_f32(v714, v742); - float32x2_t v836 = vsub_f32(v812, v784); - float32x2_t v837 = vadd_f32(v770, v798); - float32x2_t v838 = vadd_f32(v784, v812); - float32x2_t v839 = vsub_f32(v770, v798); - float32x2_t v840 = vsub_f32(v728, v756); - float32x2_t v853 = vadd_f32(v714, v770); - float32x2_t v854 = vadd_f32(v756, v812); - float32x2_t v1026 = vrev64_f32(v714); - float32x2_t v1033 = vrev64_f32(v770); - float32x2_t v1047 = vrev64_f32(v756); - float32x2_t v1054 = vrev64_f32(v812); - float32x2_t v817 = vadd_f32(v813, v815); - float32x2_t v818 = vadd_f32(v814, v816); - float32x2_t v823 = vsub_f32(v813, v815); - float32x2_t v824 = vsub_f32(v814, v816); - float32x2_t v827 = vadd_f32(v820, v822); - float32x2_t v828 = vadd_f32(v819, v821); - float32x2_t v830 = vsub_f32(v821, v822); - float32x2_t v831 = vsub_f32(v819, v820); - float32x2_t v841 = vadd_f32(v833, v834); - float32x2_t v842 = vadd_f32(v837, v838); - float32x2_t v844 = vsub_f32(v833, v834); - float32x2_t v845 = vsub_f32(v837, v838); - float32x2_t v847 = vadd_f32(v835, v836); - float32x2_t v848 = vadd_f32(v839, v840); - float32x2_t v850 = vsub_f32(v835, v836); - float32x2_t v851 = vsub_f32(v839, v840); - float32x2_t v877 = vmul_f32(v819, v876); - float32x2_t v881 = vmul_f32(v820, v880); - float32x2_t v885 = vmul_f32(v821, v884); - float32x2_t v889 = vmul_f32(v822, v888); - float32x2_t v1019 = vrev64_f32(v853); - float32x2_t v1027 = vmul_f32(v1026, v1025); - float32x2_t v1034 = vmul_f32(v1033, v1032); - float32x2_t v1040 = vrev64_f32(v854); - float32x2_t v1048 = vmul_f32(v1047, v1046); - float32x2_t v1055 = vmul_f32(v1054, v1053); - float32x2_t v825 = vadd_f32(v817, v818); - float32x2_t v826 = vsub_f32(v817, v818); - float32x2_t v829 = vsub_f32(v828, v827); - float32x2_t v832 = vadd_f32(v823, v824); - float32x2_t v843 = 
vadd_f32(v841, v842); - float32x2_t v846 = vadd_f32(v844, v845); - float32x2_t v849 = vadd_f32(v847, v848); - float32x2_t v852 = vadd_f32(v850, v851); - float32x2_t v855 = vsub_f32(v848, v842); - float32x2_t v858 = vsub_f32(v841, v847); - float32x2_t v893 = vmul_f32(v823, v892); - float32x2_t v897 = vmul_f32(v824, v896); - float32x2_t v909 = vmul_f32(v827, v908); - float32x2_t v913 = vmul_f32(v828, v912); - float32x2_t v921 = vmul_f32(v830, v920); - float32x2_t v925 = vmul_f32(v831, v924); - float32x2_t v935 = vrev64_f32(v841); - float32x2_t v942 = vrev64_f32(v842); - float32x2_t v956 = vrev64_f32(v844); - float32x2_t v963 = vrev64_f32(v845); - float32x2_t v977 = vrev64_f32(v847); - float32x2_t v984 = vrev64_f32(v848); - float32x2_t v998 = vrev64_f32(v850); - float32x2_t v1005 = vrev64_f32(v851); - float32x2_t v1020 = vmul_f32(v1019, v1018); - float32x2_t v1041 = vmul_f32(v1040, v1039); - float32x2_t v856 = vadd_f32(v855, v714); - float32x2_t v859 = vadd_f32(v858, v756); - float32x2_t v869 = vadd_f32(v868, v825); - float32x2_t v901 = vmul_f32(v825, v900); - float32x2_t v905 = vmul_f32(v826, v904); - float32x2_t v917 = vmul_f32(v829, v916); - float32x2_t v929 = vmul_f32(v832, v928); - float32x2_t v936 = vmul_f32(v935, v934); - float32x2_t v943 = vmul_f32(v942, v941); - float32x2_t v949 = vrev64_f32(v843); - float32x2_t v957 = vmul_f32(v956, v955); - float32x2_t v964 = vmul_f32(v963, v962); - float32x2_t v970 = vrev64_f32(v846); - float32x2_t v978 = vmul_f32(v977, v976); - float32x2_t v985 = vmul_f32(v984, v983); - float32x2_t v991 = vrev64_f32(v849); - float32x2_t v999 = vmul_f32(v998, v997); - float32x2_t v1006 = vmul_f32(v1005, v1004); - float32x2_t v1012 = vrev64_f32(v852); - float32x2_t v1079 = vadd_f32(v889, v921); - float32x2_t v1080 = vsub_f32(v921, v885); - float32x2_t v1081 = vadd_f32(v881, v925); - float32x2_t v1082 = vsub_f32(v877, v925); - float32x2_t v857 = vsub_f32(v856, v854); - float32x2_t v860 = vadd_f32(v859, v770); - float32x2_t v950 = 
vmul_f32(v949, v948); - float32x2_t v971 = vmul_f32(v970, v969); - float32x2_t v992 = vmul_f32(v991, v990); - float32x2_t v1013 = vmul_f32(v1012, v1011); - float32x2_t v1077 = vadd_f32(v909, v917); - float32x2_t v1078 = vsub_f32(v913, v917); - float32x2_t v1083 = vsub_f32(v929, v897); - float32x2_t v1084 = vadd_f32(v929, v893); - float32x2_t v1085 = vadd_f32(v901, v869); - int16x4_t v1153 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v869, 15), (int32x2_t){0, 0})); - float32x2_t v861 = vsub_f32(v860, v812); - float32x2_t v1061 = vrev64_f32(v857); - float32x2_t v1086 = vadd_f32(v905, v1085); - float32x2_t v1087 = vsub_f32(v1085, v905); - float32x2_t v1088 = vsub_f32(v1077, v1079); - float32x2_t v1090 = vadd_f32(v1078, v1080); - float32x2_t v1092 = vadd_f32(v1077, v1081); - float32x2_t v1094 = vadd_f32(v1078, v1082); - float32x2_t v1104 = vadd_f32(v936, v950); - float32x2_t v1105 = vadd_f32(v943, v950); - float32x2_t v1106 = vadd_f32(v957, v971); - float32x2_t v1107 = vadd_f32(v964, v971); - float32x2_t v1108 = vadd_f32(v978, v992); - float32x2_t v1109 = vadd_f32(v985, v992); - float32x2_t v1110 = vadd_f32(v999, v1013); - float32x2_t v1111 = vadd_f32(v1006, v1013); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1153), 0); - float32x2_t v862 = vadd_f32(v857, v861); - float32x2_t v1062 = vmul_f32(v1061, v1060); - float32x2_t v1068 = vrev64_f32(v861); - float32x2_t v1089 = vadd_f32(v1083, v1086); - float32x2_t v1091 = vadd_f32(v1084, v1087); - float32x2_t v1093 = vsub_f32(v1086, v1083); - float32x2_t v1095 = vsub_f32(v1087, v1084); - float32x2_t v1115 = vadd_f32(v1104, v1106); - float32x2_t v1116 = vsub_f32(v1104, v1106); - float32x2_t v1117 = vadd_f32(v1105, v1107); - float32x2_t v1118 = vsub_f32(v1105, v1107); - float32x2_t v1119 = vadd_f32(v1108, v1110); - float32x2_t v1120 = vsub_f32(v1110, v1108); - float32x2_t v1121 = vadd_f32(v1109, v1111); - float32x2_t v1122 = vsub_f32(v1111, v1109); - float32x2_t v1069 = vmul_f32(v1068, v1067); - float32x2_t v1075 = 
vrev64_f32(v862); - float32x2_t v1096 = vadd_f32(v1088, v1089); - float32x2_t v1097 = vadd_f32(v1090, v1091); - float32x2_t v1098 = vadd_f32(v1092, v1093); - float32x2_t v1099 = vadd_f32(v1094, v1095); - float32x2_t v1100 = vsub_f32(v1089, v1088); - float32x2_t v1101 = vsub_f32(v1091, v1090); - float32x2_t v1102 = vsub_f32(v1093, v1092); - float32x2_t v1103 = vsub_f32(v1095, v1094); - float32x2_t v1132 = vadd_f32(v1117, v1121); - float32x2_t v1134 = vadd_f32(v1116, v1122); - float32x2_t v1136 = vsub_f32(v1115, v1119); - float32x2_t v1138 = vsub_f32(v1122, v1116); - float32x2_t v1140 = vadd_f32(v1115, v1119); - float32x2_t v1143 = vsub_f32(v1120, v1118); - float32x2_t v1146 = vsub_f32(v1121, v1117); - float32x2_t v1149 = vadd_f32(v1118, v1120); - float32x2_t v1076 = vmul_f32(v1075, v1074); - float32x2_t v1123 = vsub_f32(v1062, v1069); - float32x2_t v1112 = vadd_f32(v1076, v1069); - float32x2_t v1125 = vadd_f32(v1123, v1123); - float32x2_t v1150 = vsub_f32(v1149, v1123); - float32x2_t v1113 = vadd_f32(v1020, v1112); - float32x2_t v1126 = vsub_f32(v1041, v1125); - float32x2_t v1129 = vadd_f32(v1112, v1112); - float32x2_t v1147 = vadd_f32(v1146, v1125); - float32x2_t v1185 = vadd_f32(v1103, v1150); - float32x2_t v1192 = vsub_f32(v1103, v1150); - float32x2_t v1114 = vadd_f32(v1113, v1027); - float32x2_t v1124 = vadd_f32(v1113, v1034); - float32x2_t v1127 = vadd_f32(v1126, v1048); - float32x2_t v1128 = vadd_f32(v1126, v1055); - float32x2_t v1130 = vadd_f32(v1129, v1129); - float32x2_t v1131 = vadd_f32(v1123, v1129); - float32x2_t v1137 = vadd_f32(v1136, v1129); - float32x2_t v1148 = vadd_f32(v1147, v1129); - int16x4_t v1188 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1185, 15), (int32x2_t){0, 0})); - int16x4_t v1195 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1192, 15), (int32x2_t){0, 0})); - float32x2_t v1133 = vadd_f32(v1132, v1124); - float32x2_t v1135 = vadd_f32(v1134, v1127); - float32x2_t v1139 = vsub_f32(v1138, v1131); - float32x2_t v1141 = vadd_f32(v1140, v1114); 
- float32x2_t v1144 = vsub_f32(v1143, v1128); - float32x2_t v1171 = vadd_f32(v1098, v1137); - float32x2_t v1178 = vsub_f32(v1098, v1137); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1188), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1195), 0); - float32x2_t v1255 = vadd_f32(v1102, v1148); - float32x2_t v1262 = vsub_f32(v1102, v1148); - float32x2_t v1142 = vadd_f32(v1141, v1123); - float32x2_t v1145 = vadd_f32(v1144, v1130); - float32x2_t v1157 = vadd_f32(v1096, v1133); - float32x2_t v1164 = vsub_f32(v1096, v1133); - int16x4_t v1174 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1171, 15), (int32x2_t){0, 0})); - int16x4_t v1181 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1178, 15), (int32x2_t){0, 0})); - float32x2_t v1213 = vadd_f32(v1099, v1139); - float32x2_t v1220 = vsub_f32(v1099, v1139); - float32x2_t v1227 = vadd_f32(v1097, v1135); - float32x2_t v1234 = vsub_f32(v1097, v1135); - int16x4_t v1258 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1255, 15), (int32x2_t){0, 0})); - int16x4_t v1265 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1262, 15), (int32x2_t){0, 0})); - int16x4_t v1160 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1157, 15), (int32x2_t){0, 0})); - int16x4_t v1167 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1164, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1174), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1181), 0); - float32x2_t v1199 = vadd_f32(v1100, v1142); - float32x2_t v1206 = vsub_f32(v1100, v1142); - int16x4_t v1216 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1213, 15), (int32x2_t){0, 0})); - int16x4_t v1223 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1220, 15), (int32x2_t){0, 0})); - int16x4_t v1230 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1227, 15), (int32x2_t){0, 0})); - int16x4_t v1237 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1234, 15), (int32x2_t){0, 0})); - float32x2_t v1241 = vadd_f32(v1101, v1145); - float32x2_t v1248 = vsub_f32(v1101, v1145); - 
v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1258), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1265), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1160), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1167), 0); - int16x4_t v1202 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1199, 15), (int32x2_t){0, 0})); - int16x4_t v1209 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1206, 15), (int32x2_t){0, 0})); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1216), 0); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1223), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1230), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1237), 0); - int16x4_t v1244 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1241, 15), (int32x2_t){0, 0})); - int16x4_t v1251 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1248, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1202), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1209), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1244), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1251), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu17(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v227 = -4.2602849117736000e-02F; - float v232 = 2.0497965023262180e-01F; - float v237 = 1.0451835201736759e+00F; - float v242 = 1.7645848660222969e+00F; - float v247 = -7.2340797728605655e-01F; - float v252 = 
-8.9055591620606403e-02F; - float v257 = -1.0625000000000000e+00F; - float v262 = 2.5769410160110379e-01F; - float v267 = 7.7980260789483757e-01F; - float v272 = 5.4389318464570580e-01F; - float v277 = 4.2010193497052700e-01F; - float v282 = 1.2810929434228073e+00F; - float v287 = 4.4088907348175338e-01F; - float v292 = 3.1717619283272508e-01F; - float v297 = 9.0138318648016680e-01F; - float v304 = 4.3248756360072310e-01F; - float v311 = -6.6693537504044498e-01F; - float v318 = 6.0389004312516970e-01F; - float v325 = 3.6924873198582547e-01F; - float v332 = -4.8656938755549761e-01F; - float v339 = -2.3813712136760609e-01F; - float v346 = 1.5573820617422458e+00F; - float v353 = -6.5962247018731990e-01F; - float v360 = 1.4316961569866241e-01F; - float v367 = -2.3903469959860771e-01F; - float v374 = 4.7932541949972603e-02F; - float v381 = 2.3188014856550065e+00F; - float v388 = -7.8914568419206255e-01F; - float v395 = -3.8484572871179505e+00F; - float v402 = 1.3003804568801376e+00F; - float v409 = -4.0814769046889037e+00F; - float v416 = 1.4807159909286283e+00F; - float v423 = 1.3332470363551400e-02F; - float v430 = 3.7139778690557629e-01F; - float v437 = -1.9236512863456379e-01F; - const int32_t *v675 = &v5[v0]; - int32_t *v875 = &v6[v2]; - int64_t v27 = v0 * 16; - int64_t v37 = v0 * 3; - int64_t v45 = v0 * 14; - int64_t v55 = v0 * 9; - int64_t v63 = v0 * 8; - int64_t v73 = v0 * 10; - int64_t v81 = v0 * 7; - int64_t v91 = v0 * 13; - int64_t v99 = v0 * 4; - int64_t v109 = v0 * 5; - int64_t v117 = v0 * 12; - int64_t v127 = v0 * 15; - int64_t v135 = v0 * 2; - int64_t v145 = v0 * 11; - int64_t v153 = v0 * 6; - float v300 = v4 * v297; - float v307 = v4 * v304; - float v314 = v4 * v311; - float v321 = v4 * v318; - float v328 = v4 * v325; - float v335 = v4 * v332; - float v342 = v4 * v339; - float v349 = v4 * v346; - float v356 = v4 * v353; - float v363 = v4 * v360; - float v370 = v4 * v367; - float v377 = v4 * v374; - float v384 = v4 * v381; - float v391 = v4 * v388; - 
float v398 = v4 * v395; - float v405 = v4 * v402; - float v412 = v4 * v409; - float v419 = v4 * v416; - float v426 = v4 * v423; - float v433 = v4 * v430; - float v440 = v4 * v437; - int64_t v536 = v2 * 16; - int64_t v545 = v2 * 2; - int64_t v554 = v2 * 15; - int64_t v563 = v2 * 3; - int64_t v572 = v2 * 14; - int64_t v581 = v2 * 4; - int64_t v590 = v2 * 13; - int64_t v599 = v2 * 5; - int64_t v608 = v2 * 12; - int64_t v617 = v2 * 6; - int64_t v626 = v2 * 11; - int64_t v635 = v2 * 7; - int64_t v644 = v2 * 10; - int64_t v653 = v2 * 8; - int64_t v662 = v2 * 9; - const int32_t *v820 = &v5[0]; - svfloat32_t v824 = svdup_n_f32(v227); - svfloat32_t v825 = svdup_n_f32(v232); - svfloat32_t v826 = svdup_n_f32(v237); - svfloat32_t v827 = svdup_n_f32(v242); - svfloat32_t v828 = svdup_n_f32(v247); - svfloat32_t v829 = svdup_n_f32(v252); - svfloat32_t v830 = svdup_n_f32(v257); - svfloat32_t v831 = svdup_n_f32(v262); - svfloat32_t v832 = svdup_n_f32(v267); - svfloat32_t v833 = svdup_n_f32(v272); - svfloat32_t v834 = svdup_n_f32(v277); - svfloat32_t v835 = svdup_n_f32(v282); - svfloat32_t v836 = svdup_n_f32(v287); - svfloat32_t v837 = svdup_n_f32(v292); - int32_t *v866 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v675[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v684 = &v5[v27]; - const int32_t *v693 = &v5[v37]; - const int32_t *v702 = &v5[v45]; - const int32_t *v711 = &v5[v55]; - const int32_t *v720 = &v5[v63]; - const int32_t *v729 = &v5[v73]; - const int32_t *v738 = &v5[v81]; - const int32_t *v747 = &v5[v91]; - const int32_t *v756 = &v5[v99]; - const int32_t *v765 = &v5[v109]; - const int32_t *v774 = &v5[v117]; - const int32_t *v783 = &v5[v127]; - const int32_t *v792 = &v5[v135]; - const int32_t *v801 = &v5[v145]; - const int32_t *v810 = &v5[v153]; - svfloat32_t v838 = svdup_n_f32(v300); - svfloat32_t v839 = svdup_n_f32(v307); - svfloat32_t v840 = svdup_n_f32(v314); - svfloat32_t v841 = 
svdup_n_f32(v321); - svfloat32_t v842 = svdup_n_f32(v328); - svfloat32_t v843 = svdup_n_f32(v335); - svfloat32_t v844 = svdup_n_f32(v342); - svfloat32_t v845 = svdup_n_f32(v349); - svfloat32_t v846 = svdup_n_f32(v356); - svfloat32_t v847 = svdup_n_f32(v363); - svfloat32_t v848 = svdup_n_f32(v370); - svfloat32_t v849 = svdup_n_f32(v377); - svfloat32_t v850 = svdup_n_f32(v384); - svfloat32_t v851 = svdup_n_f32(v391); - svfloat32_t v852 = svdup_n_f32(v398); - svfloat32_t v853 = svdup_n_f32(v405); - svfloat32_t v854 = svdup_n_f32(v412); - svfloat32_t v855 = svdup_n_f32(v419); - svfloat32_t v856 = svdup_n_f32(v426); - svfloat32_t v857 = svdup_n_f32(v433); - svfloat32_t v858 = svdup_n_f32(v440); - int32_t *v884 = &v6[v536]; - int32_t *v893 = &v6[v545]; - int32_t *v902 = &v6[v554]; - int32_t *v911 = &v6[v563]; - int32_t *v920 = &v6[v572]; - int32_t *v929 = &v6[v581]; - int32_t *v938 = &v6[v590]; - int32_t *v947 = &v6[v599]; - int32_t *v956 = &v6[v608]; - int32_t *v965 = &v6[v617]; - int32_t *v974 = &v6[v626]; - int32_t *v983 = &v6[v635]; - int32_t *v992 = &v6[v644]; - int32_t *v1001 = &v6[v653]; - int32_t *v1010 = &v6[v662]; - svfloat32_t v219 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v820[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v684[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v693[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v702[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v711[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, 
- svld1sh_s32(pred_full, (const int16_t *)&v720[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v729[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v738[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v747[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v105 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v756[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v115 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v765[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v123 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v774[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v783[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v792[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v801[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v159 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v810[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - 
svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v162; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v162) : "w"(v34), "w"(v106)); - svfloat32_t v163; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v163) : "w"(v52), "w"(v124)); - svfloat32_t v164; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v70), "w"(v142)); - svfloat32_t v165; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v165) : "w"(v88), "w"(v160)); - svfloat32_t v168; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v34), "w"(v106)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v52), "w"(v124)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v70), "w"(v142)); - svfloat32_t v171; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v171) : "w"(v88), "w"(v160)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v35), "w"(v71)); - svfloat32_t v183; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v53), "w"(v89)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v35), "w"(v71)); - 
svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v161), "w"(v125)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v107), "w"(v143)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v125), "w"(v161)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v107), "w"(v143)); - svfloat32_t v189; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v53), "w"(v89)); - svfloat32_t v202; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v202) : "w"(v35), "w"(v107)); - svfloat32_t v203; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v203) : "w"(v89), "w"(v161)); - svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v162), "w"(v164)); - svfloat32_t v167; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v163), "w"(v165)); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v162), "w"(v164)); - svfloat32_t v173; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v173) : "w"(v163), "w"(v165)); - svfloat32_t v176; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v176) : "w"(v169), "w"(v171)); - svfloat32_t v177; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v177) : "w"(v168), "w"(v170)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v170), "w"(v171)); - svfloat32_t v180; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v168), "w"(v169)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v182), "w"(v183)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v186), "w"(v187)); - svfloat32_t v193; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v182), "w"(v183)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v186), "w"(v187)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v184), "w"(v185)); - svfloat32_t v197; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v188), "w"(v189)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v184), "w"(v185)); - svfloat32_t v200; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v200) : 
"w"(v188), "w"(v189)); - svfloat32_t v240; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v170), "w"(v826)); - svfloat32_t zero407; - asm volatile("mov %0.s, #0" : "=w"(zero407)); - svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v853, v203, 90); - svfloat32_t v174; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v174) : "w"(v166), "w"(v167)); - svfloat32_t v175; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v175) : "w"(v166), "w"(v167)); - svfloat32_t v178; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v177), "w"(v176)); - svfloat32_t v181; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v172), "w"(v173)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v190), "w"(v191)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v193), "w"(v194)); - svfloat32_t v198; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v196), "w"(v197)); - svfloat32_t v201; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v201) : "w"(v199), "w"(v200)); - svfloat32_t v204; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v197), "w"(v191)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v190), "w"(v196)); - svfloat32_t v250; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v172), "w"(v828)); - svfloat32_t v255; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v255) : "w"(v173), "w"(v829)); - svfloat32_t v285; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v179), "w"(v835)); - svfloat32_t v290; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v180), "w"(v836)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v204), "w"(v35)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v207), "w"(v89)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v219), "w"(v174)); - svfloat32_t v280; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v178), "w"(v834)); - svfloat32_t zero316; - asm volatile("mov %0.s, #0" : "=w"(zero316)); - svfloat32_t v316 = svcmla_f32_x(pred_full, zero316, v840, v192, 90); - svfloat32_t zero337; 
- asm volatile("mov %0.s, #0" : "=w"(zero337)); - svfloat32_t v337 = svcmla_f32_x(pred_full, zero337, v843, v195, 90); - svfloat32_t zero358; - asm volatile("mov %0.s, #0" : "=w"(zero358)); - svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v846, v198, 90); - svfloat32_t zero379; - asm volatile("mov %0.s, #0" : "=w"(zero379)); - svfloat32_t v379 = svcmla_f32_x(pred_full, zero379, v849, v201, 90); - svfloat32_t v445 = svmla_f32_x(pred_full, v285, v171, v827); - svfloat32_t v446 = svnmls_f32_x(pred_full, v240, v179, v835); - svfloat32_t v447 = svmla_f32_x(pred_full, v290, v169, v825); - svfloat32_t v448 = svnmls_f32_x(pred_full, v290, v168, v824); - svfloat32_t v206; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v205), "w"(v203)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v208), "w"(v107)); - svfloat32_t v443 = svmla_f32_x(pred_full, v280, v176, v832); - svfloat32_t v444 = svnmls_f32_x(pred_full, v280, v177, v833); - svfloat32_t v449 = svnmls_f32_x(pred_full, v255, v181, v837); - svfloat32_t v450 = svmla_f32_x(pred_full, v250, v181, v837); - svfloat32_t v451 = svmla_f32_x(pred_full, v220, v174, v830); - svfloat32_t v470 = svcmla_f32_x(pred_full, v316, v838, v190, 90); - svfloat32_t v471 = svcmla_f32_x(pred_full, v316, v839, v191, 90); - svfloat32_t v472 = svcmla_f32_x(pred_full, v337, v841, v193, 90); - svfloat32_t v473 = svcmla_f32_x(pred_full, v337, v842, v194, 90); - svfloat32_t v474 = svcmla_f32_x(pred_full, v358, v844, v196, 90); - svfloat32_t v475 = svcmla_f32_x(pred_full, v358, v845, v197, 90); - svfloat32_t v476 = svcmla_f32_x(pred_full, v379, v847, v199, 90); - svfloat32_t v477 = svcmla_f32_x(pred_full, v379, v848, v200, 90); - svint16_t v519 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v220, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v209), 
"w"(v161)); - svfloat32_t zero428; - asm volatile("mov %0.s, #0" : "=w"(zero428)); - svfloat32_t v428 = svcmla_f32_x(pred_full, zero428, v856, v206, 90); - svfloat32_t v452 = svmla_f32_x(pred_full, v451, v175, v831); - svfloat32_t v453 = svmls_f32_x(pred_full, v451, v175, v831); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v443), "w"(v445)); - svfloat32_t v456; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v444), "w"(v446)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v443), "w"(v447)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v444), "w"(v448)); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v470), "w"(v472)); - svfloat32_t v482; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v470), "w"(v472)); - svfloat32_t v483; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v471), "w"(v473)); - svfloat32_t v484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v484) : "w"(v471), "w"(v473)); - svfloat32_t v485; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v485) : "w"(v474), "w"(v476)); - svfloat32_t v486; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v486) : "w"(v476), "w"(v474)); - svfloat32_t v487; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v487) : "w"(v475), "w"(v477)); - svfloat32_t v488; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v477), "w"(v475)); - svst1w_u64(pred_full, (unsigned *)(v866), svreinterpret_u64_s16(v519)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v206), "w"(v210)); - svfloat32_t zero435; - asm volatile("mov %0.s, #0" : "=w"(zero435)); - svfloat32_t v435 = svcmla_f32_x(pred_full, zero435, v857, v210, 90); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v449), "w"(v452)); - svfloat32_t v457; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v450), "w"(v453)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v452), "w"(v449)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v453), "w"(v450)); - 
svfloat32_t v498; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v483), "w"(v487)); - svfloat32_t v500; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v482), "w"(v488)); - svfloat32_t v502; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v481), "w"(v485)); - svfloat32_t v504; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v488), "w"(v482)); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v481), "w"(v485)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v486), "w"(v484)); - svfloat32_t v512; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v512) : "w"(v487), "w"(v483)); - svfloat32_t v515; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v484), "w"(v486)); - svfloat32_t v462; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v454), "w"(v455)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v456), "w"(v457)); - svfloat32_t v464; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v464) : "w"(v458), "w"(v459)); - svfloat32_t v465; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v465) : "w"(v460), "w"(v461)); - svfloat32_t v466; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v466) : "w"(v455), "w"(v454)); - svfloat32_t v467; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v467) : "w"(v457), "w"(v456)); - svfloat32_t v468; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v468) : "w"(v459), "w"(v458)); - svfloat32_t v469; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v461), "w"(v460)); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v428), "w"(v435)); - svfloat32_t v478 = svcmla_f32_x(pred_full, v435, v858, v211, 90); - svfloat32_t v491; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v491) : "w"(v489), "w"(v489)); - svfloat32_t v516; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v515), "w"(v489)); - svfloat32_t v479 = svcmla_f32_x(pred_full, v478, v850, v202, 90); - svfloat32_t v492; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v492) : "w"(v407), "w"(v491)); - svfloat32_t v495; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v495) : "w"(v478), "w"(v478)); - svfloat32_t 
v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v491)); - svfloat32_t v561; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v469), "w"(v516)); - svfloat32_t v570; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v469), "w"(v516)); - svfloat32_t v480 = svcmla_f32_x(pred_full, v479, v851, v35, 90); - svfloat32_t v490 = svcmla_f32_x(pred_full, v479, v852, v107, 90); - svfloat32_t v493 = svcmla_f32_x(pred_full, v492, v854, v89, 90); - svfloat32_t v494 = svcmla_f32_x(pred_full, v492, v855, v161, 90); - svfloat32_t v496; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v496) : "w"(v495), "w"(v495)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v489), "w"(v495)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v502), "w"(v495)); - svfloat32_t v514; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v513), "w"(v495)); - svint16_t v564 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v561, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v573 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v570, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v490)); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v493)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v504), "w"(v497)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v506), "w"(v480)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v509), "w"(v494)); - svfloat32_t v543; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v543) : "w"(v464), "w"(v503)); - svfloat32_t v552; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v552) : "w"(v464), "w"(v503)); - svfloat32_t v651; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v651) : "w"(v468), "w"(v514)); - svfloat32_t v660; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v468), "w"(v514)); - svst1w_u64(pred_full, (unsigned *)(v911), svreinterpret_u64_s16(v564)); - svst1w_u64(pred_full, (unsigned *)(v920), svreinterpret_u64_s16(v573)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v507), "w"(v489)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v510), "w"(v496)); - svfloat32_t v525; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v462), "w"(v499)); - svfloat32_t v534; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v534) : "w"(v462), "w"(v499)); - svint16_t v546 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v543, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v555 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v552, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v597; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v465), "w"(v505)); - svfloat32_t v606; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v465), "w"(v505)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v463), "w"(v501)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v463), "w"(v501)); - svint16_t v654 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v651, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v663 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v660, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v528 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, 
svmul_n_f32_x(pred_full, v525, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v537 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v534, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v579; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v466), "w"(v508)); - svfloat32_t v588; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v466), "w"(v508)); - svint16_t v600 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v597, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v609 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v606, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v618 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v615, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v627 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v633; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v633) : "w"(v467), "w"(v511)); - svfloat32_t v642; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v467), "w"(v511)); - svst1w_u64(pred_full, (unsigned *)(v893), svreinterpret_u64_s16(v546)); - svst1w_u64(pred_full, (unsigned *)(v902), svreinterpret_u64_s16(v555)); - svst1w_u64(pred_full, (unsigned *)(v1001), svreinterpret_u64_s16(v654)); - svst1w_u64(pred_full, (unsigned *)(v1010), svreinterpret_u64_s16(v663)); - svint16_t v582 = svtbl_s16( - 
svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v579, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v591 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v588, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v636 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v633, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v645 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v642, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v875), svreinterpret_u64_s16(v528)); - svst1w_u64(pred_full, (unsigned *)(v884), svreinterpret_u64_s16(v537)); - svst1w_u64(pred_full, (unsigned *)(v947), svreinterpret_u64_s16(v600)); - svst1w_u64(pred_full, (unsigned *)(v956), svreinterpret_u64_s16(v609)); - svst1w_u64(pred_full, (unsigned *)(v965), svreinterpret_u64_s16(v618)); - svst1w_u64(pred_full, (unsigned *)(v974), svreinterpret_u64_s16(v627)); - svst1w_u64(pred_full, (unsigned *)(v929), svreinterpret_u64_s16(v582)); - svst1w_u64(pred_full, (unsigned *)(v938), svreinterpret_u64_s16(v591)); - svst1w_u64(pred_full, (unsigned *)(v983), svreinterpret_u64_s16(v636)); - svst1w_u64(pred_full, (unsigned *)(v992), svreinterpret_u64_s16(v645)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t 
v557 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v324 = -5.0000000000000000e-01F; - float v337 = -1.4999999999999998e+00F; - float v341 = 8.6602540378443871e-01F; - float v342 = -8.6602540378443871e-01F; - float v350 = 7.6604444311897801e-01F; - float v355 = 9.3969262078590832e-01F; - float v360 = -1.7364817766693039e-01F; - float v364 = 6.4278760968653925e-01F; - float v365 = -6.4278760968653925e-01F; - float v372 = -3.4202014332566888e-01F; - float v373 = 3.4202014332566888e-01F; - float v380 = 9.8480775301220802e-01F; - float v381 = -9.8480775301220802e-01F; - float32x2_t v383 = (float32x2_t){v4, v4}; - const int32_t *v1117 = &v5[istride]; - int32_t *v1208 = &v6[ostride]; - float32x2_t v325 = (float32x2_t){v324, v324}; - float32x2_t v338 = (float32x2_t){v337, v337}; - float32x2_t v343 = (float32x2_t){v341, v342}; - float32x2_t v351 = (float32x2_t){v350, v350}; - float32x2_t v356 = (float32x2_t){v355, v355}; - float32x2_t v361 = (float32x2_t){v360, v360}; - float32x2_t v366 = (float32x2_t){v364, v365}; - float32x2_t v374 = (float32x2_t){v372, v373}; - float32x2_t v382 = (float32x2_t){v380, v381}; - const int32_t *v1018 = &v5[0]; - int32_t *v1181 = &v6[0]; - int16x4_t v1360 = vld1_s16((const int16_t *)v1117); - float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1360), 15); - float32x4_t v326 = vcombine_f32(v325, v325); - float32x4_t v339 = vcombine_f32(v338, v338); - float32x2_t v345 = vmul_f32(v383, v343); - float32x4_t v352 = vcombine_f32(v351, v351); - float32x4_t v357 = vcombine_f32(v356, v356); - float32x4_t v362 = vcombine_f32(v361, v361); - float32x2_t v368 = vmul_f32(v383, v366); - float32x2_t v376 = vmul_f32(v383, v374); - float32x2_t v384 = vmul_f32(v383, v382); - const int32_t *v1027 = &v5[istride * 9]; - const int32_t *v1036 = &v5[istride * 2]; - const int32_t *v1045 = &v5[istride * 11]; - const int32_t *v1054 = &v5[istride * 4]; - const int32_t *v1063 = &v5[istride * 13]; - const int32_t *v1072 = &v5[istride * 6]; - const int32_t *v1081 = 
&v5[istride * 15]; - const int32_t *v1090 = &v5[istride * 8]; - const int32_t *v1099 = &v5[istride * 17]; - const int32_t *v1108 = &v5[istride * 10]; - const int32_t *v1126 = &v5[istride * 12]; - const int32_t *v1135 = &v5[istride * 3]; - const int32_t *v1144 = &v5[istride * 14]; - const int32_t *v1153 = &v5[istride * 5]; - const int32_t *v1162 = &v5[istride * 16]; - const int32_t *v1171 = &v5[istride * 7]; - int32_t *v1190 = &v6[ostride * 9]; - int32_t *v1199 = &v6[ostride * 10]; - int32_t *v1217 = &v6[ostride * 2]; - int32_t *v1226 = &v6[ostride * 11]; - int32_t *v1235 = &v6[ostride * 12]; - int32_t *v1244 = &v6[ostride * 3]; - int32_t *v1253 = &v6[ostride * 4]; - int32_t *v1262 = &v6[ostride * 13]; - int32_t *v1271 = &v6[ostride * 14]; - int32_t *v1280 = &v6[ostride * 5]; - int32_t *v1289 = &v6[ostride * 6]; - int32_t *v1298 = &v6[ostride * 15]; - int32_t *v1307 = &v6[ostride * 16]; - int32_t *v1316 = &v6[ostride * 7]; - int32_t *v1325 = &v6[ostride * 8]; - int32_t *v1334 = &v6[ostride * 17]; - int16x4_t v1338 = vld1_s16((const int16_t *)v1018); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1338), 15); - float32x4_t v347 = vcombine_f32(v345, v345); - float32x4_t v370 = vcombine_f32(v368, v368); - float32x4_t v378 = vcombine_f32(v376, v376); - float32x4_t v386 = vcombine_f32(v384, v384); - int16x4_t v1340 = vld1_s16((const int16_t *)v1027); - int16x4_t v1342 = vld1_s16((const int16_t *)v1036); - int16x4_t v1344 = vld1_s16((const int16_t *)v1045); - int16x4_t v1346 = vld1_s16((const int16_t *)v1054); - int16x4_t v1348 = vld1_s16((const int16_t *)v1063); - int16x4_t v1350 = vld1_s16((const int16_t *)v1072); - int16x4_t v1352 = vld1_s16((const int16_t *)v1081); - int16x4_t v1354 = vld1_s16((const int16_t *)v1090); - int16x4_t v1356 = vld1_s16((const int16_t *)v1099); - int16x4_t v1358 = vld1_s16((const int16_t *)v1108); - int16x4_t v1362 = vld1_s16((const int16_t *)v1126); - int16x4_t v1364 = vld1_s16((const int16_t *)v1135); - int16x4_t v1366 = vld1_s16((const 
int16_t *)v1144); - int16x4_t v1368 = vld1_s16((const int16_t *)v1153); - int16x4_t v1370 = vld1_s16((const int16_t *)v1162); - int16x4_t v1372 = vld1_s16((const int16_t *)v1171); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1340), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1342), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1344), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v1346), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v1348), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1350), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1352), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1354), 15); - float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1356), 15); - float32x4_t v118 = vcvtq_n_f32_s32(vmovl_s16(v1358), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1362), 15); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1364), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1366), 15); - float32x4_t v162 = vcvtq_n_f32_s32(vmovl_s16(v1368), 15); - float32x4_t v172 = vcvtq_n_f32_s32(vmovl_s16(v1370), 15); - float32x4_t v180 = vcvtq_n_f32_s32(vmovl_s16(v1372), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v109 = vaddq_f32(v100, v108); - float32x4_t v110 = vsubq_f32(v100, v108); - float32x4_t v127 = vaddq_f32(v118, v126); - float32x4_t v128 = vsubq_f32(v118, v126); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v163 = vaddq_f32(v154, v162); - float32x4_t v164 = vsubq_f32(v154, v162); - float32x4_t v181 = vaddq_f32(v172, v180); - float32x4_t v182 = vsubq_f32(v172, v180); - float32x4_t v183 = vaddq_f32(v55, v181); - float32x4_t v184 = vsubq_f32(v55, v181); - 
float32x4_t v185 = vaddq_f32(v163, v73); - float32x4_t v186 = vsubq_f32(v163, v73); - float32x4_t v187 = vaddq_f32(v91, v145); - float32x4_t v188 = vsubq_f32(v91, v145); - float32x4_t v189 = vaddq_f32(v109, v127); - float32x4_t v190 = vsubq_f32(v109, v127); - float32x4_t v298 = vaddq_f32(v56, v182); - float32x4_t v299 = vsubq_f32(v56, v182); - float32x4_t v300 = vaddq_f32(v164, v74); - float32x4_t v301 = vsubq_f32(v164, v74); - float32x4_t v302 = vaddq_f32(v92, v146); - float32x4_t v303 = vsubq_f32(v92, v146); - float32x4_t v304 = vaddq_f32(v110, v128); - float32x4_t v305 = vsubq_f32(v110, v128); - float32x4_t v191 = vaddq_f32(v183, v185); - float32x4_t v195 = vaddq_f32(v184, v186); - float32x4_t v197 = vsubq_f32(v183, v185); - float32x4_t v198 = vsubq_f32(v185, v189); - float32x4_t v199 = vsubq_f32(v189, v183); - float32x4_t v200 = vsubq_f32(v184, v186); - float32x4_t v201 = vsubq_f32(v186, v190); - float32x4_t v202 = vsubq_f32(v190, v184); - float32x4_t v225 = vmulq_f32(v187, v339); - float32x4_t v231 = vrev64q_f32(v188); - float32x4_t v306 = vaddq_f32(v298, v300); - float32x4_t v310 = vaddq_f32(v299, v301); - float32x4_t v312 = vsubq_f32(v298, v300); - float32x4_t v313 = vsubq_f32(v300, v304); - float32x4_t v314 = vsubq_f32(v304, v298); - float32x4_t v315 = vsubq_f32(v299, v301); - float32x4_t v316 = vsubq_f32(v301, v305); - float32x4_t v317 = vsubq_f32(v305, v299); - float32x4_t v340 = vmulq_f32(v302, v339); - float32x4_t v346 = vrev64q_f32(v303); - float32x4_t v192 = vaddq_f32(v191, v189); - float32x4_t v196 = vaddq_f32(v195, v190); - float32x4_t v233 = vmulq_f32(v231, v347); - float32x4_t v238 = vmulq_f32(v197, v352); - float32x4_t v243 = vmulq_f32(v198, v357); - float32x4_t v248 = vmulq_f32(v199, v362); - float32x4_t v254 = vrev64q_f32(v200); - float32x4_t v262 = vrev64q_f32(v201); - float32x4_t v270 = vrev64q_f32(v202); - float32x4_t v307 = vaddq_f32(v306, v304); - float32x4_t v311 = vaddq_f32(v310, v305); - float32x4_t v348 = vmulq_f32(v346, v347); - 
float32x4_t v353 = vmulq_f32(v312, v352); - float32x4_t v358 = vmulq_f32(v313, v357); - float32x4_t v363 = vmulq_f32(v314, v362); - float32x4_t v369 = vrev64q_f32(v315); - float32x4_t v377 = vrev64q_f32(v316); - float32x4_t v385 = vrev64q_f32(v317); - float32x4_t v193 = vaddq_f32(v192, v187); - float32x4_t v212 = vmulq_f32(v192, v326); - float32x4_t v218 = vrev64q_f32(v196); - float32x4_t v256 = vmulq_f32(v254, v370); - float32x4_t v264 = vmulq_f32(v262, v378); - float32x4_t v272 = vmulq_f32(v270, v386); - float32x4_t v308 = vaddq_f32(v307, v302); - float32x4_t v327 = vmulq_f32(v307, v326); - float32x4_t v333 = vrev64q_f32(v311); - float32x4_t v371 = vmulq_f32(v369, v370); - float32x4_t v379 = vmulq_f32(v377, v378); - float32x4_t v387 = vmulq_f32(v385, v386); - float32x4_t v194 = vaddq_f32(v193, v37); - float32x4_t v220 = vmulq_f32(v218, v347); - float32x4_t v273 = vaddq_f32(v212, v212); - float32x4_t v286 = vaddq_f32(v233, v256); - float32x4_t v288 = vsubq_f32(v233, v264); - float32x4_t v290 = vsubq_f32(v233, v256); - float32x4_t v309 = vaddq_f32(v308, v38); - float32x4_t v335 = vmulq_f32(v333, v347); - float32x4_t v388 = vaddq_f32(v327, v327); - float32x4_t v401 = vaddq_f32(v348, v371); - float32x4_t v403 = vsubq_f32(v348, v379); - float32x4_t v405 = vsubq_f32(v348, v371); - float32x4_t v274 = vaddq_f32(v273, v212); - float32x4_t v278 = vaddq_f32(v194, v225); - float32x4_t v287 = vaddq_f32(v286, v264); - float32x4_t v289 = vaddq_f32(v288, v272); - float32x4_t v291 = vsubq_f32(v290, v272); - float32x4_t v389 = vaddq_f32(v388, v327); - float32x4_t v393 = vaddq_f32(v309, v340); - float32x4_t v402 = vaddq_f32(v401, v379); - float32x4_t v404 = vaddq_f32(v403, v387); - float32x4_t v406 = vsubq_f32(v405, v387); - int16x4_t v415 = vqmovn_s32(vcvtq_n_s32_f32(v194, 15)); - int16x4_t v423 = vqmovn_s32(vcvtq_n_s32_f32(v309, 15)); - float32x4_t v275 = vaddq_f32(v194, v274); - float32x4_t v279 = vaddq_f32(v278, v273); - float32x4_t v390 = vaddq_f32(v309, v389); - float32x4_t 
v394 = vaddq_f32(v393, v388); - vst1_s16((int16_t *)v1181, v415); - vst1_s16((int16_t *)v1190, v423); - float32x4_t v276 = vaddq_f32(v275, v220); - float32x4_t v277 = vsubq_f32(v275, v220); - float32x4_t v280 = vaddq_f32(v279, v238); - float32x4_t v282 = vsubq_f32(v279, v243); - float32x4_t v284 = vsubq_f32(v279, v238); - float32x4_t v391 = vaddq_f32(v390, v335); - float32x4_t v392 = vsubq_f32(v390, v335); - float32x4_t v395 = vaddq_f32(v394, v353); - float32x4_t v397 = vsubq_f32(v394, v358); - float32x4_t v399 = vsubq_f32(v394, v353); - float32x4_t v281 = vaddq_f32(v280, v243); - float32x4_t v283 = vaddq_f32(v282, v248); - float32x4_t v285 = vsubq_f32(v284, v248); - float32x4_t v396 = vaddq_f32(v395, v358); - float32x4_t v398 = vaddq_f32(v397, v363); - float32x4_t v400 = vsubq_f32(v399, v363); - int16x4_t v463 = vqmovn_s32(vcvtq_n_s32_f32(v277, 15)); - int16x4_t v471 = vqmovn_s32(vcvtq_n_s32_f32(v392, 15)); - int16x4_t v511 = vqmovn_s32(vcvtq_n_s32_f32(v276, 15)); - int16x4_t v519 = vqmovn_s32(vcvtq_n_s32_f32(v391, 15)); - float32x4_t v292 = vaddq_f32(v281, v287); - float32x4_t v293 = vsubq_f32(v281, v287); - float32x4_t v294 = vaddq_f32(v283, v289); - float32x4_t v295 = vsubq_f32(v283, v289); - float32x4_t v296 = vaddq_f32(v285, v291); - float32x4_t v297 = vsubq_f32(v285, v291); - float32x4_t v407 = vaddq_f32(v396, v402); - float32x4_t v408 = vsubq_f32(v396, v402); - float32x4_t v409 = vaddq_f32(v398, v404); - float32x4_t v410 = vsubq_f32(v398, v404); - float32x4_t v411 = vaddq_f32(v400, v406); - float32x4_t v412 = vsubq_f32(v400, v406); - vst1_s16((int16_t *)v1235, v463); - vst1_s16((int16_t *)v1244, v471); - vst1_s16((int16_t *)v1289, v511); - vst1_s16((int16_t *)v1298, v519); - int16x4_t v431 = vqmovn_s32(vcvtq_n_s32_f32(v293, 15)); - int16x4_t v439 = vqmovn_s32(vcvtq_n_s32_f32(v408, 15)); - int16x4_t v447 = vqmovn_s32(vcvtq_n_s32_f32(v294, 15)); - int16x4_t v455 = vqmovn_s32(vcvtq_n_s32_f32(v409, 15)); - int16x4_t v479 = vqmovn_s32(vcvtq_n_s32_f32(v297, 15)); 
- int16x4_t v487 = vqmovn_s32(vcvtq_n_s32_f32(v412, 15)); - int16x4_t v495 = vqmovn_s32(vcvtq_n_s32_f32(v296, 15)); - int16x4_t v503 = vqmovn_s32(vcvtq_n_s32_f32(v411, 15)); - int16x4_t v527 = vqmovn_s32(vcvtq_n_s32_f32(v295, 15)); - int16x4_t v535 = vqmovn_s32(vcvtq_n_s32_f32(v410, 15)); - int16x4_t v543 = vqmovn_s32(vcvtq_n_s32_f32(v292, 15)); - int16x4_t v551 = vqmovn_s32(vcvtq_n_s32_f32(v407, 15)); - vst1_s16((int16_t *)v1199, v431); - vst1_s16((int16_t *)v1208, v439); - vst1_s16((int16_t *)v1217, v447); - vst1_s16((int16_t *)v1226, v455); - vst1_s16((int16_t *)v1253, v479); - vst1_s16((int16_t *)v1262, v487); - vst1_s16((int16_t *)v1271, v495); - vst1_s16((int16_t *)v1280, v503); - vst1_s16((int16_t *)v1307, v527); - vst1_s16((int16_t *)v1316, v535); - vst1_s16((int16_t *)v1325, v543); - vst1_s16((int16_t *)v1334, v551); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v557 * 2; j < howmany; j += 1) { - int16x4_t v645 = vld1s_s16(&v5[istride]); - float v820 = -5.0000000000000000e-01F; - float v831 = -1.4999999999999998e+00F; - float v834 = 8.6602540378443871e-01F; - float v835 = -8.6602540378443871e-01F; - float v842 = 7.6604444311897801e-01F; - float v846 = 9.3969262078590832e-01F; - float v850 = -1.7364817766693039e-01F; - float v853 = 6.4278760968653925e-01F; - float v854 = -6.4278760968653925e-01F; - float v860 = -3.4202014332566888e-01F; - float v861 = 3.4202014332566888e-01F; - float v867 = 9.8480775301220802e-01F; - float v868 = -9.8480775301220802e-01F; - float32x2_t v870 = (float32x2_t){v4, v4}; - int16x4_t v569 = vld1s_s16(&v5[0]); - float32x2_t v646 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v645)), 15); - float32x2_t v821 = (float32x2_t){v820, v820}; - float32x2_t v832 = (float32x2_t){v831, v831}; - float32x2_t v836 = (float32x2_t){v834, v835}; - float32x2_t v843 = (float32x2_t){v842, v842}; - float32x2_t v847 = (float32x2_t){v846, v846}; - float32x2_t v851 = (float32x2_t){v850, v850}; - float32x2_t v855 = (float32x2_t){v853, v854}; - float32x2_t v862 
= (float32x2_t){v860, v861}; - float32x2_t v869 = (float32x2_t){v867, v868}; - float32x2_t v570 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v569)), 15); - int16x4_t v575 = vld1s_s16(&v5[istride * 9]); - int16x4_t v583 = vld1s_s16(&v5[istride * 2]); - int16x4_t v589 = vld1s_s16(&v5[istride * 11]); - int16x4_t v597 = vld1s_s16(&v5[istride * 4]); - int16x4_t v603 = vld1s_s16(&v5[istride * 13]); - int16x4_t v611 = vld1s_s16(&v5[istride * 6]); - int16x4_t v617 = vld1s_s16(&v5[istride * 15]); - int16x4_t v625 = vld1s_s16(&v5[istride * 8]); - int16x4_t v631 = vld1s_s16(&v5[istride * 17]); - int16x4_t v639 = vld1s_s16(&v5[istride * 10]); - int16x4_t v653 = vld1s_s16(&v5[istride * 12]); - int16x4_t v659 = vld1s_s16(&v5[istride * 3]); - int16x4_t v667 = vld1s_s16(&v5[istride * 14]); - int16x4_t v673 = vld1s_s16(&v5[istride * 5]); - int16x4_t v681 = vld1s_s16(&v5[istride * 16]); - int16x4_t v687 = vld1s_s16(&v5[istride * 7]); - float32x2_t v838 = vmul_f32(v870, v836); - float32x2_t v857 = vmul_f32(v870, v855); - float32x2_t v864 = vmul_f32(v870, v862); - float32x2_t v871 = vmul_f32(v870, v869); - float32x2_t v576 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v575)), 15); - float32x2_t v584 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v583)), 15); - float32x2_t v590 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v589)), 15); - float32x2_t v598 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v597)), 15); - float32x2_t v604 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v603)), 15); - float32x2_t v612 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v611)), 15); - float32x2_t v618 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v617)), 15); - float32x2_t v626 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v625)), 15); - float32x2_t v632 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v631)), 15); - float32x2_t v640 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v639)), 15); - float32x2_t v654 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v653)), 15); - float32x2_t v660 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v659)), 15); - float32x2_t v668 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v667)), 15); - float32x2_t v674 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v673)), 15); - float32x2_t v682 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v681)), 15); - float32x2_t v688 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v687)), 15); - float32x2_t v577 = vadd_f32(v570, v576); - float32x2_t v578 = vsub_f32(v570, v576); - float32x2_t v591 = vadd_f32(v584, v590); - float32x2_t v592 = vsub_f32(v584, v590); - float32x2_t v605 = vadd_f32(v598, v604); - float32x2_t v606 = vsub_f32(v598, v604); - float32x2_t v619 = vadd_f32(v612, v618); - float32x2_t v620 = vsub_f32(v612, v618); - float32x2_t v633 = vadd_f32(v626, v632); - float32x2_t v634 = vsub_f32(v626, v632); - float32x2_t v647 = vadd_f32(v640, v646); - float32x2_t v648 = vsub_f32(v640, v646); - float32x2_t v661 = vadd_f32(v654, v660); - float32x2_t v662 = vsub_f32(v654, v660); - float32x2_t v675 = vadd_f32(v668, v674); - float32x2_t v676 = vsub_f32(v668, v674); - float32x2_t v689 = vadd_f32(v682, v688); - float32x2_t v690 = vsub_f32(v682, v688); - float32x2_t v691 = vadd_f32(v591, v689); - float32x2_t v692 = vsub_f32(v591, v689); - float32x2_t v693 = vadd_f32(v675, v605); - float32x2_t v694 = vsub_f32(v675, v605); - float32x2_t v695 = vadd_f32(v619, v661); - float32x2_t v696 = vsub_f32(v619, v661); - float32x2_t v697 = vadd_f32(v633, v647); - float32x2_t v698 = vsub_f32(v633, v647); - float32x2_t v795 = vadd_f32(v592, v690); - float32x2_t v796 = vsub_f32(v592, v690); - float32x2_t v797 = vadd_f32(v676, v606); - float32x2_t v798 = vsub_f32(v676, v606); - float32x2_t v799 = vadd_f32(v620, v662); - float32x2_t v800 = vsub_f32(v620, v662); - float32x2_t v801 = vadd_f32(v634, v648); - float32x2_t v802 = vsub_f32(v634, v648); - float32x2_t v699 = vadd_f32(v691, v693); - float32x2_t v703 = vadd_f32(v692, v694); - float32x2_t v705 = vsub_f32(v691, v693); - float32x2_t v706 = vsub_f32(v693, v697); - float32x2_t v707 = vsub_f32(v697, v691); - float32x2_t v708 = vsub_f32(v692, v694); - 
float32x2_t v709 = vsub_f32(v694, v698); - float32x2_t v710 = vsub_f32(v698, v692); - float32x2_t v729 = vmul_f32(v695, v832); - float32x2_t v735 = vrev64_f32(v696); - float32x2_t v803 = vadd_f32(v795, v797); - float32x2_t v807 = vadd_f32(v796, v798); - float32x2_t v809 = vsub_f32(v795, v797); - float32x2_t v810 = vsub_f32(v797, v801); - float32x2_t v811 = vsub_f32(v801, v795); - float32x2_t v812 = vsub_f32(v796, v798); - float32x2_t v813 = vsub_f32(v798, v802); - float32x2_t v814 = vsub_f32(v802, v796); - float32x2_t v833 = vmul_f32(v799, v832); - float32x2_t v839 = vrev64_f32(v800); - float32x2_t v700 = vadd_f32(v699, v697); - float32x2_t v704 = vadd_f32(v703, v698); - float32x2_t v736 = vmul_f32(v735, v838); - float32x2_t v740 = vmul_f32(v705, v843); - float32x2_t v744 = vmul_f32(v706, v847); - float32x2_t v748 = vmul_f32(v707, v851); - float32x2_t v754 = vrev64_f32(v708); - float32x2_t v761 = vrev64_f32(v709); - float32x2_t v768 = vrev64_f32(v710); - float32x2_t v804 = vadd_f32(v803, v801); - float32x2_t v808 = vadd_f32(v807, v802); - float32x2_t v840 = vmul_f32(v839, v838); - float32x2_t v844 = vmul_f32(v809, v843); - float32x2_t v848 = vmul_f32(v810, v847); - float32x2_t v852 = vmul_f32(v811, v851); - float32x2_t v858 = vrev64_f32(v812); - float32x2_t v865 = vrev64_f32(v813); - float32x2_t v872 = vrev64_f32(v814); - float32x2_t v701 = vadd_f32(v700, v695); - float32x2_t v718 = vmul_f32(v700, v821); - float32x2_t v724 = vrev64_f32(v704); - float32x2_t v755 = vmul_f32(v754, v857); - float32x2_t v762 = vmul_f32(v761, v864); - float32x2_t v769 = vmul_f32(v768, v871); - float32x2_t v805 = vadd_f32(v804, v799); - float32x2_t v822 = vmul_f32(v804, v821); - float32x2_t v828 = vrev64_f32(v808); - float32x2_t v859 = vmul_f32(v858, v857); - float32x2_t v866 = vmul_f32(v865, v864); - float32x2_t v873 = vmul_f32(v872, v871); - float32x2_t v702 = vadd_f32(v701, v577); - float32x2_t v725 = vmul_f32(v724, v838); - float32x2_t v770 = vadd_f32(v718, v718); - float32x2_t v783 = 
vadd_f32(v736, v755); - float32x2_t v785 = vsub_f32(v736, v762); - float32x2_t v787 = vsub_f32(v736, v755); - float32x2_t v806 = vadd_f32(v805, v578); - float32x2_t v829 = vmul_f32(v828, v838); - float32x2_t v874 = vadd_f32(v822, v822); - float32x2_t v887 = vadd_f32(v840, v859); - float32x2_t v889 = vsub_f32(v840, v866); - float32x2_t v891 = vsub_f32(v840, v859); - float32x2_t v771 = vadd_f32(v770, v718); - float32x2_t v775 = vadd_f32(v702, v729); - float32x2_t v784 = vadd_f32(v783, v762); - float32x2_t v786 = vadd_f32(v785, v769); - float32x2_t v788 = vsub_f32(v787, v769); - float32x2_t v875 = vadd_f32(v874, v822); - float32x2_t v879 = vadd_f32(v806, v833); - float32x2_t v888 = vadd_f32(v887, v866); - float32x2_t v890 = vadd_f32(v889, v873); - float32x2_t v892 = vsub_f32(v891, v873); - int16x4_t v901 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v702, 15), (int32x2_t){0, 0})); - int16x4_t v907 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v806, 15), (int32x2_t){0, 0})); - float32x2_t v772 = vadd_f32(v702, v771); - float32x2_t v776 = vadd_f32(v775, v770); - float32x2_t v876 = vadd_f32(v806, v875); - float32x2_t v880 = vadd_f32(v879, v874); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v901), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v907), 0); - float32x2_t v773 = vadd_f32(v772, v725); - float32x2_t v774 = vsub_f32(v772, v725); - float32x2_t v777 = vadd_f32(v776, v740); - float32x2_t v779 = vsub_f32(v776, v744); - float32x2_t v781 = vsub_f32(v776, v740); - float32x2_t v877 = vadd_f32(v876, v829); - float32x2_t v878 = vsub_f32(v876, v829); - float32x2_t v881 = vadd_f32(v880, v844); - float32x2_t v883 = vsub_f32(v880, v848); - float32x2_t v885 = vsub_f32(v880, v844); - float32x2_t v778 = vadd_f32(v777, v744); - float32x2_t v780 = vadd_f32(v779, v748); - float32x2_t v782 = vsub_f32(v781, v748); - float32x2_t v882 = vadd_f32(v881, v848); - float32x2_t v884 = vadd_f32(v883, v852); - float32x2_t v886 = vsub_f32(v885, v852); - int16x4_t v937 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v774, 15), (int32x2_t){0, 0})); - int16x4_t v943 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v878, 15), (int32x2_t){0, 0})); - int16x4_t v973 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v773, 15), (int32x2_t){0, 0})); - int16x4_t v979 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v877, 15), (int32x2_t){0, 0})); - float32x2_t v789 = vadd_f32(v778, v784); - float32x2_t v790 = vsub_f32(v778, v784); - float32x2_t v791 = vadd_f32(v780, v786); - float32x2_t v792 = vsub_f32(v780, v786); - float32x2_t v793 = vadd_f32(v782, v788); - float32x2_t v794 = vsub_f32(v782, v788); - float32x2_t v893 = vadd_f32(v882, v888); - float32x2_t v894 = vsub_f32(v882, v888); - float32x2_t v895 = vadd_f32(v884, v890); - float32x2_t v896 = vsub_f32(v884, v890); - float32x2_t v897 = vadd_f32(v886, v892); - float32x2_t v898 = vsub_f32(v886, v892); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v937), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v943), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v973), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v979), 0); - int16x4_t v913 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v790, 15), (int32x2_t){0, 0})); - int16x4_t v919 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v894, 15), (int32x2_t){0, 0})); - int16x4_t v925 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v791, 15), (int32x2_t){0, 0})); - int16x4_t v931 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v895, 15), (int32x2_t){0, 0})); - int16x4_t v949 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v794, 15), (int32x2_t){0, 0})); - int16x4_t v955 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v898, 15), (int32x2_t){0, 0})); - int16x4_t v961 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v793, 15), (int32x2_t){0, 0})); - int16x4_t v967 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v897, 15), (int32x2_t){0, 0})); - int16x4_t v985 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v792, 15), (int32x2_t){0, 0})); - int16x4_t v991 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v896, 15), (int32x2_t){0, 0})); - int16x4_t v997 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v789, 15), (int32x2_t){0, 0})); - int16x4_t v1003 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v893, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v913), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v919), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v925), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v931), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v949), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v955), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v961), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v967), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v985), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v991), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v997), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1003), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu18(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v316 = -5.0000000000000000e-01F; - float v328 = -1.4999999999999998e+00F; - float v333 = -8.6602540378443871e-01F; - float v340 = 7.6604444311897801e-01F; - float v345 = 9.3969262078590832e-01F; - float v350 = -1.7364817766693039e-01F; - float v355 = -6.4278760968653925e-01F; - float v362 = 3.4202014332566888e-01F; - float v369 = -9.8480775301220802e-01F; - 
const int32_t *v650 = &v5[v0]; - int32_t *v763 = &v6[v2]; - int64_t v27 = v0 * 9; - int64_t v37 = v0 * 2; - int64_t v45 = v0 * 11; - int64_t v55 = v0 * 4; - int64_t v63 = v0 * 13; - int64_t v73 = v0 * 6; - int64_t v81 = v0 * 15; - int64_t v91 = v0 * 8; - int64_t v99 = v0 * 17; - int64_t v109 = v0 * 10; - int64_t v127 = v0 * 12; - int64_t v135 = v0 * 3; - int64_t v145 = v0 * 14; - int64_t v153 = v0 * 5; - int64_t v163 = v0 * 16; - int64_t v171 = v0 * 7; - float v336 = v4 * v333; - float v358 = v4 * v355; - float v365 = v4 * v362; - float v372 = v4 * v369; - int64_t v409 = v2 * 9; - int64_t v417 = v2 * 10; - int64_t v433 = v2 * 2; - int64_t v441 = v2 * 11; - int64_t v449 = v2 * 12; - int64_t v457 = v2 * 3; - int64_t v465 = v2 * 4; - int64_t v473 = v2 * 13; - int64_t v481 = v2 * 14; - int64_t v489 = v2 * 5; - int64_t v497 = v2 * 6; - int64_t v505 = v2 * 15; - int64_t v513 = v2 * 16; - int64_t v521 = v2 * 7; - int64_t v529 = v2 * 8; - int64_t v537 = v2 * 17; - const int32_t *v551 = &v5[0]; - svfloat32_t v719 = svdup_n_f32(v316); - svfloat32_t v721 = svdup_n_f32(v328); - svfloat32_t v723 = svdup_n_f32(v340); - svfloat32_t v724 = svdup_n_f32(v345); - svfloat32_t v725 = svdup_n_f32(v350); - int32_t *v736 = &v6[0]; - svfloat32_t v123 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v650[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v560 = &v5[v27]; - const int32_t *v569 = &v5[v37]; - const int32_t *v578 = &v5[v45]; - const int32_t *v587 = &v5[v55]; - const int32_t *v596 = &v5[v63]; - const int32_t *v605 = &v5[v73]; - const int32_t *v614 = &v5[v81]; - const int32_t *v623 = &v5[v91]; - const int32_t *v632 = &v5[v99]; - const int32_t *v641 = &v5[v109]; - const int32_t *v659 = &v5[v127]; - const int32_t *v668 = &v5[v135]; - const int32_t *v677 = &v5[v145]; - const int32_t *v686 = &v5[v153]; - const int32_t *v695 = &v5[v163]; - const int32_t *v704 = &v5[v171]; - svfloat32_t v722 = svdup_n_f32(v336); - svfloat32_t v726 = 
svdup_n_f32(v358); - svfloat32_t v727 = svdup_n_f32(v365); - svfloat32_t v728 = svdup_n_f32(v372); - int32_t *v745 = &v6[v409]; - int32_t *v754 = &v6[v417]; - int32_t *v772 = &v6[v433]; - int32_t *v781 = &v6[v441]; - int32_t *v790 = &v6[v449]; - int32_t *v799 = &v6[v457]; - int32_t *v808 = &v6[v465]; - int32_t *v817 = &v6[v473]; - int32_t *v826 = &v6[v481]; - int32_t *v835 = &v6[v489]; - int32_t *v844 = &v6[v497]; - int32_t *v853 = &v6[v505]; - int32_t *v862 = &v6[v513]; - int32_t *v871 = &v6[v521]; - int32_t *v880 = &v6[v529]; - int32_t *v889 = &v6[v537]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v551[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v560[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v569[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v578[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v587[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v596[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v605[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v614[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v623[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v105 = 
svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v632[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v115 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v641[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v659[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v668[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v677[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v159 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v686[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v169 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v695[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v177 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v704[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), 
"w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v169), "w"(v177)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v177)); - svfloat32_t v180; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v52), "w"(v178)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v52), "w"(v178)); - svfloat32_t v182; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v160), "w"(v70)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v160), "w"(v70)); - svfloat32_t v184; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v88), "w"(v142)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v88), "w"(v142)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v106), "w"(v124)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v106), "w"(v124)); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v53), "w"(v179)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v53), "w"(v179)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v161), "w"(v71)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v161), "w"(v71)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v89), "w"(v143)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v295) 
: "w"(v89), "w"(v143)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v107), "w"(v125)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v107), "w"(v125)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v180), "w"(v182)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v181), "w"(v183)); - svfloat32_t v194; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v180), "w"(v182)); - svfloat32_t v195; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v182), "w"(v186)); - svfloat32_t v196; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v186), "w"(v180)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v181), "w"(v183)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v183), "w"(v187)); - svfloat32_t v199; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v187), "w"(v181)); - svfloat32_t zero228; - asm volatile("mov %0.s, #0" : "=w"(zero228)); - svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v722, v185, 90); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v290), "w"(v292)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v291), "w"(v293)); - svfloat32_t v304; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v290), "w"(v292)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v292), "w"(v296)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v296), "w"(v290)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v291), "w"(v293)); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v293), "w"(v297)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v297), "w"(v291)); - svfloat32_t zero338; - asm volatile("mov %0.s, #0" : "=w"(zero338)); - svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v722, v295, 90); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v186)); - svfloat32_t v193; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v192), "w"(v187)); - svfloat32_t zero250; - asm volatile("mov %0.s, #0" : "=w"(zero250)); - svfloat32_t v250 = svcmla_f32_x(pred_full, zero250, v726, v197, 90); - svfloat32_t zero257; - asm volatile("mov %0.s, #0" : "=w"(zero257)); - svfloat32_t v257 = svcmla_f32_x(pred_full, zero257, v727, v198, 90); - svfloat32_t zero264; - asm volatile("mov %0.s, #0" : "=w"(zero264)); - svfloat32_t v264 = svcmla_f32_x(pred_full, zero264, v728, v199, 90); - svfloat32_t v299; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v298), "w"(v296)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v302), "w"(v297)); - svfloat32_t zero360; - asm volatile("mov %0.s, #0" : "=w"(zero360)); - svfloat32_t v360 = svcmla_f32_x(pred_full, zero360, v726, v307, 90); - svfloat32_t zero367; - asm volatile("mov %0.s, #0" : "=w"(zero367)); - svfloat32_t v367 = svcmla_f32_x(pred_full, zero367, v727, v308, 90); - svfloat32_t zero374; - asm volatile("mov %0.s, #0" : "=w"(zero374)); - svfloat32_t v374 = svcmla_f32_x(pred_full, zero374, v728, v309, 90); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v189), "w"(v184)); - svfloat32_t v209; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v189), "w"(v719)); - svfloat32_t zero216; - asm volatile("mov %0.s, #0" : "=w"(zero216)); - svfloat32_t v216 = svcmla_f32_x(pred_full, zero216, v722, v193, 90); - svfloat32_t v278; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v278) : "w"(v228), "w"(v250)); - svfloat32_t v280; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v280) : "w"(v228), "w"(v257)); - svfloat32_t v282; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v282) : "w"(v228), "w"(v250)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v299), "w"(v294)); - svfloat32_t v319; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v299), "w"(v719)); - svfloat32_t zero326; - asm volatile("mov %0.s, #0" : "=w"(zero326)); - svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v722, v303, 90); - 
svfloat32_t v388; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v338), "w"(v360)); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v338), "w"(v367)); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v338), "w"(v360)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v34)); - svfloat32_t v265; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v265) : "w"(v209), "w"(v209)); - svfloat32_t v279; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v278), "w"(v257)); - svfloat32_t v281; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v281) : "w"(v280), "w"(v264)); - svfloat32_t v283; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v283) : "w"(v282), "w"(v264)); - svfloat32_t v301; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v300), "w"(v35)); - svfloat32_t v375; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v319), "w"(v319)); - svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v388), "w"(v367)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v390), "w"(v374)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v392), "w"(v374)); - svfloat32_t v266 = svmla_f32_x(pred_full, v265, v189, v719); - svfloat32_t v270 = svmla_f32_x(pred_full, v191, v184, v721); - svfloat32_t v376 = svmla_f32_x(pred_full, v375, v299, v719); - svfloat32_t v380 = svmla_f32_x(pred_full, v301, v294, v721); - svint16_t v402 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v191, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v410 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v301, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v267; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v191), "w"(v266)); - svfloat32_t v271; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v271) : "w"(v270), "w"(v265)); - svfloat32_t v377; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v301), "w"(v376)); - svfloat32_t v381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v380), "w"(v375)); - svst1w_u64(pred_full, (unsigned *)(v736), svreinterpret_u64_s16(v402)); - svst1w_u64(pred_full, (unsigned *)(v745), svreinterpret_u64_s16(v410)); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v267), "w"(v216)); - svfloat32_t v269; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v269) : "w"(v267), "w"(v216)); - svfloat32_t v272 = svmla_f32_x(pred_full, v271, v194, v723); - svfloat32_t v274 = svmls_f32_x(pred_full, v271, v195, v724); - svfloat32_t v276 = svmls_f32_x(pred_full, v271, v194, v723); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v377), "w"(v326)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v377), "w"(v326)); - svfloat32_t v382 = svmla_f32_x(pred_full, v381, v304, v723); - svfloat32_t v384 = svmls_f32_x(pred_full, v381, v305, v724); - svfloat32_t v386 = svmls_f32_x(pred_full, v381, v304, v723); - svfloat32_t v273 = svmla_f32_x(pred_full, v272, v195, v724); - svfloat32_t v275 = svmla_f32_x(pred_full, v274, v196, v725); - svfloat32_t v277 = svmls_f32_x(pred_full, v276, v196, v725); - svfloat32_t v383 = svmla_f32_x(pred_full, v382, v305, v724); - svfloat32_t v385 = svmla_f32_x(pred_full, v384, v306, v725); - svfloat32_t v387 = svmls_f32_x(pred_full, v386, v306, v725); - svint16_t v450 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v269, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v458 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v379, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v498 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( 
- pred_full, svmul_n_f32_x(pred_full, v268, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v506 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v378, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v273), "w"(v279)); - svfloat32_t v285; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v273), "w"(v279)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v275), "w"(v281)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v275), "w"(v281)); - svfloat32_t v288; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v277), "w"(v283)); - svfloat32_t v289; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v277), "w"(v283)); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v383), "w"(v389)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v383), "w"(v389)); - svfloat32_t v396; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v385), "w"(v391)); - svfloat32_t v397; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v385), "w"(v391)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v387), "w"(v393)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v387), "w"(v393)); - svst1w_u64(pred_full, (unsigned *)(v790), svreinterpret_u64_s16(v450)); - svst1w_u64(pred_full, (unsigned *)(v799), svreinterpret_u64_s16(v458)); - svst1w_u64(pred_full, (unsigned *)(v844), svreinterpret_u64_s16(v498)); - svst1w_u64(pred_full, (unsigned *)(v853), svreinterpret_u64_s16(v506)); - svint16_t v418 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v285, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v426 = 
svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v395, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v434 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v286, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v442 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v396, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v466 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v289, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v474 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v399, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v482 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v288, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v490 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v398, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v514 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v287, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v522 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v397, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v530 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v284, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v538 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v394, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v754), svreinterpret_u64_s16(v418)); - svst1w_u64(pred_full, (unsigned *)(v763), svreinterpret_u64_s16(v426)); - svst1w_u64(pred_full, (unsigned *)(v772), svreinterpret_u64_s16(v434)); - svst1w_u64(pred_full, (unsigned *)(v781), svreinterpret_u64_s16(v442)); - svst1w_u64(pred_full, (unsigned *)(v808), svreinterpret_u64_s16(v466)); - svst1w_u64(pred_full, (unsigned *)(v817), svreinterpret_u64_s16(v474)); - svst1w_u64(pred_full, (unsigned *)(v826), svreinterpret_u64_s16(v482)); - svst1w_u64(pred_full, (unsigned *)(v835), svreinterpret_u64_s16(v490)); - svst1w_u64(pred_full, (unsigned *)(v862), svreinterpret_u64_s16(v514)); - svst1w_u64(pred_full, (unsigned *)(v871), svreinterpret_u64_s16(v522)); - svst1w_u64(pred_full, (unsigned *)(v880), svreinterpret_u64_s16(v530)); - svst1w_u64(pred_full, (unsigned *)(v889), svreinterpret_u64_s16(v538)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v763 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v262 = -1.0555555555555556e+00F; - float v267 = 1.7752228513927079e-01F; - float v272 = -1.2820077502191529e-01F; - float v277 = 4.9321510117355499e-02F; 
- float v282 = 5.7611011491005903e-01F; - float v287 = -7.4996449655536279e-01F; - float v292 = -1.7385438164530381e-01F; - float v297 = -2.1729997561977314e+00F; - float v302 = -1.7021211726914738e+00F; - float v307 = 4.7087858350625778e-01F; - float v312 = -2.0239400846888440e+00F; - float v317 = 1.0551641201664090e-01F; - float v322 = 2.1294564967054850e+00F; - float v327 = -7.5087543897371167e-01F; - float v332 = 1.4812817695157160e-01F; - float v337 = 8.9900361592528333e-01F; - float v342 = -6.2148246772602778e-01F; - float v347 = -7.9869352098712687e-01F; - float v352 = -4.7339199623771833e-01F; - float v356 = -2.4216105241892630e-01F; - float v357 = 2.4216105241892630e-01F; - float v364 = -5.9368607967505101e-02F; - float v365 = 5.9368607967505101e-02F; - float v372 = 1.2578688255176201e-02F; - float v373 = -1.2578688255176201e-02F; - float v380 = -4.6789919712328903e-02F; - float v381 = 4.6789919712328903e-02F; - float v388 = -9.3750121913782358e-01F; - float v389 = 9.3750121913782358e-01F; - float v396 = -5.0111537043352902e-02F; - float v397 = 5.0111537043352902e-02F; - float v404 = -9.8761275618117661e-01F; - float v405 = 9.8761275618117661e-01F; - float v412 = -1.1745786501205959e+00F; - float v413 = 1.1745786501205959e+00F; - float v420 = 1.1114482296234993e+00F; - float v421 = -1.1114482296234993e+00F; - float v428 = 2.2860268797440955e+00F; - float v429 = -2.2860268797440955e+00F; - float v436 = 2.6420523257930939e-01F; - float v437 = -2.6420523257930939e-01F; - float v444 = 2.1981792779352136e+00F; - float v445 = -2.1981792779352136e+00F; - float v452 = 1.9339740453559042e+00F; - float v453 = -1.9339740453559042e+00F; - float v460 = -7.4825847091254893e-01F; - float v461 = 7.4825847091254893e-01F; - float v468 = -4.7820835642768872e-01F; - float v469 = 4.7820835642768872e-01F; - float v476 = 2.7005011448486022e-01F; - float v477 = -2.7005011448486022e-01F; - float v484 = -3.4642356159542270e-01F; - float v485 = 3.4642356159542270e-01F; - float v492 
= -8.3485429360688279e-01F; - float v493 = 8.3485429360688279e-01F; - float v500 = -3.9375928506743518e-01F; - float v501 = 3.9375928506743518e-01F; - float32x2_t v503 = (float32x2_t){v4, v4}; - const int32_t *v1408 = &v5[istride]; - int32_t *v1590 = &v6[ostride]; - float32x2_t v263 = (float32x2_t){v262, v262}; - float32x2_t v268 = (float32x2_t){v267, v267}; - float32x2_t v273 = (float32x2_t){v272, v272}; - float32x2_t v278 = (float32x2_t){v277, v277}; - float32x2_t v283 = (float32x2_t){v282, v282}; - float32x2_t v288 = (float32x2_t){v287, v287}; - float32x2_t v293 = (float32x2_t){v292, v292}; - float32x2_t v298 = (float32x2_t){v297, v297}; - float32x2_t v303 = (float32x2_t){v302, v302}; - float32x2_t v308 = (float32x2_t){v307, v307}; - float32x2_t v313 = (float32x2_t){v312, v312}; - float32x2_t v318 = (float32x2_t){v317, v317}; - float32x2_t v323 = (float32x2_t){v322, v322}; - float32x2_t v328 = (float32x2_t){v327, v327}; - float32x2_t v333 = (float32x2_t){v332, v332}; - float32x2_t v338 = (float32x2_t){v337, v337}; - float32x2_t v343 = (float32x2_t){v342, v342}; - float32x2_t v348 = (float32x2_t){v347, v347}; - float32x2_t v353 = (float32x2_t){v352, v352}; - float32x2_t v358 = (float32x2_t){v356, v357}; - float32x2_t v366 = (float32x2_t){v364, v365}; - float32x2_t v374 = (float32x2_t){v372, v373}; - float32x2_t v382 = (float32x2_t){v380, v381}; - float32x2_t v390 = (float32x2_t){v388, v389}; - float32x2_t v398 = (float32x2_t){v396, v397}; - float32x2_t v406 = (float32x2_t){v404, v405}; - float32x2_t v414 = (float32x2_t){v412, v413}; - float32x2_t v422 = (float32x2_t){v420, v421}; - float32x2_t v430 = (float32x2_t){v428, v429}; - float32x2_t v438 = (float32x2_t){v436, v437}; - float32x2_t v446 = (float32x2_t){v444, v445}; - float32x2_t v454 = (float32x2_t){v452, v453}; - float32x2_t v462 = (float32x2_t){v460, v461}; - float32x2_t v470 = (float32x2_t){v468, v469}; - float32x2_t v478 = (float32x2_t){v476, v477}; - float32x2_t v486 = (float32x2_t){v484, v485}; - 
float32x2_t v494 = (float32x2_t){v492, v493}; - float32x2_t v502 = (float32x2_t){v500, v501}; - const int32_t *v1571 = &v5[0]; - int32_t *v1581 = &v6[0]; - int16x4_t v1747 = vld1_s16((const int16_t *)v1408); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1747), 15); - float32x4_t v264 = vcombine_f32(v263, v263); - float32x4_t v269 = vcombine_f32(v268, v268); - float32x4_t v274 = vcombine_f32(v273, v273); - float32x4_t v279 = vcombine_f32(v278, v278); - float32x4_t v284 = vcombine_f32(v283, v283); - float32x4_t v289 = vcombine_f32(v288, v288); - float32x4_t v294 = vcombine_f32(v293, v293); - float32x4_t v299 = vcombine_f32(v298, v298); - float32x4_t v304 = vcombine_f32(v303, v303); - float32x4_t v309 = vcombine_f32(v308, v308); - float32x4_t v314 = vcombine_f32(v313, v313); - float32x4_t v319 = vcombine_f32(v318, v318); - float32x4_t v324 = vcombine_f32(v323, v323); - float32x4_t v329 = vcombine_f32(v328, v328); - float32x4_t v334 = vcombine_f32(v333, v333); - float32x4_t v339 = vcombine_f32(v338, v338); - float32x4_t v344 = vcombine_f32(v343, v343); - float32x4_t v349 = vcombine_f32(v348, v348); - float32x4_t v354 = vcombine_f32(v353, v353); - float32x2_t v360 = vmul_f32(v503, v358); - float32x2_t v368 = vmul_f32(v503, v366); - float32x2_t v376 = vmul_f32(v503, v374); - float32x2_t v384 = vmul_f32(v503, v382); - float32x2_t v392 = vmul_f32(v503, v390); - float32x2_t v400 = vmul_f32(v503, v398); - float32x2_t v408 = vmul_f32(v503, v406); - float32x2_t v416 = vmul_f32(v503, v414); - float32x2_t v424 = vmul_f32(v503, v422); - float32x2_t v432 = vmul_f32(v503, v430); - float32x2_t v440 = vmul_f32(v503, v438); - float32x2_t v448 = vmul_f32(v503, v446); - float32x2_t v456 = vmul_f32(v503, v454); - float32x2_t v464 = vmul_f32(v503, v462); - float32x2_t v472 = vmul_f32(v503, v470); - float32x2_t v480 = vmul_f32(v503, v478); - float32x2_t v488 = vmul_f32(v503, v486); - float32x2_t v496 = vmul_f32(v503, v494); - float32x2_t v504 = vmul_f32(v503, v502); - const int32_t *v1417 
= &v5[istride * 18]; - const int32_t *v1426 = &v5[istride * 2]; - const int32_t *v1435 = &v5[istride * 17]; - const int32_t *v1444 = &v5[istride * 4]; - const int32_t *v1453 = &v5[istride * 15]; - const int32_t *v1462 = &v5[istride * 8]; - const int32_t *v1471 = &v5[istride * 11]; - const int32_t *v1480 = &v5[istride * 16]; - const int32_t *v1489 = &v5[istride * 3]; - const int32_t *v1498 = &v5[istride * 13]; - const int32_t *v1507 = &v5[istride * 6]; - const int32_t *v1516 = &v5[istride * 7]; - const int32_t *v1525 = &v5[istride * 12]; - const int32_t *v1534 = &v5[istride * 14]; - const int32_t *v1543 = &v5[istride * 5]; - const int32_t *v1552 = &v5[istride * 9]; - const int32_t *v1561 = &v5[istride * 10]; - int32_t *v1599 = &v6[ostride * 18]; - int32_t *v1608 = &v6[ostride * 2]; - int32_t *v1617 = &v6[ostride * 17]; - int32_t *v1626 = &v6[ostride * 3]; - int32_t *v1635 = &v6[ostride * 16]; - int32_t *v1644 = &v6[ostride * 4]; - int32_t *v1653 = &v6[ostride * 15]; - int32_t *v1662 = &v6[ostride * 5]; - int32_t *v1671 = &v6[ostride * 14]; - int32_t *v1680 = &v6[ostride * 6]; - int32_t *v1689 = &v6[ostride * 13]; - int32_t *v1698 = &v6[ostride * 7]; - int32_t *v1707 = &v6[ostride * 12]; - int32_t *v1716 = &v6[ostride * 8]; - int32_t *v1725 = &v6[ostride * 11]; - int32_t *v1734 = &v6[ostride * 9]; - int32_t *v1743 = &v6[ostride * 10]; - int16x4_t v1783 = vld1_s16((const int16_t *)v1571); - float32x4_t v206 = vcvtq_n_f32_s32(vmovl_s16(v1783), 15); - float32x4_t v362 = vcombine_f32(v360, v360); - float32x4_t v370 = vcombine_f32(v368, v368); - float32x4_t v378 = vcombine_f32(v376, v376); - float32x4_t v386 = vcombine_f32(v384, v384); - float32x4_t v394 = vcombine_f32(v392, v392); - float32x4_t v402 = vcombine_f32(v400, v400); - float32x4_t v410 = vcombine_f32(v408, v408); - float32x4_t v418 = vcombine_f32(v416, v416); - float32x4_t v426 = vcombine_f32(v424, v424); - float32x4_t v434 = vcombine_f32(v432, v432); - float32x4_t v442 = vcombine_f32(v440, v440); - float32x4_t 
v450 = vcombine_f32(v448, v448); - float32x4_t v458 = vcombine_f32(v456, v456); - float32x4_t v466 = vcombine_f32(v464, v464); - float32x4_t v474 = vcombine_f32(v472, v472); - float32x4_t v482 = vcombine_f32(v480, v480); - float32x4_t v490 = vcombine_f32(v488, v488); - float32x4_t v498 = vcombine_f32(v496, v496); - float32x4_t v506 = vcombine_f32(v504, v504); - int16x4_t v1749 = vld1_s16((const int16_t *)v1417); - int16x4_t v1751 = vld1_s16((const int16_t *)v1426); - int16x4_t v1753 = vld1_s16((const int16_t *)v1435); - int16x4_t v1755 = vld1_s16((const int16_t *)v1444); - int16x4_t v1757 = vld1_s16((const int16_t *)v1453); - int16x4_t v1759 = vld1_s16((const int16_t *)v1462); - int16x4_t v1761 = vld1_s16((const int16_t *)v1471); - int16x4_t v1763 = vld1_s16((const int16_t *)v1480); - int16x4_t v1765 = vld1_s16((const int16_t *)v1489); - int16x4_t v1767 = vld1_s16((const int16_t *)v1498); - int16x4_t v1769 = vld1_s16((const int16_t *)v1507); - int16x4_t v1771 = vld1_s16((const int16_t *)v1516); - int16x4_t v1773 = vld1_s16((const int16_t *)v1525); - int16x4_t v1775 = vld1_s16((const int16_t *)v1534); - int16x4_t v1777 = vld1_s16((const int16_t *)v1543); - int16x4_t v1779 = vld1_s16((const int16_t *)v1552); - int16x4_t v1781 = vld1_s16((const int16_t *)v1561); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1749), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1751), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1753), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v1755), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v1757), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1759), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1761), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1763), 15); - float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1765), 15); - float32x4_t v118 = vcvtq_n_f32_s32(vmovl_s16(v1767), 15); - float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1769), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1771), 15); - 
float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1773), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1775), 15); - float32x4_t v162 = vcvtq_n_f32_s32(vmovl_s16(v1777), 15); - float32x4_t v172 = vcvtq_n_f32_s32(vmovl_s16(v1779), 15); - float32x4_t v180 = vcvtq_n_f32_s32(vmovl_s16(v1781), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v54, v46); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v90, v82); - float32x4_t v109 = vaddq_f32(v100, v108); - float32x4_t v110 = vsubq_f32(v100, v108); - float32x4_t v127 = vaddq_f32(v118, v126); - float32x4_t v128 = vsubq_f32(v126, v118); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v163 = vaddq_f32(v154, v162); - float32x4_t v164 = vsubq_f32(v162, v154); - float32x4_t v181 = vaddq_f32(v172, v180); - float32x4_t v182 = vsubq_f32(v172, v180); - float32x4_t v183 = vsubq_f32(v37, v145); - float32x4_t v184 = vsubq_f32(v55, v163); - float32x4_t v185 = vsubq_f32(v73, v181); - float32x4_t v186 = vsubq_f32(v91, v145); - float32x4_t v187 = vsubq_f32(v109, v163); - float32x4_t v188 = vsubq_f32(v127, v181); - float32x4_t v189 = vaddq_f32(v37, v91); - float32x4_t v191 = vaddq_f32(v55, v109); - float32x4_t v193 = vaddq_f32(v73, v127); - float32x4_t v224 = vsubq_f32(v38, v146); - float32x4_t v225 = vsubq_f32(v56, v164); - float32x4_t v226 = vsubq_f32(v74, v182); - float32x4_t v227 = vsubq_f32(v92, v146); - float32x4_t v228 = vsubq_f32(v110, v164); - float32x4_t v229 = vsubq_f32(v128, v182); - float32x4_t v230 = vaddq_f32(v38, v92); - float32x4_t v232 = vaddq_f32(v56, v110); - float32x4_t v234 = vaddq_f32(v74, v128); - float32x4_t v190 = vaddq_f32(v189, v145); - float32x4_t v192 = vaddq_f32(v191, v163); - float32x4_t v194 = vaddq_f32(v193, v181); - float32x4_t v195 = 
vaddq_f32(v183, v185); - float32x4_t v196 = vaddq_f32(v186, v188); - float32x4_t v214 = vsubq_f32(v183, v186); - float32x4_t v215 = vsubq_f32(v185, v188); - float32x4_t v231 = vaddq_f32(v230, v146); - float32x4_t v233 = vaddq_f32(v232, v164); - float32x4_t v235 = vaddq_f32(v234, v182); - float32x4_t v236 = vaddq_f32(v224, v226); - float32x4_t v237 = vaddq_f32(v227, v229); - float32x4_t v246 = vsubq_f32(v224, v227); - float32x4_t v247 = vsubq_f32(v226, v229); - float32x4_t v300 = vmulq_f32(v186, v299); - float32x4_t v315 = vmulq_f32(v188, v314); - float32x4_t v325 = vmulq_f32(v185, v324); - float32x4_t v417 = vrev64q_f32(v227); - float32x4_t v433 = vrev64q_f32(v224); - float32x4_t v441 = vrev64q_f32(v229); - float32x4_t v457 = vrev64q_f32(v226); - float32x4_t v197 = vaddq_f32(v190, v192); - float32x4_t v208 = vaddq_f32(v196, v187); - float32x4_t v209 = vaddq_f32(v195, v184); - float32x4_t v211 = vsubq_f32(v196, v187); - float32x4_t v212 = vsubq_f32(v195, v184); - float32x4_t v216 = vsubq_f32(v183, v215); - float32x4_t v218 = vaddq_f32(v214, v188); - float32x4_t v221 = vsubq_f32(v190, v194); - float32x4_t v222 = vsubq_f32(v192, v194); - float32x4_t v238 = vaddq_f32(v231, v233); - float32x4_t v240 = vaddq_f32(v237, v228); - float32x4_t v241 = vaddq_f32(v236, v225); - float32x4_t v243 = vsubq_f32(v237, v228); - float32x4_t v244 = vsubq_f32(v236, v225); - float32x4_t v248 = vsubq_f32(v224, v247); - float32x4_t v250 = vaddq_f32(v246, v229); - float32x4_t v253 = vsubq_f32(v231, v235); - float32x4_t v254 = vsubq_f32(v233, v235); - float32x4_t v305 = vmulq_f32(v214, v304); - float32x4_t v320 = vmulq_f32(v215, v319); - float32x4_t v419 = vmulq_f32(v417, v418); - float32x4_t v425 = vrev64q_f32(v246); - float32x4_t v443 = vmulq_f32(v441, v442); - float32x4_t v449 = vrev64q_f32(v247); - float32x4_t v459 = vmulq_f32(v457, v458); - float32x4_t v198 = vaddq_f32(v197, v194); - float32x4_t v210 = vsubq_f32(v209, v208); - float32x4_t v213 = vsubq_f32(v212, v211); - float32x4_t v217 = 
vsubq_f32(v216, v187); - float32x4_t v219 = vsubq_f32(v218, v184); - float32x4_t v223 = vaddq_f32(v221, v222); - float32x4_t v239 = vaddq_f32(v238, v235); - float32x4_t v242 = vsubq_f32(v241, v240); - float32x4_t v245 = vsubq_f32(v244, v243); - float32x4_t v249 = vsubq_f32(v248, v228); - float32x4_t v251 = vsubq_f32(v250, v225); - float32x4_t v255 = vaddq_f32(v253, v254); - float32x4_t v270 = vmulq_f32(v208, v269); - float32x4_t v275 = vmulq_f32(v209, v274); - float32x4_t v285 = vmulq_f32(v211, v284); - float32x4_t v290 = vmulq_f32(v212, v289); - float32x4_t v345 = vmulq_f32(v221, v344); - float32x4_t v350 = vmulq_f32(v222, v349); - float32x4_t v369 = vrev64q_f32(v240); - float32x4_t v377 = vrev64q_f32(v241); - float32x4_t v393 = vrev64q_f32(v243); - float32x4_t v401 = vrev64q_f32(v244); - float32x4_t v427 = vmulq_f32(v425, v426); - float32x4_t v451 = vmulq_f32(v449, v450); - float32x4_t v489 = vrev64q_f32(v253); - float32x4_t v497 = vrev64q_f32(v254); - float32x4_t v207 = vaddq_f32(v206, v198); - float32x4_t v220 = vsubq_f32(v217, v219); - float32x4_t v252 = vsubq_f32(v249, v251); - float32x4_t v265 = vmulq_f32(v198, v264); - float32x4_t v280 = vmulq_f32(v210, v279); - float32x4_t v295 = vmulq_f32(v213, v294); - float32x4_t v330 = vmulq_f32(v217, v329); - float32x4_t v335 = vmulq_f32(v219, v334); - float32x4_t v355 = vmulq_f32(v223, v354); - float32x4_t v361 = vrev64q_f32(v239); - float32x4_t v371 = vmulq_f32(v369, v370); - float32x4_t v379 = vmulq_f32(v377, v378); - float32x4_t v385 = vrev64q_f32(v242); - float32x4_t v395 = vmulq_f32(v393, v394); - float32x4_t v403 = vmulq_f32(v401, v402); - float32x4_t v409 = vrev64q_f32(v245); - float32x4_t v465 = vrev64q_f32(v249); - float32x4_t v473 = vrev64q_f32(v251); - float32x4_t v491 = vmulq_f32(v489, v490); - float32x4_t v499 = vmulq_f32(v497, v498); - float32x4_t v505 = vrev64q_f32(v255); - float32x4_t v508 = vaddq_f32(v270, v275); - float32x4_t v509 = vaddq_f32(v285, v290); - float32x4_t v340 = vmulq_f32(v220, v339); 
- float32x4_t v363 = vmulq_f32(v361, v362); - float32x4_t v387 = vmulq_f32(v385, v386); - float32x4_t v411 = vmulq_f32(v409, v410); - float32x4_t v467 = vmulq_f32(v465, v466); - float32x4_t v475 = vmulq_f32(v473, v474); - float32x4_t v481 = vrev64q_f32(v252); - float32x4_t v507 = vmulq_f32(v505, v506); - float32x4_t v511 = vaddq_f32(v508, v509); - float32x4_t v512 = vaddq_f32(v270, v280); - float32x4_t v513 = vaddq_f32(v285, v295); - float32x4_t v530 = vsubq_f32(v508, v509); - float32x4_t v532 = vsubq_f32(v345, v355); - float32x4_t v533 = vsubq_f32(v350, v355); - float32x4_t v534 = vaddq_f32(v265, v207); - float32x4_t v539 = vaddq_f32(v371, v379); - float32x4_t v540 = vaddq_f32(v395, v403); - int16x4_t v595 = vqmovn_s32(vcvtq_n_s32_f32(v207, 15)); - float32x4_t v483 = vmulq_f32(v481, v482); - float32x4_t v510 = vaddq_f32(v335, v340); - float32x4_t v514 = vaddq_f32(v330, v340); - float32x4_t v515 = vsubq_f32(v300, v511); - float32x4_t v516 = vaddq_f32(v512, v513); - float32x4_t v522 = vsubq_f32(v512, v513); - float32x4_t v527 = vaddq_f32(v511, v325); - float32x4_t v535 = vaddq_f32(v534, v532); - float32x4_t v536 = vsubq_f32(v534, v532); - float32x4_t v538 = vaddq_f32(v534, v533); - float32x4_t v542 = vaddq_f32(v539, v540); - float32x4_t v543 = vaddq_f32(v371, v387); - float32x4_t v544 = vaddq_f32(v395, v411); - float32x4_t v561 = vsubq_f32(v539, v540); - float32x4_t v563 = vsubq_f32(v491, v507); - float32x4_t v564 = vsubq_f32(v499, v507); - vst1_s16((int16_t *)v1581, v595); - float32x4_t v517 = vsubq_f32(v315, v514); - float32x4_t v518 = vaddq_f32(v305, v510); - float32x4_t v520 = vaddq_f32(v516, v320); - float32x4_t v523 = vaddq_f32(v522, v510); - float32x4_t v524 = vaddq_f32(v515, v516); - float32x4_t v531 = vaddq_f32(v530, v514); - float32x4_t v537 = vsubq_f32(v536, v533); - float32x4_t v541 = vaddq_f32(v475, v483); - float32x4_t v545 = vaddq_f32(v467, v483); - float32x4_t v546 = vsubq_f32(v419, v542); - float32x4_t v547 = vaddq_f32(v543, v544); - float32x4_t 
v553 = vsubq_f32(v543, v544); - float32x4_t v558 = vaddq_f32(v542, v459); - float32x4_t v565 = vaddq_f32(v363, v563); - float32x4_t v566 = vsubq_f32(v363, v563); - float32x4_t v568 = vaddq_f32(v363, v564); - float32x4_t v519 = vaddq_f32(v518, v515); - float32x4_t v521 = vaddq_f32(v520, v517); - float32x4_t v525 = vfmaq_f32(v524, v183, v309); - float32x4_t v528 = vaddq_f32(v527, v517); - float32x4_t v548 = vsubq_f32(v443, v545); - float32x4_t v549 = vaddq_f32(v427, v541); - float32x4_t v551 = vaddq_f32(v547, v451); - float32x4_t v554 = vaddq_f32(v553, v541); - float32x4_t v555 = vaddq_f32(v546, v547); - float32x4_t v562 = vaddq_f32(v561, v545); - float32x4_t v567 = vsubq_f32(v566, v564); - float32x4_t v573 = vsubq_f32(v531, v523); - float32x4_t v577 = vsubq_f32(v538, v531); - float32x4_t v580 = vaddq_f32(v523, v538); - float32x4_t v526 = vaddq_f32(v525, v514); - float32x4_t v529 = vaddq_f32(v528, v510); - float32x4_t v550 = vaddq_f32(v549, v546); - float32x4_t v552 = vaddq_f32(v551, v548); - float32x4_t v556 = vfmaq_f32(v555, v433, v434); - float32x4_t v559 = vaddq_f32(v558, v548); - float32x4_t v574 = vaddq_f32(v573, v538); - float32x4_t v578 = vaddq_f32(v519, v535); - float32x4_t v579 = vaddq_f32(v521, v537); - float32x4_t v585 = vsubq_f32(v562, v554); - float32x4_t v589 = vsubq_f32(v562, v568); - float32x4_t v592 = vaddq_f32(v554, v568); - float32x4_t v557 = vaddq_f32(v556, v545); - float32x4_t v560 = vaddq_f32(v559, v541); - float32x4_t v569 = vsubq_f32(v526, v519); - float32x4_t v571 = vsubq_f32(v529, v521); - float32x4_t v575 = vsubq_f32(v535, v526); - float32x4_t v576 = vsubq_f32(v537, v529); - float32x4_t v586 = vaddq_f32(v585, v568); - float32x4_t v590 = vaddq_f32(v550, v565); - float32x4_t v591 = vaddq_f32(v552, v567); - float32x4_t v619 = vsubq_f32(v580, v592); - float32x4_t v628 = vaddq_f32(v580, v592); - float32x4_t v637 = vaddq_f32(v577, v589); - float32x4_t v646 = vsubq_f32(v577, v589); - float32x4_t v570 = vaddq_f32(v569, v535); - float32x4_t v572 = 
vaddq_f32(v571, v537); - float32x4_t v581 = vsubq_f32(v557, v550); - float32x4_t v583 = vsubq_f32(v560, v552); - float32x4_t v587 = vsubq_f32(v565, v557); - float32x4_t v588 = vsubq_f32(v567, v560); - int16x4_t v622 = vqmovn_s32(vcvtq_n_s32_f32(v619, 15)); - int16x4_t v631 = vqmovn_s32(vcvtq_n_s32_f32(v628, 15)); - int16x4_t v640 = vqmovn_s32(vcvtq_n_s32_f32(v637, 15)); - int16x4_t v649 = vqmovn_s32(vcvtq_n_s32_f32(v646, 15)); - float32x4_t v655 = vaddq_f32(v579, v591); - float32x4_t v664 = vsubq_f32(v579, v591); - float32x4_t v673 = vaddq_f32(v574, v586); - float32x4_t v682 = vsubq_f32(v574, v586); - float32x4_t v727 = vsubq_f32(v578, v590); - float32x4_t v736 = vaddq_f32(v578, v590); - float32x4_t v582 = vaddq_f32(v581, v565); - float32x4_t v584 = vaddq_f32(v583, v567); - int16x4_t v658 = vqmovn_s32(vcvtq_n_s32_f32(v655, 15)); - int16x4_t v667 = vqmovn_s32(vcvtq_n_s32_f32(v664, 15)); - int16x4_t v676 = vqmovn_s32(vcvtq_n_s32_f32(v673, 15)); - int16x4_t v685 = vqmovn_s32(vcvtq_n_s32_f32(v682, 15)); - float32x4_t v691 = vaddq_f32(v576, v588); - float32x4_t v700 = vsubq_f32(v576, v588); - float32x4_t v709 = vaddq_f32(v575, v587); - float32x4_t v718 = vsubq_f32(v575, v587); - int16x4_t v730 = vqmovn_s32(vcvtq_n_s32_f32(v727, 15)); - int16x4_t v739 = vqmovn_s32(vcvtq_n_s32_f32(v736, 15)); - vst1_s16((int16_t *)v1608, v622); - vst1_s16((int16_t *)v1617, v631); - vst1_s16((int16_t *)v1626, v640); - vst1_s16((int16_t *)v1635, v649); - float32x4_t v601 = vaddq_f32(v570, v582); - float32x4_t v610 = vsubq_f32(v570, v582); - int16x4_t v694 = vqmovn_s32(vcvtq_n_s32_f32(v691, 15)); - int16x4_t v703 = vqmovn_s32(vcvtq_n_s32_f32(v700, 15)); - int16x4_t v712 = vqmovn_s32(vcvtq_n_s32_f32(v709, 15)); - int16x4_t v721 = vqmovn_s32(vcvtq_n_s32_f32(v718, 15)); - float32x4_t v745 = vaddq_f32(v572, v584); - float32x4_t v754 = vsubq_f32(v572, v584); - vst1_s16((int16_t *)v1644, v658); - vst1_s16((int16_t *)v1653, v667); - vst1_s16((int16_t *)v1662, v676); - vst1_s16((int16_t *)v1671, 
v685); - vst1_s16((int16_t *)v1716, v730); - vst1_s16((int16_t *)v1725, v739); - int16x4_t v604 = vqmovn_s32(vcvtq_n_s32_f32(v601, 15)); - int16x4_t v613 = vqmovn_s32(vcvtq_n_s32_f32(v610, 15)); - int16x4_t v748 = vqmovn_s32(vcvtq_n_s32_f32(v745, 15)); - int16x4_t v757 = vqmovn_s32(vcvtq_n_s32_f32(v754, 15)); - vst1_s16((int16_t *)v1680, v694); - vst1_s16((int16_t *)v1689, v703); - vst1_s16((int16_t *)v1698, v712); - vst1_s16((int16_t *)v1707, v721); - vst1_s16((int16_t *)v1590, v604); - vst1_s16((int16_t *)v1599, v613); - vst1_s16((int16_t *)v1734, v748); - vst1_s16((int16_t *)v1743, v757); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v763 * 2; j < howmany; j += 1) { - int16x4_t v775 = vld1s_s16(&v5[istride]); - float v973 = -1.0555555555555556e+00F; - float v977 = 1.7752228513927079e-01F; - float v981 = -1.2820077502191529e-01F; - float v985 = 4.9321510117355499e-02F; - float v989 = 5.7611011491005903e-01F; - float v993 = -7.4996449655536279e-01F; - float v997 = -1.7385438164530381e-01F; - float v1001 = -2.1729997561977314e+00F; - float v1005 = -1.7021211726914738e+00F; - float v1009 = 4.7087858350625778e-01F; - float v1013 = -2.0239400846888440e+00F; - float v1017 = 1.0551641201664090e-01F; - float v1021 = 2.1294564967054850e+00F; - float v1025 = -7.5087543897371167e-01F; - float v1029 = 1.4812817695157160e-01F; - float v1033 = 8.9900361592528333e-01F; - float v1037 = -6.2148246772602778e-01F; - float v1041 = -7.9869352098712687e-01F; - float v1045 = -4.7339199623771833e-01F; - float v1048 = -2.4216105241892630e-01F; - float v1049 = 2.4216105241892630e-01F; - float v1055 = -5.9368607967505101e-02F; - float v1056 = 5.9368607967505101e-02F; - float v1062 = 1.2578688255176201e-02F; - float v1063 = -1.2578688255176201e-02F; - float v1069 = -4.6789919712328903e-02F; - float v1070 = 4.6789919712328903e-02F; - float v1076 = -9.3750121913782358e-01F; - float v1077 = 9.3750121913782358e-01F; - float v1083 = -5.0111537043352902e-02F; - float v1084 = 
5.0111537043352902e-02F; - float v1090 = -9.8761275618117661e-01F; - float v1091 = 9.8761275618117661e-01F; - float v1097 = -1.1745786501205959e+00F; - float v1098 = 1.1745786501205959e+00F; - float v1104 = 1.1114482296234993e+00F; - float v1105 = -1.1114482296234993e+00F; - float v1111 = 2.2860268797440955e+00F; - float v1112 = -2.2860268797440955e+00F; - float v1118 = 2.6420523257930939e-01F; - float v1119 = -2.6420523257930939e-01F; - float v1125 = 2.1981792779352136e+00F; - float v1126 = -2.1981792779352136e+00F; - float v1132 = 1.9339740453559042e+00F; - float v1133 = -1.9339740453559042e+00F; - float v1139 = -7.4825847091254893e-01F; - float v1140 = 7.4825847091254893e-01F; - float v1146 = -4.7820835642768872e-01F; - float v1147 = 4.7820835642768872e-01F; - float v1153 = 2.7005011448486022e-01F; - float v1154 = -2.7005011448486022e-01F; - float v1160 = -3.4642356159542270e-01F; - float v1161 = 3.4642356159542270e-01F; - float v1167 = -8.3485429360688279e-01F; - float v1168 = 8.3485429360688279e-01F; - float v1174 = -3.9375928506743518e-01F; - float v1175 = 3.9375928506743518e-01F; - float32x2_t v1177 = (float32x2_t){v4, v4}; - float32x2_t v776 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v775)), 15); - int16x4_t v917 = vld1s_s16(&v5[0]); - float32x2_t v974 = (float32x2_t){v973, v973}; - float32x2_t v978 = (float32x2_t){v977, v977}; - float32x2_t v982 = (float32x2_t){v981, v981}; - float32x2_t v986 = (float32x2_t){v985, v985}; - float32x2_t v990 = (float32x2_t){v989, v989}; - float32x2_t v994 = (float32x2_t){v993, v993}; - float32x2_t v998 = (float32x2_t){v997, v997}; - float32x2_t v1002 = (float32x2_t){v1001, v1001}; - float32x2_t v1006 = (float32x2_t){v1005, v1005}; - float32x2_t v1010 = (float32x2_t){v1009, v1009}; - float32x2_t v1014 = (float32x2_t){v1013, v1013}; - float32x2_t v1018 = (float32x2_t){v1017, v1017}; - float32x2_t v1022 = (float32x2_t){v1021, v1021}; - float32x2_t v1026 = (float32x2_t){v1025, v1025}; - float32x2_t v1030 = (float32x2_t){v1029, 
v1029}; - float32x2_t v1034 = (float32x2_t){v1033, v1033}; - float32x2_t v1038 = (float32x2_t){v1037, v1037}; - float32x2_t v1042 = (float32x2_t){v1041, v1041}; - float32x2_t v1046 = (float32x2_t){v1045, v1045}; - float32x2_t v1050 = (float32x2_t){v1048, v1049}; - float32x2_t v1057 = (float32x2_t){v1055, v1056}; - float32x2_t v1064 = (float32x2_t){v1062, v1063}; - float32x2_t v1071 = (float32x2_t){v1069, v1070}; - float32x2_t v1078 = (float32x2_t){v1076, v1077}; - float32x2_t v1085 = (float32x2_t){v1083, v1084}; - float32x2_t v1092 = (float32x2_t){v1090, v1091}; - float32x2_t v1099 = (float32x2_t){v1097, v1098}; - float32x2_t v1106 = (float32x2_t){v1104, v1105}; - float32x2_t v1113 = (float32x2_t){v1111, v1112}; - float32x2_t v1120 = (float32x2_t){v1118, v1119}; - float32x2_t v1127 = (float32x2_t){v1125, v1126}; - float32x2_t v1134 = (float32x2_t){v1132, v1133}; - float32x2_t v1141 = (float32x2_t){v1139, v1140}; - float32x2_t v1148 = (float32x2_t){v1146, v1147}; - float32x2_t v1155 = (float32x2_t){v1153, v1154}; - float32x2_t v1162 = (float32x2_t){v1160, v1161}; - float32x2_t v1169 = (float32x2_t){v1167, v1168}; - float32x2_t v1176 = (float32x2_t){v1174, v1175}; - int16x4_t v781 = vld1s_s16(&v5[istride * 18]); - int16x4_t v789 = vld1s_s16(&v5[istride * 2]); - int16x4_t v795 = vld1s_s16(&v5[istride * 17]); - int16x4_t v803 = vld1s_s16(&v5[istride * 4]); - int16x4_t v809 = vld1s_s16(&v5[istride * 15]); - int16x4_t v817 = vld1s_s16(&v5[istride * 8]); - int16x4_t v823 = vld1s_s16(&v5[istride * 11]); - int16x4_t v831 = vld1s_s16(&v5[istride * 16]); - int16x4_t v837 = vld1s_s16(&v5[istride * 3]); - int16x4_t v845 = vld1s_s16(&v5[istride * 13]); - int16x4_t v851 = vld1s_s16(&v5[istride * 6]); - int16x4_t v859 = vld1s_s16(&v5[istride * 7]); - int16x4_t v865 = vld1s_s16(&v5[istride * 12]); - int16x4_t v873 = vld1s_s16(&v5[istride * 14]); - int16x4_t v879 = vld1s_s16(&v5[istride * 5]); - int16x4_t v887 = vld1s_s16(&v5[istride * 9]); - int16x4_t v893 = vld1s_s16(&v5[istride * 
10]); - float32x2_t v918 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v917)), 15); - float32x2_t v1052 = vmul_f32(v1177, v1050); - float32x2_t v1059 = vmul_f32(v1177, v1057); - float32x2_t v1066 = vmul_f32(v1177, v1064); - float32x2_t v1073 = vmul_f32(v1177, v1071); - float32x2_t v1080 = vmul_f32(v1177, v1078); - float32x2_t v1087 = vmul_f32(v1177, v1085); - float32x2_t v1094 = vmul_f32(v1177, v1092); - float32x2_t v1101 = vmul_f32(v1177, v1099); - float32x2_t v1108 = vmul_f32(v1177, v1106); - float32x2_t v1115 = vmul_f32(v1177, v1113); - float32x2_t v1122 = vmul_f32(v1177, v1120); - float32x2_t v1129 = vmul_f32(v1177, v1127); - float32x2_t v1136 = vmul_f32(v1177, v1134); - float32x2_t v1143 = vmul_f32(v1177, v1141); - float32x2_t v1150 = vmul_f32(v1177, v1148); - float32x2_t v1157 = vmul_f32(v1177, v1155); - float32x2_t v1164 = vmul_f32(v1177, v1162); - float32x2_t v1171 = vmul_f32(v1177, v1169); - float32x2_t v1178 = vmul_f32(v1177, v1176); - float32x2_t v782 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v781)), 15); - float32x2_t v790 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v789)), 15); - float32x2_t v796 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v795)), 15); - float32x2_t v804 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v803)), 15); - float32x2_t v810 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v809)), 15); - float32x2_t v818 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v817)), 15); - float32x2_t v824 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v823)), 15); - float32x2_t v832 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v831)), 15); - float32x2_t v838 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v837)), 15); - float32x2_t v846 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v845)), 15); - float32x2_t v852 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v851)), 15); - float32x2_t v860 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v859)), 15); - float32x2_t v866 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v865)), 15); - float32x2_t v874 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v873)), 15); - float32x2_t v880 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v879)), 15); - float32x2_t v888 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v887)), 15); - float32x2_t v894 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v893)), 15); - float32x2_t v783 = vadd_f32(v776, v782); - float32x2_t v784 = vsub_f32(v776, v782); - float32x2_t v797 = vadd_f32(v790, v796); - float32x2_t v798 = vsub_f32(v796, v790); - float32x2_t v811 = vadd_f32(v804, v810); - float32x2_t v812 = vsub_f32(v804, v810); - float32x2_t v825 = vadd_f32(v818, v824); - float32x2_t v826 = vsub_f32(v824, v818); - float32x2_t v839 = vadd_f32(v832, v838); - float32x2_t v840 = vsub_f32(v832, v838); - float32x2_t v853 = vadd_f32(v846, v852); - float32x2_t v854 = vsub_f32(v852, v846); - float32x2_t v867 = vadd_f32(v860, v866); - float32x2_t v868 = vsub_f32(v860, v866); - float32x2_t v881 = vadd_f32(v874, v880); - float32x2_t v882 = vsub_f32(v880, v874); - float32x2_t v895 = vadd_f32(v888, v894); - float32x2_t v896 = vsub_f32(v888, v894); - float32x2_t v897 = vsub_f32(v783, v867); - float32x2_t v898 = vsub_f32(v797, v881); - float32x2_t v899 = vsub_f32(v811, v895); - float32x2_t v900 = vsub_f32(v825, v867); - float32x2_t v901 = vsub_f32(v839, v881); - float32x2_t v902 = vsub_f32(v853, v895); - float32x2_t v903 = vadd_f32(v783, v825); - float32x2_t v905 = vadd_f32(v797, v839); - float32x2_t v907 = vadd_f32(v811, v853); - float32x2_t v936 = vsub_f32(v784, v868); - float32x2_t v937 = vsub_f32(v798, v882); - float32x2_t v938 = vsub_f32(v812, v896); - float32x2_t v939 = vsub_f32(v826, v868); - float32x2_t v940 = vsub_f32(v840, v882); - float32x2_t v941 = vsub_f32(v854, v896); - float32x2_t v942 = vadd_f32(v784, v826); - float32x2_t v944 = vadd_f32(v798, v840); - float32x2_t v946 = vadd_f32(v812, v854); - float32x2_t v904 = vadd_f32(v903, v867); - float32x2_t v906 = vadd_f32(v905, v881); - float32x2_t v908 = vadd_f32(v907, v895); - float32x2_t v909 = vadd_f32(v897, v899); - float32x2_t v910 = vadd_f32(v900, v902); - float32x2_t v926 = vsub_f32(v897, 
v900); - float32x2_t v927 = vsub_f32(v899, v902); - float32x2_t v943 = vadd_f32(v942, v868); - float32x2_t v945 = vadd_f32(v944, v882); - float32x2_t v947 = vadd_f32(v946, v896); - float32x2_t v948 = vadd_f32(v936, v938); - float32x2_t v949 = vadd_f32(v939, v941); - float32x2_t v958 = vsub_f32(v936, v939); - float32x2_t v959 = vsub_f32(v938, v941); - float32x2_t v1003 = vmul_f32(v900, v1002); - float32x2_t v1015 = vmul_f32(v902, v1014); - float32x2_t v1023 = vmul_f32(v899, v1022); - float32x2_t v1102 = vrev64_f32(v939); - float32x2_t v1116 = vrev64_f32(v936); - float32x2_t v1123 = vrev64_f32(v941); - float32x2_t v1137 = vrev64_f32(v938); - float32x2_t v911 = vadd_f32(v904, v906); - float32x2_t v920 = vadd_f32(v910, v901); - float32x2_t v921 = vadd_f32(v909, v898); - float32x2_t v923 = vsub_f32(v910, v901); - float32x2_t v924 = vsub_f32(v909, v898); - float32x2_t v928 = vsub_f32(v897, v927); - float32x2_t v930 = vadd_f32(v926, v902); - float32x2_t v933 = vsub_f32(v904, v908); - float32x2_t v934 = vsub_f32(v906, v908); - float32x2_t v950 = vadd_f32(v943, v945); - float32x2_t v952 = vadd_f32(v949, v940); - float32x2_t v953 = vadd_f32(v948, v937); - float32x2_t v955 = vsub_f32(v949, v940); - float32x2_t v956 = vsub_f32(v948, v937); - float32x2_t v960 = vsub_f32(v936, v959); - float32x2_t v962 = vadd_f32(v958, v941); - float32x2_t v965 = vsub_f32(v943, v947); - float32x2_t v966 = vsub_f32(v945, v947); - float32x2_t v1007 = vmul_f32(v926, v1006); - float32x2_t v1019 = vmul_f32(v927, v1018); - float32x2_t v1103 = vmul_f32(v1102, v1101); - float32x2_t v1109 = vrev64_f32(v958); - float32x2_t v1124 = vmul_f32(v1123, v1122); - float32x2_t v1130 = vrev64_f32(v959); - float32x2_t v1138 = vmul_f32(v1137, v1136); - float32x2_t v912 = vadd_f32(v911, v908); - float32x2_t v922 = vsub_f32(v921, v920); - float32x2_t v925 = vsub_f32(v924, v923); - float32x2_t v929 = vsub_f32(v928, v901); - float32x2_t v931 = vsub_f32(v930, v898); - float32x2_t v935 = vadd_f32(v933, v934); - float32x2_t 
v951 = vadd_f32(v950, v947); - float32x2_t v954 = vsub_f32(v953, v952); - float32x2_t v957 = vsub_f32(v956, v955); - float32x2_t v961 = vsub_f32(v960, v940); - float32x2_t v963 = vsub_f32(v962, v937); - float32x2_t v967 = vadd_f32(v965, v966); - float32x2_t v979 = vmul_f32(v920, v978); - float32x2_t v983 = vmul_f32(v921, v982); - float32x2_t v991 = vmul_f32(v923, v990); - float32x2_t v995 = vmul_f32(v924, v994); - float32x2_t v1039 = vmul_f32(v933, v1038); - float32x2_t v1043 = vmul_f32(v934, v1042); - float32x2_t v1060 = vrev64_f32(v952); - float32x2_t v1067 = vrev64_f32(v953); - float32x2_t v1081 = vrev64_f32(v955); - float32x2_t v1088 = vrev64_f32(v956); - float32x2_t v1110 = vmul_f32(v1109, v1108); - float32x2_t v1131 = vmul_f32(v1130, v1129); - float32x2_t v1165 = vrev64_f32(v965); - float32x2_t v1172 = vrev64_f32(v966); - float32x2_t v919 = vadd_f32(v918, v912); - float32x2_t v932 = vsub_f32(v929, v931); - float32x2_t v964 = vsub_f32(v961, v963); - float32x2_t v975 = vmul_f32(v912, v974); - float32x2_t v987 = vmul_f32(v922, v986); - float32x2_t v999 = vmul_f32(v925, v998); - float32x2_t v1027 = vmul_f32(v929, v1026); - float32x2_t v1031 = vmul_f32(v931, v1030); - float32x2_t v1047 = vmul_f32(v935, v1046); - float32x2_t v1053 = vrev64_f32(v951); - float32x2_t v1061 = vmul_f32(v1060, v1059); - float32x2_t v1068 = vmul_f32(v1067, v1066); - float32x2_t v1074 = vrev64_f32(v954); - float32x2_t v1082 = vmul_f32(v1081, v1080); - float32x2_t v1089 = vmul_f32(v1088, v1087); - float32x2_t v1095 = vrev64_f32(v957); - float32x2_t v1144 = vrev64_f32(v961); - float32x2_t v1151 = vrev64_f32(v963); - float32x2_t v1166 = vmul_f32(v1165, v1164); - float32x2_t v1173 = vmul_f32(v1172, v1171); - float32x2_t v1179 = vrev64_f32(v967); - float32x2_t v1181 = vadd_f32(v979, v983); - float32x2_t v1182 = vadd_f32(v991, v995); - float32x2_t v1035 = vmul_f32(v932, v1034); - float32x2_t v1054 = vmul_f32(v1053, v1052); - float32x2_t v1075 = vmul_f32(v1074, v1073); - float32x2_t v1096 = 
vmul_f32(v1095, v1094); - float32x2_t v1145 = vmul_f32(v1144, v1143); - float32x2_t v1152 = vmul_f32(v1151, v1150); - float32x2_t v1158 = vrev64_f32(v964); - float32x2_t v1180 = vmul_f32(v1179, v1178); - float32x2_t v1184 = vadd_f32(v1181, v1182); - float32x2_t v1185 = vadd_f32(v979, v987); - float32x2_t v1186 = vadd_f32(v991, v999); - float32x2_t v1203 = vsub_f32(v1181, v1182); - float32x2_t v1205 = vsub_f32(v1039, v1047); - float32x2_t v1206 = vsub_f32(v1043, v1047); - float32x2_t v1207 = vadd_f32(v975, v919); - float32x2_t v1212 = vadd_f32(v1061, v1068); - float32x2_t v1213 = vadd_f32(v1082, v1089); - int16x4_t v1268 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v919, 15), (int32x2_t){0, 0})); - float32x2_t v1159 = vmul_f32(v1158, v1157); - float32x2_t v1183 = vadd_f32(v1031, v1035); - float32x2_t v1187 = vadd_f32(v1027, v1035); - float32x2_t v1188 = vsub_f32(v1003, v1184); - float32x2_t v1189 = vadd_f32(v1185, v1186); - float32x2_t v1195 = vsub_f32(v1185, v1186); - float32x2_t v1200 = vadd_f32(v1184, v1023); - float32x2_t v1208 = vadd_f32(v1207, v1205); - float32x2_t v1209 = vsub_f32(v1207, v1205); - float32x2_t v1211 = vadd_f32(v1207, v1206); - float32x2_t v1215 = vadd_f32(v1212, v1213); - float32x2_t v1216 = vadd_f32(v1061, v1075); - float32x2_t v1217 = vadd_f32(v1082, v1096); - float32x2_t v1234 = vsub_f32(v1212, v1213); - float32x2_t v1236 = vsub_f32(v1166, v1180); - float32x2_t v1237 = vsub_f32(v1173, v1180); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1268), 0); - float32x2_t v1190 = vsub_f32(v1015, v1187); - float32x2_t v1191 = vadd_f32(v1007, v1183); - float32x2_t v1193 = vadd_f32(v1189, v1019); - float32x2_t v1196 = vadd_f32(v1195, v1183); - float32x2_t v1197 = vadd_f32(v1188, v1189); - float32x2_t v1204 = vadd_f32(v1203, v1187); - float32x2_t v1210 = vsub_f32(v1209, v1206); - float32x2_t v1214 = vadd_f32(v1152, v1159); - float32x2_t v1218 = vadd_f32(v1145, v1159); - float32x2_t v1219 = vsub_f32(v1103, v1215); - float32x2_t v1220 = vadd_f32(v1216, 
v1217); - float32x2_t v1226 = vsub_f32(v1216, v1217); - float32x2_t v1231 = vadd_f32(v1215, v1138); - float32x2_t v1238 = vadd_f32(v1054, v1236); - float32x2_t v1239 = vsub_f32(v1054, v1236); - float32x2_t v1241 = vadd_f32(v1054, v1237); - float32x2_t v1192 = vadd_f32(v1191, v1188); - float32x2_t v1194 = vadd_f32(v1193, v1190); - float32x2_t v1198 = vfma_f32(v1197, v897, v1010); - float32x2_t v1201 = vadd_f32(v1200, v1190); - float32x2_t v1221 = vsub_f32(v1124, v1218); - float32x2_t v1222 = vadd_f32(v1110, v1214); - float32x2_t v1224 = vadd_f32(v1220, v1131); - float32x2_t v1227 = vadd_f32(v1226, v1214); - float32x2_t v1228 = vadd_f32(v1219, v1220); - float32x2_t v1235 = vadd_f32(v1234, v1218); - float32x2_t v1240 = vsub_f32(v1239, v1237); - float32x2_t v1246 = vsub_f32(v1204, v1196); - float32x2_t v1250 = vsub_f32(v1211, v1204); - float32x2_t v1253 = vadd_f32(v1196, v1211); - float32x2_t v1199 = vadd_f32(v1198, v1187); - float32x2_t v1202 = vadd_f32(v1201, v1183); - float32x2_t v1223 = vadd_f32(v1222, v1219); - float32x2_t v1225 = vadd_f32(v1224, v1221); - float32x2_t v1229 = vfma_f32(v1228, v1116, v1115); - float32x2_t v1232 = vadd_f32(v1231, v1221); - float32x2_t v1247 = vadd_f32(v1246, v1211); - float32x2_t v1251 = vadd_f32(v1192, v1208); - float32x2_t v1252 = vadd_f32(v1194, v1210); - float32x2_t v1258 = vsub_f32(v1235, v1227); - float32x2_t v1262 = vsub_f32(v1235, v1241); - float32x2_t v1265 = vadd_f32(v1227, v1241); - float32x2_t v1230 = vadd_f32(v1229, v1218); - float32x2_t v1233 = vadd_f32(v1232, v1214); - float32x2_t v1242 = vsub_f32(v1199, v1192); - float32x2_t v1244 = vsub_f32(v1202, v1194); - float32x2_t v1248 = vsub_f32(v1208, v1199); - float32x2_t v1249 = vsub_f32(v1210, v1202); - float32x2_t v1259 = vadd_f32(v1258, v1241); - float32x2_t v1263 = vadd_f32(v1223, v1238); - float32x2_t v1264 = vadd_f32(v1225, v1240); - float32x2_t v1286 = vsub_f32(v1253, v1265); - float32x2_t v1293 = vadd_f32(v1253, v1265); - float32x2_t v1300 = vadd_f32(v1250, v1262); 
- float32x2_t v1307 = vsub_f32(v1250, v1262); - float32x2_t v1243 = vadd_f32(v1242, v1208); - float32x2_t v1245 = vadd_f32(v1244, v1210); - float32x2_t v1254 = vsub_f32(v1230, v1223); - float32x2_t v1256 = vsub_f32(v1233, v1225); - float32x2_t v1260 = vsub_f32(v1238, v1230); - float32x2_t v1261 = vsub_f32(v1240, v1233); - int16x4_t v1289 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1286, 15), (int32x2_t){0, 0})); - int16x4_t v1296 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1293, 15), (int32x2_t){0, 0})); - int16x4_t v1303 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1300, 15), (int32x2_t){0, 0})); - int16x4_t v1310 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1307, 15), (int32x2_t){0, 0})); - float32x2_t v1314 = vadd_f32(v1252, v1264); - float32x2_t v1321 = vsub_f32(v1252, v1264); - float32x2_t v1328 = vadd_f32(v1247, v1259); - float32x2_t v1335 = vsub_f32(v1247, v1259); - float32x2_t v1370 = vsub_f32(v1251, v1263); - float32x2_t v1377 = vadd_f32(v1251, v1263); - float32x2_t v1255 = vadd_f32(v1254, v1238); - float32x2_t v1257 = vadd_f32(v1256, v1240); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1289), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1296), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1303), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1310), 0); - int16x4_t v1317 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1314, 15), (int32x2_t){0, 0})); - int16x4_t v1324 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1321, 15), (int32x2_t){0, 0})); - int16x4_t v1331 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1328, 15), (int32x2_t){0, 0})); - int16x4_t v1338 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1335, 15), (int32x2_t){0, 0})); - float32x2_t v1342 = vadd_f32(v1249, v1261); - float32x2_t v1349 = vsub_f32(v1249, v1261); - float32x2_t v1356 = vadd_f32(v1248, v1260); - float32x2_t v1363 = vsub_f32(v1248, v1260); - int16x4_t v1373 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1370, 15), (int32x2_t){0, 0})); - 
int16x4_t v1380 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1377, 15), (int32x2_t){0, 0})); - float32x2_t v1272 = vadd_f32(v1243, v1255); - float32x2_t v1279 = vsub_f32(v1243, v1255); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1317), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1324), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1331), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1338), 0); - int16x4_t v1345 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1342, 15), (int32x2_t){0, 0})); - int16x4_t v1352 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1349, 15), (int32x2_t){0, 0})); - int16x4_t v1359 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1356, 15), (int32x2_t){0, 0})); - int16x4_t v1366 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1363, 15), (int32x2_t){0, 0})); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1373), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1380), 0); - float32x2_t v1384 = vadd_f32(v1245, v1257); - float32x2_t v1391 = vsub_f32(v1245, v1257); - int16x4_t v1275 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1272, 15), (int32x2_t){0, 0})); - int16x4_t v1282 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1279, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1345), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1352), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1359), 0); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1366), 0); - int16x4_t v1387 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1384, 15), (int32x2_t){0, 0})); - int16x4_t v1394 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1391, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1275), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1282), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1387), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1394), 0); - v5 += 1 * 1; - v6 += 1 * 1; 
- } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu19(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v259 = -1.0555555555555556e+00F; - float v264 = 1.7752228513927079e-01F; - float v269 = -1.2820077502191529e-01F; - float v274 = 4.9321510117355499e-02F; - float v279 = 5.7611011491005903e-01F; - float v284 = -7.4996449655536279e-01F; - float v289 = -1.7385438164530381e-01F; - float v294 = -2.1729997561977314e+00F; - float v299 = -1.7021211726914738e+00F; - float v304 = 4.7087858350625778e-01F; - float v309 = -2.0239400846888440e+00F; - float v314 = 1.0551641201664090e-01F; - float v319 = 2.1294564967054850e+00F; - float v324 = -7.5087543897371167e-01F; - float v329 = 1.4812817695157160e-01F; - float v334 = 8.9900361592528333e-01F; - float v339 = -6.2148246772602778e-01F; - float v344 = -7.9869352098712687e-01F; - float v349 = -4.7339199623771833e-01F; - float v354 = 2.4216105241892630e-01F; - float v361 = 5.9368607967505101e-02F; - float v368 = -1.2578688255176201e-02F; - float v375 = 4.6789919712328903e-02F; - float v382 = 9.3750121913782358e-01F; - float v389 = 5.0111537043352902e-02F; - float v396 = 9.8761275618117661e-01F; - float v403 = 1.1745786501205959e+00F; - float v410 = -1.1114482296234993e+00F; - float v417 = -2.2860268797440955e+00F; - float v424 = -2.6420523257930939e-01F; - float v431 = -2.1981792779352136e+00F; - float v438 = -1.9339740453559042e+00F; - float v445 = 7.4825847091254893e-01F; - float v452 = 4.7820835642768872e-01F; - float v459 = -2.7005011448486022e-01F; - float v466 = 
3.4642356159542270e-01F; - float v473 = 8.3485429360688279e-01F; - float v480 = 3.9375928506743518e-01F; - const int32_t *v747 = &v5[v0]; - int32_t *v968 = &v6[v2]; - int64_t v27 = v0 * 18; - int64_t v37 = v0 * 2; - int64_t v45 = v0 * 17; - int64_t v55 = v0 * 4; - int64_t v63 = v0 * 15; - int64_t v73 = v0 * 8; - int64_t v81 = v0 * 11; - int64_t v91 = v0 * 16; - int64_t v99 = v0 * 3; - int64_t v109 = v0 * 13; - int64_t v117 = v0 * 6; - int64_t v127 = v0 * 7; - int64_t v135 = v0 * 12; - int64_t v145 = v0 * 14; - int64_t v153 = v0 * 5; - int64_t v163 = v0 * 9; - int64_t v171 = v0 * 10; - float v357 = v4 * v354; - float v364 = v4 * v361; - float v371 = v4 * v368; - float v378 = v4 * v375; - float v385 = v4 * v382; - float v392 = v4 * v389; - float v399 = v4 * v396; - float v406 = v4 * v403; - float v413 = v4 * v410; - float v420 = v4 * v417; - float v427 = v4 * v424; - float v434 = v4 * v431; - float v441 = v4 * v438; - float v448 = v4 * v445; - float v455 = v4 * v452; - float v462 = v4 * v459; - float v469 = v4 * v466; - float v476 = v4 * v473; - float v483 = v4 * v480; - int64_t v590 = v2 * 18; - int64_t v599 = v2 * 2; - int64_t v608 = v2 * 17; - int64_t v617 = v2 * 3; - int64_t v626 = v2 * 16; - int64_t v635 = v2 * 4; - int64_t v644 = v2 * 15; - int64_t v653 = v2 * 5; - int64_t v662 = v2 * 14; - int64_t v671 = v2 * 6; - int64_t v680 = v2 * 13; - int64_t v689 = v2 * 7; - int64_t v698 = v2 * 12; - int64_t v707 = v2 * 8; - int64_t v716 = v2 * 11; - int64_t v725 = v2 * 9; - int64_t v734 = v2 * 10; - const int32_t *v910 = &v5[0]; - svfloat32_t v914 = svdup_n_f32(v259); - svfloat32_t v915 = svdup_n_f32(v264); - svfloat32_t v916 = svdup_n_f32(v269); - svfloat32_t v917 = svdup_n_f32(v274); - svfloat32_t v918 = svdup_n_f32(v279); - svfloat32_t v919 = svdup_n_f32(v284); - svfloat32_t v920 = svdup_n_f32(v289); - svfloat32_t v921 = svdup_n_f32(v294); - svfloat32_t v922 = svdup_n_f32(v299); - svfloat32_t v923 = svdup_n_f32(v304); - svfloat32_t v924 = svdup_n_f32(v309); - 
svfloat32_t v925 = svdup_n_f32(v314); - svfloat32_t v926 = svdup_n_f32(v319); - svfloat32_t v927 = svdup_n_f32(v324); - svfloat32_t v928 = svdup_n_f32(v329); - svfloat32_t v929 = svdup_n_f32(v334); - svfloat32_t v930 = svdup_n_f32(v339); - svfloat32_t v931 = svdup_n_f32(v344); - svfloat32_t v932 = svdup_n_f32(v349); - int32_t *v959 = &v6[0]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v747[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v756 = &v5[v27]; - const int32_t *v765 = &v5[v37]; - const int32_t *v774 = &v5[v45]; - const int32_t *v783 = &v5[v55]; - const int32_t *v792 = &v5[v63]; - const int32_t *v801 = &v5[v73]; - const int32_t *v810 = &v5[v81]; - const int32_t *v819 = &v5[v91]; - const int32_t *v828 = &v5[v99]; - const int32_t *v837 = &v5[v109]; - const int32_t *v846 = &v5[v117]; - const int32_t *v855 = &v5[v127]; - const int32_t *v864 = &v5[v135]; - const int32_t *v873 = &v5[v145]; - const int32_t *v882 = &v5[v153]; - const int32_t *v891 = &v5[v163]; - const int32_t *v900 = &v5[v171]; - svfloat32_t v933 = svdup_n_f32(v357); - svfloat32_t v934 = svdup_n_f32(v364); - svfloat32_t v935 = svdup_n_f32(v371); - svfloat32_t v936 = svdup_n_f32(v378); - svfloat32_t v937 = svdup_n_f32(v385); - svfloat32_t v938 = svdup_n_f32(v392); - svfloat32_t v939 = svdup_n_f32(v399); - svfloat32_t v940 = svdup_n_f32(v406); - svfloat32_t v941 = svdup_n_f32(v413); - svfloat32_t v942 = svdup_n_f32(v420); - svfloat32_t v943 = svdup_n_f32(v427); - svfloat32_t v944 = svdup_n_f32(v434); - svfloat32_t v945 = svdup_n_f32(v441); - svfloat32_t v946 = svdup_n_f32(v448); - svfloat32_t v947 = svdup_n_f32(v455); - svfloat32_t v948 = svdup_n_f32(v462); - svfloat32_t v949 = svdup_n_f32(v469); - svfloat32_t v950 = svdup_n_f32(v476); - svfloat32_t v951 = svdup_n_f32(v483); - int32_t *v977 = &v6[v590]; - int32_t *v986 = &v6[v599]; - int32_t *v995 = &v6[v608]; - int32_t *v1004 = &v6[v617]; - int32_t *v1013 = &v6[v626]; - 
int32_t *v1022 = &v6[v635]; - int32_t *v1031 = &v6[v644]; - int32_t *v1040 = &v6[v653]; - int32_t *v1049 = &v6[v662]; - int32_t *v1058 = &v6[v671]; - int32_t *v1067 = &v6[v680]; - int32_t *v1076 = &v6[v689]; - int32_t *v1085 = &v6[v698]; - int32_t *v1094 = &v6[v707]; - int32_t *v1103 = &v6[v716]; - int32_t *v1112 = &v6[v725]; - int32_t *v1121 = &v6[v734]; - svfloat32_t v203 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v910[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v756[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v765[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v774[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v783[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v792[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v801[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v810[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v819[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v105 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v828[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v115 = svmul_n_f32_x( - pred_full, - 
svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v837[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v123 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v846[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v855[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v864[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v873[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v159 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v882[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v169 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v891[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v177 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v900[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v51), "w"(v43)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v87), "w"(v79)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v123), "w"(v115)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v159), "w"(v151)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v169), "w"(v177)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v177)); - svfloat32_t v180; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v180) : "w"(v34), "w"(v142)); - svfloat32_t v181; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v181) : "w"(v52), "w"(v160)); - svfloat32_t v182; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v182) : "w"(v70), "w"(v178)); - svfloat32_t v183; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v183) : "w"(v88), "w"(v142)); - svfloat32_t v184; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v184) : "w"(v106), "w"(v160)); - svfloat32_t v185; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v185) : "w"(v124), "w"(v178)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v34), "w"(v88)); - svfloat32_t v188; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v52), "w"(v106)); - svfloat32_t v190; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v190) : "w"(v70), "w"(v124)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v35), "w"(v143)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v53), "w"(v161)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v71), "w"(v179)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v89), "w"(v143)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v107), "w"(v161)); - 
svfloat32_t v226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v125), "w"(v179)); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v35), "w"(v89)); - svfloat32_t v229; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v53), "w"(v107)); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v71), "w"(v125)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v186), "w"(v142)); - svfloat32_t v189; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v189) : "w"(v188), "w"(v160)); - svfloat32_t v191; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v191) : "w"(v190), "w"(v178)); - svfloat32_t v192; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v192) : "w"(v180), "w"(v182)); - svfloat32_t v193; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v193) : "w"(v183), "w"(v185)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v180), "w"(v183)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v182), "w"(v185)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v227), "w"(v143)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v229), "w"(v161)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v231), "w"(v179)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v221), "w"(v223)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v224), "w"(v226)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v221), "w"(v224)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v223), "w"(v226)); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); - svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v940, v224, 90); - svfloat32_t zero429; - asm volatile("mov %0.s, #0" : "=w"(zero429)); - svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v943, v226, 90); - svfloat32_t v194; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v194) : "w"(v187), "w"(v189)); - svfloat32_t v205; - asm("fadd %0.s, %1.s, %2.s" 
: "=w"(v205) : "w"(v193), "w"(v184)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v192), "w"(v181)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v193), "w"(v184)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v192), "w"(v181)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v180), "w"(v212)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v211), "w"(v185)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v187), "w"(v191)); - svfloat32_t v219; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v189), "w"(v191)); - svfloat32_t v235; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v228), "w"(v230)); - svfloat32_t v237; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v234), "w"(v225)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v233), "w"(v222)); - svfloat32_t v240; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v234), "w"(v225)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v233), "w"(v222)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v221), "w"(v244)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v243), "w"(v226)); - svfloat32_t v250; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v228), "w"(v232)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v230), "w"(v232)); - svfloat32_t v195; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v195) : "w"(v194), "w"(v191)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v206), "w"(v205)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v209), "w"(v208)); - svfloat32_t v214; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v213), "w"(v184)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v215), "w"(v181)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v218), "w"(v219)); - svfloat32_t v236; - 
asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v235), "w"(v232)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v238), "w"(v237)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v241), "w"(v240)); - svfloat32_t v246; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v245), "w"(v225)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v247), "w"(v222)); - svfloat32_t v252; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v250), "w"(v251)); - svfloat32_t v272; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v272) : "w"(v206), "w"(v916)); - svfloat32_t v287; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v209), "w"(v919)); - svfloat32_t zero366; - asm volatile("mov %0.s, #0" : "=w"(zero366)); - svfloat32_t v366 = svcmla_f32_x(pred_full, zero366, v934, v237, 90); - svfloat32_t zero387; - asm volatile("mov %0.s, #0" : "=w"(zero387)); - svfloat32_t v387 = svcmla_f32_x(pred_full, zero387, v937, v240, 90); - svfloat32_t zero471; - asm volatile("mov %0.s, #0" : "=w"(zero471)); - svfloat32_t v471 = svcmla_f32_x(pred_full, zero471, v949, v250, 90); - svfloat32_t zero478; - asm volatile("mov %0.s, #0" : "=w"(zero478)); - svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v950, v251, 90); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v203), "w"(v195)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v214), "w"(v216)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v246), "w"(v248)); - svfloat32_t v277; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v277) : "w"(v207), "w"(v917)); - svfloat32_t v292; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v210), "w"(v920)); - svfloat32_t v352; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v352) : "w"(v220), "w"(v932)); - svfloat32_t zero359; - asm volatile("mov %0.s, #0" : "=w"(zero359)); - svfloat32_t v359 = svcmla_f32_x(pred_full, zero359, v933, v236, 90); - svfloat32_t zero485; - asm volatile("mov %0.s, #0" : "=w"(zero485)); - 
svfloat32_t v485 = svcmla_f32_x(pred_full, zero485, v951, v252, 90); - svfloat32_t v486 = svmla_f32_x(pred_full, v272, v205, v915); - svfloat32_t v487 = svmla_f32_x(pred_full, v287, v208, v918); - svfloat32_t v517 = svcmla_f32_x(pred_full, v366, v935, v238, 90); - svfloat32_t v518 = svcmla_f32_x(pred_full, v387, v938, v241, 90); - svfloat32_t v337; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v217), "w"(v929)); - svfloat32_t zero464; - asm volatile("mov %0.s, #0" : "=w"(zero464)); - svfloat32_t v464 = svcmla_f32_x(pred_full, zero464, v948, v249, 90); - svfloat32_t v489; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v486), "w"(v487)); - svfloat32_t v490 = svmla_f32_x(pred_full, v277, v205, v915); - svfloat32_t v491 = svmla_f32_x(pred_full, v292, v208, v918); - svfloat32_t v508; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v486), "w"(v487)); - svfloat32_t v510 = svnmls_f32_x(pred_full, v352, v218, v930); - svfloat32_t v511 = svnmls_f32_x(pred_full, v352, v219, v931); - svfloat32_t v512 = svmla_f32_x(pred_full, v204, v195, v914); - svfloat32_t v520; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v520) : "w"(v517), "w"(v518)); - svfloat32_t v521 = svcmla_f32_x(pred_full, v366, v936, v239, 90); - svfloat32_t v522 = svcmla_f32_x(pred_full, v387, v939, v242, 90); - svfloat32_t v539; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v539) : "w"(v517), "w"(v518)); - svfloat32_t v541; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v541) : "w"(v471), "w"(v485)); - svfloat32_t v542; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v542) : "w"(v478), "w"(v485)); - svint16_t v573 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v204, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v488 = svmla_f32_x(pred_full, v337, v216, v928); - svfloat32_t v492 = svmla_f32_x(pred_full, v337, v214, v927); - svfloat32_t v493 = svnmls_f32_x(pred_full, v489, v183, v921); - svfloat32_t v494; - asm("fadd %0.s, 
%1.s, %2.s" : "=w"(v494) : "w"(v490), "w"(v491)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v490), "w"(v491)); - svfloat32_t v505 = svmla_f32_x(pred_full, v489, v182, v926); - svfloat32_t v513; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v513) : "w"(v512), "w"(v510)); - svfloat32_t v514; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v514) : "w"(v512), "w"(v510)); - svfloat32_t v516; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v516) : "w"(v512), "w"(v511)); - svfloat32_t v519 = svcmla_f32_x(pred_full, v464, v947, v248, 90); - svfloat32_t v523 = svcmla_f32_x(pred_full, v464, v946, v246, 90); - svfloat32_t v524; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v524) : "w"(v408), "w"(v520)); - svfloat32_t v525; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v525) : "w"(v521), "w"(v522)); - svfloat32_t v531; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v521), "w"(v522)); - svfloat32_t v536 = svcmla_f32_x(pred_full, v520, v945, v223, 90); - svfloat32_t v543; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v543) : "w"(v359), "w"(v541)); - svfloat32_t v544; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v544) : "w"(v359), "w"(v541)); - svfloat32_t v546; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v359), "w"(v542)); - svst1w_u64(pred_full, (unsigned *)(v959), svreinterpret_u64_s16(v573)); - svfloat32_t v495 = svnmls_f32_x(pred_full, v492, v185, v924); - svfloat32_t v496 = svmla_f32_x(pred_full, v488, v211, v922); - svfloat32_t v498 = svmla_f32_x(pred_full, v494, v212, v925); - svfloat32_t v501; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v501) : "w"(v500), "w"(v488)); - svfloat32_t v502; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v502) : "w"(v493), "w"(v494)); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v508), "w"(v492)); - svfloat32_t v515; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v515) : "w"(v514), "w"(v511)); - svfloat32_t v526; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v526) : "w"(v429), "w"(v523)); - svfloat32_t v527 = svcmla_f32_x(pred_full, v519, v941, v243, 90); - svfloat32_t v529 = 
svcmla_f32_x(pred_full, v525, v944, v244, 90); - svfloat32_t v532; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v532) : "w"(v531), "w"(v519)); - svfloat32_t v533; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v533) : "w"(v524), "w"(v525)); - svfloat32_t v540; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v540) : "w"(v539), "w"(v523)); - svfloat32_t v545; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v545) : "w"(v544), "w"(v542)); - svfloat32_t v497; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v497) : "w"(v496), "w"(v493)); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v498), "w"(v495)); - svfloat32_t v503 = svmla_f32_x(pred_full, v502, v180, v923); - svfloat32_t v506; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v505), "w"(v495)); - svfloat32_t v528; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v527), "w"(v524)); - svfloat32_t v530; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v529), "w"(v526)); - svfloat32_t v534 = svcmla_f32_x(pred_full, v533, v942, v221, 90); - svfloat32_t v537; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v536), "w"(v526)); - svfloat32_t v551; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v551) : "w"(v509), "w"(v501)); - svfloat32_t v555; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v516), "w"(v509)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v501), "w"(v516)); - svfloat32_t v563; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v540), "w"(v532)); - svfloat32_t v567; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v567) : "w"(v540), "w"(v546)); - svfloat32_t v570; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v570) : "w"(v532), "w"(v546)); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v492)); - svfloat32_t v507; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v507) : "w"(v506), "w"(v488)); - svfloat32_t v535; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v534), "w"(v523)); - svfloat32_t v538; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v538) : "w"(v537), "w"(v519)); - svfloat32_t v552; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v552) : "w"(v551), "w"(v516)); - svfloat32_t v556; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v497), "w"(v513)); - svfloat32_t v557; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v499), "w"(v515)); - svfloat32_t v564; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v563), "w"(v546)); - svfloat32_t v568; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v568) : "w"(v528), "w"(v543)); - svfloat32_t v569; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v569) : "w"(v530), "w"(v545)); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v558), "w"(v570)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v558), "w"(v570)); - svfloat32_t v615; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v555), "w"(v567)); - svfloat32_t v624; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v555), "w"(v567)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v504), "w"(v497)); - svfloat32_t v549; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v549) : "w"(v507), "w"(v499)); - svfloat32_t v553; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v553) : "w"(v513), "w"(v504)); - svfloat32_t v554; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v554) : "w"(v515), "w"(v507)); - svfloat32_t v559; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v535), "w"(v528)); - svfloat32_t v561; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v561) : "w"(v538), "w"(v530)); - svfloat32_t v565; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v565) : "w"(v543), "w"(v535)); - svfloat32_t v566; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v566) : "w"(v545), "w"(v538)); - svint16_t v600 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v597, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v609 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v606, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - 
svint16_t v618 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v615, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v627 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v633; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v633) : "w"(v557), "w"(v569)); - svfloat32_t v642; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v642) : "w"(v557), "w"(v569)); - svfloat32_t v651; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v651) : "w"(v552), "w"(v564)); - svfloat32_t v660; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v660) : "w"(v552), "w"(v564)); - svfloat32_t v705; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v705) : "w"(v556), "w"(v568)); - svfloat32_t v714; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v714) : "w"(v556), "w"(v568)); - svfloat32_t v548; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v548) : "w"(v547), "w"(v513)); - svfloat32_t v550; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v550) : "w"(v549), "w"(v515)); - svfloat32_t v560; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v560) : "w"(v559), "w"(v543)); - svfloat32_t v562; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v562) : "w"(v561), "w"(v545)); - svint16_t v636 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v633, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v645 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v642, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v654 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v651, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v663 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v660, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v669; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v669) : "w"(v554), "w"(v566)); - svfloat32_t v678; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v678) : "w"(v554), "w"(v566)); - svfloat32_t v687; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v687) : "w"(v553), "w"(v565)); - svfloat32_t v696; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v553), "w"(v565)); - svint16_t v708 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v705, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v717 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v714, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v986), svreinterpret_u64_s16(v600)); - svst1w_u64(pred_full, (unsigned *)(v995), svreinterpret_u64_s16(v609)); - svst1w_u64(pred_full, (unsigned *)(v1004), svreinterpret_u64_s16(v618)); - svst1w_u64(pred_full, (unsigned *)(v1013), svreinterpret_u64_s16(v627)); - svfloat32_t v579; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v579) : "w"(v548), "w"(v560)); - svfloat32_t v588; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v588) : "w"(v548), "w"(v560)); - svint16_t v672 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v669, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v681 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v678, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v690 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v687, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v699 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v696, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v723; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v723) : "w"(v550), "w"(v562)); - svfloat32_t v732; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v732) : "w"(v550), "w"(v562)); - svst1w_u64(pred_full, (unsigned *)(v1022), svreinterpret_u64_s16(v636)); - svst1w_u64(pred_full, (unsigned *)(v1031), svreinterpret_u64_s16(v645)); - svst1w_u64(pred_full, (unsigned *)(v1040), svreinterpret_u64_s16(v654)); - svst1w_u64(pred_full, (unsigned *)(v1049), svreinterpret_u64_s16(v663)); - svst1w_u64(pred_full, (unsigned *)(v1094), svreinterpret_u64_s16(v708)); - svst1w_u64(pred_full, (unsigned *)(v1103), svreinterpret_u64_s16(v717)); - svint16_t v582 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v579, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v591 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v588, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v726 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v723, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v735 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v732, (float)(1ULL << 31ULL)))), - 
svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v1058), svreinterpret_u64_s16(v672)); - svst1w_u64(pred_full, (unsigned *)(v1067), svreinterpret_u64_s16(v681)); - svst1w_u64(pred_full, (unsigned *)(v1076), svreinterpret_u64_s16(v690)); - svst1w_u64(pred_full, (unsigned *)(v1085), svreinterpret_u64_s16(v699)); - svst1w_u64(pred_full, (unsigned *)(v968), svreinterpret_u64_s16(v582)); - svst1w_u64(pred_full, (unsigned *)(v977), svreinterpret_u64_s16(v591)); - svst1w_u64(pred_full, (unsigned *)(v1112), svreinterpret_u64_s16(v726)); - svst1w_u64(pred_full, (unsigned *)(v1121), svreinterpret_u64_s16(v735)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v605 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v346 = 1.5388417685876268e+00F; - float v354 = 5.8778525229247325e-01F; - float v362 = 3.6327126400268028e-01F; - float v387 = 1.0000000000000000e+00F; - float v388 = -1.0000000000000000e+00F; - float v395 = -1.2500000000000000e+00F; - float v396 = 1.2500000000000000e+00F; - float v403 = 5.5901699437494745e-01F; - float v404 = -5.5901699437494745e-01F; - float32x2_t v406 = (float32x2_t){v4, v4}; - float v412 = -1.5388417685876268e+00F; - float v417 = -5.8778525229247325e-01F; - float v422 = -3.6327126400268028e-01F; - const int32_t *v1266 = &v5[istride]; - int32_t *v1330 = &v6[ostride]; - float32x2_t v338 = (float32x2_t){v395, v395}; - float32x2_t v343 = (float32x2_t){v403, v403}; - float32x2_t v348 = (float32x2_t){v346, v412}; - float32x2_t v356 = (float32x2_t){v354, v417}; - float32x2_t v364 = (float32x2_t){v362, v422}; - float32x2_t v389 = (float32x2_t){v387, 
v388}; - float32x2_t v397 = (float32x2_t){v395, v396}; - float32x2_t v405 = (float32x2_t){v403, v404}; - float32x2_t v413 = (float32x2_t){v412, v412}; - float32x2_t v418 = (float32x2_t){v417, v417}; - float32x2_t v423 = (float32x2_t){v422, v422}; - const int32_t *v1104 = &v5[0]; - int32_t *v1285 = &v6[0]; - int16x4_t v1496 = vld1_s16((const int16_t *)v1266); - float32x4_t v198 = vcvtq_n_f32_s32(vmovl_s16(v1496), 15); - float32x4_t v339 = vcombine_f32(v338, v338); - float32x4_t v344 = vcombine_f32(v343, v343); - float32x2_t v350 = vmul_f32(v406, v348); - float32x2_t v358 = vmul_f32(v406, v356); - float32x2_t v366 = vmul_f32(v406, v364); - float32x2_t v391 = vmul_f32(v406, v389); - float32x2_t v399 = vmul_f32(v406, v397); - float32x2_t v407 = vmul_f32(v406, v405); - float32x4_t v414 = vcombine_f32(v413, v413); - float32x4_t v419 = vcombine_f32(v418, v418); - float32x4_t v424 = vcombine_f32(v423, v423); - const int32_t *v1113 = &v5[istride * 10]; - const int32_t *v1122 = &v5[istride * 5]; - const int32_t *v1131 = &v5[istride * 15]; - const int32_t *v1140 = &v5[istride * 4]; - const int32_t *v1149 = &v5[istride * 14]; - const int32_t *v1158 = &v5[istride * 9]; - const int32_t *v1167 = &v5[istride * 19]; - const int32_t *v1176 = &v5[istride * 8]; - const int32_t *v1185 = &v5[istride * 18]; - const int32_t *v1194 = &v5[istride * 13]; - const int32_t *v1203 = &v5[istride * 3]; - const int32_t *v1212 = &v5[istride * 12]; - const int32_t *v1221 = &v5[istride * 2]; - const int32_t *v1230 = &v5[istride * 17]; - const int32_t *v1239 = &v5[istride * 7]; - const int32_t *v1248 = &v5[istride * 16]; - const int32_t *v1257 = &v5[istride * 6]; - const int32_t *v1275 = &v5[istride * 11]; - int32_t *v1294 = &v6[ostride * 5]; - int32_t *v1303 = &v6[ostride * 10]; - int32_t *v1312 = &v6[ostride * 15]; - int32_t *v1321 = &v6[ostride * 16]; - int32_t *v1339 = &v6[ostride * 6]; - int32_t *v1348 = &v6[ostride * 11]; - int32_t *v1357 = &v6[ostride * 12]; - int32_t *v1366 = &v6[ostride * 17]; 
- int32_t *v1375 = &v6[ostride * 2]; - int32_t *v1384 = &v6[ostride * 7]; - int32_t *v1393 = &v6[ostride * 8]; - int32_t *v1402 = &v6[ostride * 13]; - int32_t *v1411 = &v6[ostride * 18]; - int32_t *v1420 = &v6[ostride * 3]; - int32_t *v1429 = &v6[ostride * 4]; - int32_t *v1438 = &v6[ostride * 9]; - int32_t *v1447 = &v6[ostride * 14]; - int32_t *v1456 = &v6[ostride * 19]; - int16x4_t v1460 = vld1_s16((const int16_t *)v1104); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1460), 15); - float32x4_t v352 = vcombine_f32(v350, v350); - float32x4_t v360 = vcombine_f32(v358, v358); - float32x4_t v368 = vcombine_f32(v366, v366); - float32x4_t v393 = vcombine_f32(v391, v391); - float32x4_t v401 = vcombine_f32(v399, v399); - float32x4_t v409 = vcombine_f32(v407, v407); - int16x4_t v1462 = vld1_s16((const int16_t *)v1113); - int16x4_t v1464 = vld1_s16((const int16_t *)v1122); - int16x4_t v1466 = vld1_s16((const int16_t *)v1131); - int16x4_t v1468 = vld1_s16((const int16_t *)v1140); - int16x4_t v1470 = vld1_s16((const int16_t *)v1149); - int16x4_t v1472 = vld1_s16((const int16_t *)v1158); - int16x4_t v1474 = vld1_s16((const int16_t *)v1167); - int16x4_t v1476 = vld1_s16((const int16_t *)v1176); - int16x4_t v1478 = vld1_s16((const int16_t *)v1185); - int16x4_t v1480 = vld1_s16((const int16_t *)v1194); - int16x4_t v1482 = vld1_s16((const int16_t *)v1203); - int16x4_t v1484 = vld1_s16((const int16_t *)v1212); - int16x4_t v1486 = vld1_s16((const int16_t *)v1221); - int16x4_t v1488 = vld1_s16((const int16_t *)v1230); - int16x4_t v1490 = vld1_s16((const int16_t *)v1239); - int16x4_t v1492 = vld1_s16((const int16_t *)v1248); - int16x4_t v1494 = vld1_s16((const int16_t *)v1257); - int16x4_t v1498 = vld1_s16((const int16_t *)v1275); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1462), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1464), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1466), 15); - float32x4_t v66 = vcvtq_n_f32_s32(vmovl_s16(v1468), 15); - float32x4_t v74 = 
vcvtq_n_f32_s32(vmovl_s16(v1470), 15); - float32x4_t v84 = vcvtq_n_f32_s32(vmovl_s16(v1472), 15); - float32x4_t v92 = vcvtq_n_f32_s32(vmovl_s16(v1474), 15); - float32x4_t v104 = vcvtq_n_f32_s32(vmovl_s16(v1476), 15); - float32x4_t v112 = vcvtq_n_f32_s32(vmovl_s16(v1478), 15); - float32x4_t v122 = vcvtq_n_f32_s32(vmovl_s16(v1480), 15); - float32x4_t v130 = vcvtq_n_f32_s32(vmovl_s16(v1482), 15); - float32x4_t v142 = vcvtq_n_f32_s32(vmovl_s16(v1484), 15); - float32x4_t v150 = vcvtq_n_f32_s32(vmovl_s16(v1486), 15); - float32x4_t v160 = vcvtq_n_f32_s32(vmovl_s16(v1488), 15); - float32x4_t v168 = vcvtq_n_f32_s32(vmovl_s16(v1490), 15); - float32x4_t v180 = vcvtq_n_f32_s32(vmovl_s16(v1492), 15); - float32x4_t v188 = vcvtq_n_f32_s32(vmovl_s16(v1494), 15); - float32x4_t v206 = vcvtq_n_f32_s32(vmovl_s16(v1498), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v75 = vaddq_f32(v66, v74); - float32x4_t v76 = vsubq_f32(v66, v74); - float32x4_t v93 = vaddq_f32(v84, v92); - float32x4_t v94 = vsubq_f32(v84, v92); - float32x4_t v113 = vaddq_f32(v104, v112); - float32x4_t v114 = vsubq_f32(v104, v112); - float32x4_t v131 = vaddq_f32(v122, v130); - float32x4_t v132 = vsubq_f32(v122, v130); - float32x4_t v151 = vaddq_f32(v142, v150); - float32x4_t v152 = vsubq_f32(v142, v150); - float32x4_t v169 = vaddq_f32(v160, v168); - float32x4_t v170 = vsubq_f32(v160, v168); - float32x4_t v189 = vaddq_f32(v180, v188); - float32x4_t v190 = vsubq_f32(v180, v188); - float32x4_t v207 = vaddq_f32(v198, v206); - float32x4_t v208 = vsubq_f32(v198, v206); - float32x4_t v57 = vaddq_f32(v37, v55); - float32x4_t v58 = vsubq_f32(v37, v55); - float32x4_t v95 = vaddq_f32(v75, v93); - float32x4_t v96 = vsubq_f32(v75, v93); - float32x4_t v133 = vaddq_f32(v113, v131); - float32x4_t v134 = vsubq_f32(v113, v131); - float32x4_t v171 = vaddq_f32(v151, v169); - float32x4_t v172 = 
vsubq_f32(v151, v169); - float32x4_t v209 = vaddq_f32(v189, v207); - float32x4_t v210 = vsubq_f32(v189, v207); - float32x4_t v323 = vaddq_f32(v76, v190); - float32x4_t v324 = vsubq_f32(v76, v190); - float32x4_t v325 = vaddq_f32(v152, v114); - float32x4_t v326 = vsubq_f32(v152, v114); - float32x4_t v379 = vaddq_f32(v94, v208); - float32x4_t v380 = vsubq_f32(v94, v208); - float32x4_t v381 = vaddq_f32(v170, v132); - float32x4_t v382 = vsubq_f32(v170, v132); - float32x4_t v211 = vaddq_f32(v95, v209); - float32x4_t v212 = vsubq_f32(v95, v209); - float32x4_t v213 = vaddq_f32(v171, v133); - float32x4_t v214 = vsubq_f32(v171, v133); - float32x4_t v267 = vaddq_f32(v96, v210); - float32x4_t v268 = vsubq_f32(v96, v210); - float32x4_t v269 = vaddq_f32(v172, v134); - float32x4_t v270 = vsubq_f32(v172, v134); - float32x4_t v327 = vaddq_f32(v323, v325); - float32x4_t v328 = vsubq_f32(v323, v325); - float32x4_t v329 = vaddq_f32(v324, v326); - float32x4_t v351 = vrev64q_f32(v324); - float32x4_t v367 = vrev64q_f32(v326); - float32x4_t v383 = vaddq_f32(v379, v381); - float32x4_t v384 = vsubq_f32(v379, v381); - float32x4_t v385 = vaddq_f32(v380, v382); - float32x4_t v415 = vmulq_f32(v380, v414); - float32x4_t v425 = vmulq_f32(v382, v424); - float32x4_t v215 = vaddq_f32(v211, v213); - float32x4_t v216 = vsubq_f32(v211, v213); - float32x4_t v217 = vaddq_f32(v212, v214); - float32x4_t v239 = vrev64q_f32(v212); - float32x4_t v255 = vrev64q_f32(v214); - float32x4_t v271 = vaddq_f32(v267, v269); - float32x4_t v272 = vsubq_f32(v267, v269); - float32x4_t v273 = vaddq_f32(v268, v270); - float32x4_t v295 = vrev64q_f32(v268); - float32x4_t v311 = vrev64q_f32(v270); - float32x4_t v330 = vaddq_f32(v327, v38); - float32x4_t v340 = vmulq_f32(v327, v339); - float32x4_t v345 = vmulq_f32(v328, v344); - float32x4_t v353 = vmulq_f32(v351, v352); - float32x4_t v359 = vrev64q_f32(v329); - float32x4_t v369 = vmulq_f32(v367, v368); - float32x4_t v386 = vaddq_f32(v383, v56); - float32x4_t v400 = 
vrev64q_f32(v383); - float32x4_t v408 = vrev64q_f32(v384); - float32x4_t v420 = vmulq_f32(v385, v419); - float32x4_t v218 = vaddq_f32(v215, v57); - float32x4_t v228 = vmulq_f32(v215, v339); - float32x4_t v233 = vmulq_f32(v216, v344); - float32x4_t v241 = vmulq_f32(v239, v352); - float32x4_t v247 = vrev64q_f32(v217); - float32x4_t v257 = vmulq_f32(v255, v368); - float32x4_t v274 = vaddq_f32(v271, v58); - float32x4_t v284 = vmulq_f32(v271, v339); - float32x4_t v289 = vmulq_f32(v272, v344); - float32x4_t v297 = vmulq_f32(v295, v352); - float32x4_t v303 = vrev64q_f32(v273); - float32x4_t v313 = vmulq_f32(v311, v368); - float32x4_t v361 = vmulq_f32(v359, v360); - float32x4_t v370 = vaddq_f32(v330, v340); - float32x4_t v392 = vrev64q_f32(v386); - float32x4_t v402 = vmulq_f32(v400, v401); - float32x4_t v410 = vmulq_f32(v408, v409); - float32x4_t v429 = vsubq_f32(v415, v420); - float32x4_t v430 = vaddq_f32(v420, v425); - float32x4_t v249 = vmulq_f32(v247, v360); - float32x4_t v258 = vaddq_f32(v218, v228); - float32x4_t v305 = vmulq_f32(v303, v360); - float32x4_t v314 = vaddq_f32(v274, v284); - float32x4_t v371 = vaddq_f32(v370, v345); - float32x4_t v372 = vsubq_f32(v370, v345); - float32x4_t v373 = vsubq_f32(v353, v361); - float32x4_t v374 = vaddq_f32(v361, v369); - float32x4_t v394 = vmulq_f32(v392, v393); - int16x4_t v439 = vqmovn_s32(vcvtq_n_s32_f32(v218, 15)); - int16x4_t v455 = vqmovn_s32(vcvtq_n_s32_f32(v274, 15)); - float32x4_t v259 = vaddq_f32(v258, v233); - float32x4_t v260 = vsubq_f32(v258, v233); - float32x4_t v261 = vsubq_f32(v241, v249); - float32x4_t v262 = vaddq_f32(v249, v257); - float32x4_t v315 = vaddq_f32(v314, v289); - float32x4_t v316 = vsubq_f32(v314, v289); - float32x4_t v317 = vsubq_f32(v297, v305); - float32x4_t v318 = vaddq_f32(v305, v313); - float32x4_t v375 = vaddq_f32(v371, v373); - float32x4_t v376 = vsubq_f32(v371, v373); - float32x4_t v377 = vaddq_f32(v372, v374); - float32x4_t v378 = vsubq_f32(v372, v374); - float32x4_t v426 = 
vaddq_f32(v394, v402); - float32x4_t v435 = vaddq_f32(v330, v394); - float32x4_t v436 = vsubq_f32(v330, v394); - vst1_s16((int16_t *)v1285, v439); - vst1_s16((int16_t *)v1303, v455); - float32x4_t v263 = vaddq_f32(v259, v261); - float32x4_t v264 = vsubq_f32(v259, v261); - float32x4_t v265 = vaddq_f32(v260, v262); - float32x4_t v266 = vsubq_f32(v260, v262); - float32x4_t v319 = vaddq_f32(v315, v317); - float32x4_t v320 = vsubq_f32(v315, v317); - float32x4_t v321 = vaddq_f32(v316, v318); - float32x4_t v322 = vsubq_f32(v316, v318); - float32x4_t v427 = vaddq_f32(v426, v410); - float32x4_t v428 = vsubq_f32(v426, v410); - int16x4_t v447 = vqmovn_s32(vcvtq_n_s32_f32(v436, 15)); - int16x4_t v463 = vqmovn_s32(vcvtq_n_s32_f32(v435, 15)); - float32x4_t v431 = vaddq_f32(v427, v429); - float32x4_t v432 = vsubq_f32(v427, v429); - float32x4_t v433 = vaddq_f32(v428, v430); - float32x4_t v434 = vsubq_f32(v428, v430); - int16x4_t v473 = vqmovn_s32(vcvtq_n_s32_f32(v264, 15)); - int16x4_t v489 = vqmovn_s32(vcvtq_n_s32_f32(v320, 15)); - int16x4_t v507 = vqmovn_s32(vcvtq_n_s32_f32(v266, 15)); - int16x4_t v523 = vqmovn_s32(vcvtq_n_s32_f32(v322, 15)); - int16x4_t v541 = vqmovn_s32(vcvtq_n_s32_f32(v265, 15)); - int16x4_t v557 = vqmovn_s32(vcvtq_n_s32_f32(v321, 15)); - int16x4_t v575 = vqmovn_s32(vcvtq_n_s32_f32(v263, 15)); - int16x4_t v591 = vqmovn_s32(vcvtq_n_s32_f32(v319, 15)); - vst1_s16((int16_t *)v1294, v447); - vst1_s16((int16_t *)v1312, v463); - float32x4_t v469 = vaddq_f32(v376, v432); - float32x4_t v470 = vsubq_f32(v376, v432); - float32x4_t v503 = vaddq_f32(v378, v434); - float32x4_t v504 = vsubq_f32(v378, v434); - float32x4_t v537 = vaddq_f32(v377, v433); - float32x4_t v538 = vsubq_f32(v377, v433); - float32x4_t v571 = vaddq_f32(v375, v431); - float32x4_t v572 = vsubq_f32(v375, v431); - vst1_s16((int16_t *)v1321, v473); - vst1_s16((int16_t *)v1339, v489); - vst1_s16((int16_t *)v1357, v507); - vst1_s16((int16_t *)v1375, v523); - vst1_s16((int16_t *)v1393, v541); - 
vst1_s16((int16_t *)v1411, v557); - vst1_s16((int16_t *)v1429, v575); - vst1_s16((int16_t *)v1447, v591); - int16x4_t v481 = vqmovn_s32(vcvtq_n_s32_f32(v470, 15)); - int16x4_t v497 = vqmovn_s32(vcvtq_n_s32_f32(v469, 15)); - int16x4_t v515 = vqmovn_s32(vcvtq_n_s32_f32(v504, 15)); - int16x4_t v531 = vqmovn_s32(vcvtq_n_s32_f32(v503, 15)); - int16x4_t v549 = vqmovn_s32(vcvtq_n_s32_f32(v538, 15)); - int16x4_t v565 = vqmovn_s32(vcvtq_n_s32_f32(v537, 15)); - int16x4_t v583 = vqmovn_s32(vcvtq_n_s32_f32(v572, 15)); - int16x4_t v599 = vqmovn_s32(vcvtq_n_s32_f32(v571, 15)); - vst1_s16((int16_t *)v1330, v481); - vst1_s16((int16_t *)v1348, v497); - vst1_s16((int16_t *)v1366, v515); - vst1_s16((int16_t *)v1384, v531); - vst1_s16((int16_t *)v1402, v549); - vst1_s16((int16_t *)v1420, v565); - vst1_s16((int16_t *)v1438, v583); - vst1_s16((int16_t *)v1456, v599); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v605 * 2; j < howmany; j += 1) { - int16x4_t v751 = vld1s_s16(&v5[istride]); - float v883 = 1.5388417685876268e+00F; - float v890 = 5.8778525229247325e-01F; - float v897 = 3.6327126400268028e-01F; - float v921 = 1.0000000000000000e+00F; - float v922 = -1.0000000000000000e+00F; - float v928 = -1.2500000000000000e+00F; - float v929 = 1.2500000000000000e+00F; - float v935 = 5.5901699437494745e-01F; - float v936 = -5.5901699437494745e-01F; - float32x2_t v938 = (float32x2_t){v4, v4}; - float v943 = -1.5388417685876268e+00F; - float v947 = -5.8778525229247325e-01F; - float v951 = -3.6327126400268028e-01F; - int16x4_t v617 = vld1s_s16(&v5[0]); - float32x2_t v752 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v751)), 15); - float32x2_t v877 = (float32x2_t){v928, v928}; - float32x2_t v881 = (float32x2_t){v935, v935}; - float32x2_t v885 = (float32x2_t){v883, v943}; - float32x2_t v892 = (float32x2_t){v890, v947}; - float32x2_t v899 = (float32x2_t){v897, v951}; - float32x2_t v923 = (float32x2_t){v921, v922}; - float32x2_t v930 = (float32x2_t){v928, v929}; - float32x2_t v937 = 
(float32x2_t){v935, v936}; - float32x2_t v944 = (float32x2_t){v943, v943}; - float32x2_t v948 = (float32x2_t){v947, v947}; - float32x2_t v952 = (float32x2_t){v951, v951}; - float32x2_t v618 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v617)), 15); - int16x4_t v623 = vld1s_s16(&v5[istride * 10]); - int16x4_t v631 = vld1s_s16(&v5[istride * 5]); - int16x4_t v637 = vld1s_s16(&v5[istride * 15]); - int16x4_t v647 = vld1s_s16(&v5[istride * 4]); - int16x4_t v653 = vld1s_s16(&v5[istride * 14]); - int16x4_t v661 = vld1s_s16(&v5[istride * 9]); - int16x4_t v667 = vld1s_s16(&v5[istride * 19]); - int16x4_t v677 = vld1s_s16(&v5[istride * 8]); - int16x4_t v683 = vld1s_s16(&v5[istride * 18]); - int16x4_t v691 = vld1s_s16(&v5[istride * 13]); - int16x4_t v697 = vld1s_s16(&v5[istride * 3]); - int16x4_t v707 = vld1s_s16(&v5[istride * 12]); - int16x4_t v713 = vld1s_s16(&v5[istride * 2]); - int16x4_t v721 = vld1s_s16(&v5[istride * 17]); - int16x4_t v727 = vld1s_s16(&v5[istride * 7]); - int16x4_t v737 = vld1s_s16(&v5[istride * 16]); - int16x4_t v743 = vld1s_s16(&v5[istride * 6]); - int16x4_t v757 = vld1s_s16(&v5[istride * 11]); - float32x2_t v887 = vmul_f32(v938, v885); - float32x2_t v894 = vmul_f32(v938, v892); - float32x2_t v901 = vmul_f32(v938, v899); - float32x2_t v925 = vmul_f32(v938, v923); - float32x2_t v932 = vmul_f32(v938, v930); - float32x2_t v939 = vmul_f32(v938, v937); - float32x2_t v624 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v623)), 15); - float32x2_t v632 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v631)), 15); - float32x2_t v638 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v637)), 15); - float32x2_t v648 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v647)), 15); - float32x2_t v654 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v653)), 15); - float32x2_t v662 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v661)), 15); - float32x2_t v668 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v667)), 15); - float32x2_t v678 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v677)), 15); - float32x2_t v684 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v683)), 15); - float32x2_t v692 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v691)), 15); - float32x2_t v698 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v697)), 15); - float32x2_t v708 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v707)), 15); - float32x2_t v714 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v713)), 15); - float32x2_t v722 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v721)), 15); - float32x2_t v728 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v727)), 15); - float32x2_t v738 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v737)), 15); - float32x2_t v744 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v743)), 15); - float32x2_t v758 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v757)), 15); - float32x2_t v625 = vadd_f32(v618, v624); - float32x2_t v626 = vsub_f32(v618, v624); - float32x2_t v639 = vadd_f32(v632, v638); - float32x2_t v640 = vsub_f32(v632, v638); - float32x2_t v655 = vadd_f32(v648, v654); - float32x2_t v656 = vsub_f32(v648, v654); - float32x2_t v669 = vadd_f32(v662, v668); - float32x2_t v670 = vsub_f32(v662, v668); - float32x2_t v685 = vadd_f32(v678, v684); - float32x2_t v686 = vsub_f32(v678, v684); - float32x2_t v699 = vadd_f32(v692, v698); - float32x2_t v700 = vsub_f32(v692, v698); - float32x2_t v715 = vadd_f32(v708, v714); - float32x2_t v716 = vsub_f32(v708, v714); - float32x2_t v729 = vadd_f32(v722, v728); - float32x2_t v730 = vsub_f32(v722, v728); - float32x2_t v745 = vadd_f32(v738, v744); - float32x2_t v746 = vsub_f32(v738, v744); - float32x2_t v759 = vadd_f32(v752, v758); - float32x2_t v760 = vsub_f32(v752, v758); - float32x2_t v641 = vadd_f32(v625, v639); - float32x2_t v642 = vsub_f32(v625, v639); - float32x2_t v671 = vadd_f32(v655, v669); - float32x2_t v672 = vsub_f32(v655, v669); - float32x2_t v701 = vadd_f32(v685, v699); - float32x2_t v702 = vsub_f32(v685, v699); - float32x2_t v731 = vadd_f32(v715, v729); - float32x2_t v732 = vsub_f32(v715, v729); - float32x2_t v761 = vadd_f32(v745, v759); - float32x2_t v762 = vsub_f32(v745, v759); - 
float32x2_t v863 = vadd_f32(v656, v746); - float32x2_t v864 = vsub_f32(v656, v746); - float32x2_t v865 = vadd_f32(v716, v686); - float32x2_t v866 = vsub_f32(v716, v686); - float32x2_t v913 = vadd_f32(v670, v760); - float32x2_t v914 = vsub_f32(v670, v760); - float32x2_t v915 = vadd_f32(v730, v700); - float32x2_t v916 = vsub_f32(v730, v700); - float32x2_t v763 = vadd_f32(v671, v761); - float32x2_t v764 = vsub_f32(v671, v761); - float32x2_t v765 = vadd_f32(v731, v701); - float32x2_t v766 = vsub_f32(v731, v701); - float32x2_t v813 = vadd_f32(v672, v762); - float32x2_t v814 = vsub_f32(v672, v762); - float32x2_t v815 = vadd_f32(v732, v702); - float32x2_t v816 = vsub_f32(v732, v702); - float32x2_t v867 = vadd_f32(v863, v865); - float32x2_t v868 = vsub_f32(v863, v865); - float32x2_t v869 = vadd_f32(v864, v866); - float32x2_t v888 = vrev64_f32(v864); - float32x2_t v902 = vrev64_f32(v866); - float32x2_t v917 = vadd_f32(v913, v915); - float32x2_t v918 = vsub_f32(v913, v915); - float32x2_t v919 = vadd_f32(v914, v916); - float32x2_t v945 = vmul_f32(v914, v944); - float32x2_t v953 = vmul_f32(v916, v952); - float32x2_t v767 = vadd_f32(v763, v765); - float32x2_t v768 = vsub_f32(v763, v765); - float32x2_t v769 = vadd_f32(v764, v766); - float32x2_t v788 = vrev64_f32(v764); - float32x2_t v802 = vrev64_f32(v766); - float32x2_t v817 = vadd_f32(v813, v815); - float32x2_t v818 = vsub_f32(v813, v815); - float32x2_t v819 = vadd_f32(v814, v816); - float32x2_t v838 = vrev64_f32(v814); - float32x2_t v852 = vrev64_f32(v816); - float32x2_t v870 = vadd_f32(v867, v626); - float32x2_t v878 = vmul_f32(v867, v877); - float32x2_t v882 = vmul_f32(v868, v881); - float32x2_t v889 = vmul_f32(v888, v887); - float32x2_t v895 = vrev64_f32(v869); - float32x2_t v903 = vmul_f32(v902, v901); - float32x2_t v920 = vadd_f32(v917, v640); - float32x2_t v933 = vrev64_f32(v917); - float32x2_t v940 = vrev64_f32(v918); - float32x2_t v949 = vmul_f32(v919, v948); - float32x2_t v770 = vadd_f32(v767, v641); - float32x2_t 
v778 = vmul_f32(v767, v877); - float32x2_t v782 = vmul_f32(v768, v881); - float32x2_t v789 = vmul_f32(v788, v887); - float32x2_t v795 = vrev64_f32(v769); - float32x2_t v803 = vmul_f32(v802, v901); - float32x2_t v820 = vadd_f32(v817, v642); - float32x2_t v828 = vmul_f32(v817, v877); - float32x2_t v832 = vmul_f32(v818, v881); - float32x2_t v839 = vmul_f32(v838, v887); - float32x2_t v845 = vrev64_f32(v819); - float32x2_t v853 = vmul_f32(v852, v901); - float32x2_t v896 = vmul_f32(v895, v894); - float32x2_t v904 = vadd_f32(v870, v878); - float32x2_t v926 = vrev64_f32(v920); - float32x2_t v934 = vmul_f32(v933, v932); - float32x2_t v941 = vmul_f32(v940, v939); - float32x2_t v957 = vsub_f32(v945, v949); - float32x2_t v958 = vadd_f32(v949, v953); - float32x2_t v796 = vmul_f32(v795, v894); - float32x2_t v804 = vadd_f32(v770, v778); - float32x2_t v846 = vmul_f32(v845, v894); - float32x2_t v854 = vadd_f32(v820, v828); - float32x2_t v905 = vadd_f32(v904, v882); - float32x2_t v906 = vsub_f32(v904, v882); - float32x2_t v907 = vsub_f32(v889, v896); - float32x2_t v908 = vadd_f32(v896, v903); - float32x2_t v927 = vmul_f32(v926, v925); - int16x4_t v967 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v770, 15), (int32x2_t){0, 0})); - int16x4_t v979 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v820, 15), (int32x2_t){0, 0})); - float32x2_t v805 = vadd_f32(v804, v782); - float32x2_t v806 = vsub_f32(v804, v782); - float32x2_t v807 = vsub_f32(v789, v796); - float32x2_t v808 = vadd_f32(v796, v803); - float32x2_t v855 = vadd_f32(v854, v832); - float32x2_t v856 = vsub_f32(v854, v832); - float32x2_t v857 = vsub_f32(v839, v846); - float32x2_t v858 = vadd_f32(v846, v853); - float32x2_t v909 = vadd_f32(v905, v907); - float32x2_t v910 = vsub_f32(v905, v907); - float32x2_t v911 = vadd_f32(v906, v908); - float32x2_t v912 = vsub_f32(v906, v908); - float32x2_t v954 = vadd_f32(v927, v934); - float32x2_t v963 = vadd_f32(v870, v927); - float32x2_t v964 = vsub_f32(v870, v927); - v6[0] = 
vget_lane_s32(vreinterpret_s32_s16(v967), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v979), 0); - float32x2_t v809 = vadd_f32(v805, v807); - float32x2_t v810 = vsub_f32(v805, v807); - float32x2_t v811 = vadd_f32(v806, v808); - float32x2_t v812 = vsub_f32(v806, v808); - float32x2_t v859 = vadd_f32(v855, v857); - float32x2_t v860 = vsub_f32(v855, v857); - float32x2_t v861 = vadd_f32(v856, v858); - float32x2_t v862 = vsub_f32(v856, v858); - float32x2_t v955 = vadd_f32(v954, v941); - float32x2_t v956 = vsub_f32(v954, v941); - int16x4_t v973 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v964, 15), (int32x2_t){0, 0})); - int16x4_t v985 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v963, 15), (int32x2_t){0, 0})); - float32x2_t v959 = vadd_f32(v955, v957); - float32x2_t v960 = vsub_f32(v955, v957); - float32x2_t v961 = vadd_f32(v956, v958); - float32x2_t v962 = vsub_f32(v956, v958); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v973), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v985), 0); - int16x4_t v993 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v810, 15), (int32x2_t){0, 0})); - int16x4_t v1005 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v860, 15), (int32x2_t){0, 0})); - int16x4_t v1019 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v812, 15), (int32x2_t){0, 0})); - int16x4_t v1031 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v862, 15), (int32x2_t){0, 0})); - int16x4_t v1045 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v811, 15), (int32x2_t){0, 0})); - int16x4_t v1057 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v861, 15), (int32x2_t){0, 0})); - int16x4_t v1071 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v809, 15), (int32x2_t){0, 0})); - int16x4_t v1083 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v859, 15), (int32x2_t){0, 0})); - float32x2_t v989 = vadd_f32(v910, v960); - float32x2_t v990 = vsub_f32(v910, v960); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v993), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1005), 0); - 
float32x2_t v1015 = vadd_f32(v912, v962); - float32x2_t v1016 = vsub_f32(v912, v962); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1019), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1031), 0); - float32x2_t v1041 = vadd_f32(v911, v961); - float32x2_t v1042 = vsub_f32(v911, v961); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1045), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1057), 0); - float32x2_t v1067 = vadd_f32(v909, v959); - float32x2_t v1068 = vsub_f32(v909, v959); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1071), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1083), 0); - int16x4_t v999 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v990, 15), (int32x2_t){0, 0})); - int16x4_t v1011 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v989, 15), (int32x2_t){0, 0})); - int16x4_t v1025 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1016, 15), (int32x2_t){0, 0})); - int16x4_t v1037 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1015, 15), (int32x2_t){0, 0})); - int16x4_t v1051 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1042, 15), (int32x2_t){0, 0})); - int16x4_t v1063 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1041, 15), (int32x2_t){0, 0})); - int16x4_t v1077 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1068, 15), (int32x2_t){0, 0})); - int16x4_t v1089 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1067, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v999), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1011), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1025), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1037), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1051), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1063), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1077), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1089), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - 
-#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu20(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v328 = -1.2500000000000000e+00F; - float v333 = 5.5901699437494745e-01F; - float v376 = -1.0000000000000000e+00F; - float v383 = 1.2500000000000000e+00F; - float v390 = -5.5901699437494745e-01F; - float v397 = -1.5388417685876268e+00F; - float v402 = -5.8778525229247325e-01F; - float v407 = -3.6327126400268028e-01F; - const int32_t *v759 = &v5[v0]; - int32_t *v847 = &v6[v2]; - int64_t v27 = v0 * 10; - int64_t v37 = v0 * 5; - int64_t v45 = v0 * 15; - int64_t v57 = v0 * 4; - int64_t v65 = v0 * 14; - int64_t v75 = v0 * 9; - int64_t v83 = v0 * 19; - int64_t v95 = v0 * 8; - int64_t v103 = v0 * 18; - int64_t v113 = v0 * 13; - int64_t v121 = v0 * 3; - int64_t v133 = v0 * 12; - int64_t v141 = v0 * 2; - int64_t v151 = v0 * 17; - int64_t v159 = v0 * 7; - int64_t v171 = v0 * 16; - int64_t v179 = v0 * 6; - int64_t v197 = v0 * 11; - float v341 = v4 * v397; - float v348 = v4 * v402; - float v355 = v4 * v407; - float v379 = v4 * v376; - float v386 = v4 * v383; - float v393 = v4 * v390; - int64_t v431 = v2 * 5; - int64_t v439 = v2 * 10; - int64_t v447 = v2 * 15; - int64_t v457 = v2 * 16; - int64_t v473 = v2 * 6; - int64_t v481 = v2 * 11; - int64_t v491 = v2 * 12; - int64_t v499 = v2 * 17; - int64_t v507 = v2 * 2; - int64_t v515 = v2 * 7; - int64_t v525 = v2 * 8; - int64_t v533 = v2 * 13; - int64_t v541 = v2 * 18; - int64_t v549 = v2 * 3; - int64_t v559 = v2 * 4; - int64_t v567 = v2 * 9; - int64_t v575 = v2 * 14; - int64_t v583 = v2 * 19; - const 
int32_t *v597 = &v5[0]; - svfloat32_t v784 = svdup_n_f32(v328); - svfloat32_t v785 = svdup_n_f32(v333); - svfloat32_t v792 = svdup_n_f32(v397); - svfloat32_t v793 = svdup_n_f32(v402); - svfloat32_t v794 = svdup_n_f32(v407); - int32_t *v802 = &v6[0]; - svfloat32_t v195 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v759[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v606 = &v5[v27]; - const int32_t *v615 = &v5[v37]; - const int32_t *v624 = &v5[v45]; - const int32_t *v633 = &v5[v57]; - const int32_t *v642 = &v5[v65]; - const int32_t *v651 = &v5[v75]; - const int32_t *v660 = &v5[v83]; - const int32_t *v669 = &v5[v95]; - const int32_t *v678 = &v5[v103]; - const int32_t *v687 = &v5[v113]; - const int32_t *v696 = &v5[v121]; - const int32_t *v705 = &v5[v133]; - const int32_t *v714 = &v5[v141]; - const int32_t *v723 = &v5[v151]; - const int32_t *v732 = &v5[v159]; - const int32_t *v741 = &v5[v171]; - const int32_t *v750 = &v5[v179]; - const int32_t *v768 = &v5[v197]; - svfloat32_t v786 = svdup_n_f32(v341); - svfloat32_t v787 = svdup_n_f32(v348); - svfloat32_t v788 = svdup_n_f32(v355); - svfloat32_t v789 = svdup_n_f32(v379); - svfloat32_t v790 = svdup_n_f32(v386); - svfloat32_t v791 = svdup_n_f32(v393); - int32_t *v811 = &v6[v431]; - int32_t *v820 = &v6[v439]; - int32_t *v829 = &v6[v447]; - int32_t *v838 = &v6[v457]; - int32_t *v856 = &v6[v473]; - int32_t *v865 = &v6[v481]; - int32_t *v874 = &v6[v491]; - int32_t *v883 = &v6[v499]; - int32_t *v892 = &v6[v507]; - int32_t *v901 = &v6[v515]; - int32_t *v910 = &v6[v525]; - int32_t *v919 = &v6[v533]; - int32_t *v928 = &v6[v541]; - int32_t *v937 = &v6[v549]; - int32_t *v946 = &v6[v559]; - int32_t *v955 = &v6[v567]; - int32_t *v964 = &v6[v575]; - int32_t *v973 = &v6[v583]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v597[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - 
svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v606[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v615[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v624[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v63 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v633[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v71 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v642[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v81 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v651[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v89 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v660[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v101 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v669[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v109 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v678[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v119 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v687[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v127 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v696[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v139 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v705[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v147 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v714[0])), - 1.F / (1ULL << 15ULL)); - 
svfloat32_t v157 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v723[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v165 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v732[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v177 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v741[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v185 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v750[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v203 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v768[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v72; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v72) : "w"(v63), "w"(v71)); - svfloat32_t v73; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v73) : "w"(v63), "w"(v71)); - svfloat32_t v90; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v90) : "w"(v81), "w"(v89)); - svfloat32_t v91; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v91) : "w"(v81), "w"(v89)); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v101), "w"(v109)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v101), "w"(v109)); - svfloat32_t v128; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v128) : "w"(v119), "w"(v127)); - svfloat32_t v129; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v129) : "w"(v119), "w"(v127)); - svfloat32_t v148; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v148) : "w"(v139), "w"(v147)); - svfloat32_t v149; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v139), "w"(v147)); - 
svfloat32_t v166; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v166) : "w"(v157), "w"(v165)); - svfloat32_t v167; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v167) : "w"(v157), "w"(v165)); - svfloat32_t v186; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v186) : "w"(v177), "w"(v185)); - svfloat32_t v187; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v177), "w"(v185)); - svfloat32_t v204; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v204) : "w"(v195), "w"(v203)); - svfloat32_t v205; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v205) : "w"(v195), "w"(v203)); - svfloat32_t v54; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v54) : "w"(v34), "w"(v52)); - svfloat32_t v55; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v55) : "w"(v34), "w"(v52)); - svfloat32_t v92; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v92) : "w"(v72), "w"(v90)); - svfloat32_t v93; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v93) : "w"(v72), "w"(v90)); - svfloat32_t v130; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v110), "w"(v128)); - svfloat32_t v131; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v131) : "w"(v110), "w"(v128)); - svfloat32_t v168; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v168) : "w"(v148), "w"(v166)); - svfloat32_t v169; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v148), "w"(v166)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v186), "w"(v204)); - svfloat32_t v207; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v186), "w"(v204)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v73), "w"(v187)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v73), "w"(v187)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v149), "w"(v111)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v149), "w"(v111)); - svfloat32_t v367; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v91), "w"(v205)); - svfloat32_t v368; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v368) : "w"(v91), "w"(v205)); - svfloat32_t v369; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v369) : "w"(v167), 
"w"(v129)); - svfloat32_t v370; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v370) : "w"(v167), "w"(v129)); - svfloat32_t v208; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v92), "w"(v206)); - svfloat32_t v209; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v92), "w"(v206)); - svfloat32_t v210; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v168), "w"(v130)); - svfloat32_t v211; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v168), "w"(v130)); - svfloat32_t v261; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v261) : "w"(v93), "w"(v207)); - svfloat32_t v262; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v262) : "w"(v93), "w"(v207)); - svfloat32_t v263; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v263) : "w"(v169), "w"(v131)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v169), "w"(v131)); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v314), "w"(v316)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v314), "w"(v316)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v315), "w"(v317)); - svfloat32_t zero343; - asm volatile("mov %0.s, #0" : "=w"(zero343)); - svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v786, v315, 90); - svfloat32_t v371; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v371) : "w"(v367), "w"(v369)); - svfloat32_t v372; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v367), "w"(v369)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v368), "w"(v370)); - svfloat32_t v410; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v370), "w"(v794)); - svfloat32_t v212; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v208), "w"(v210)); - svfloat32_t v213; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v208), "w"(v210)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v209), "w"(v211)); - svfloat32_t zero237; - asm volatile("mov %0.s, #0" : "=w"(zero237)); - svfloat32_t v237 = svcmla_f32_x(pred_full, zero237, v786, v209, 90); - svfloat32_t v265; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v265) : "w"(v261), "w"(v263)); - svfloat32_t v266; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v266) : "w"(v261), "w"(v263)); - svfloat32_t v267; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v267) : "w"(v262), "w"(v264)); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); - svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v786, v262, 90); - svfloat32_t v321; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v318), "w"(v35)); - svfloat32_t zero350; - asm volatile("mov %0.s, #0" : "=w"(zero350)); - svfloat32_t v350 = svcmla_f32_x(pred_full, zero350, v787, v320, 90); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v371), "w"(v53)); - svfloat32_t zero395; - asm volatile("mov %0.s, #0" : "=w"(zero395)); - svfloat32_t v395 = svcmla_f32_x(pred_full, zero395, v791, v372, 90); - svfloat32_t v405; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v373), "w"(v793)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v212), "w"(v54)); - svfloat32_t zero244; - asm volatile("mov %0.s, #0" : "=w"(zero244)); - svfloat32_t v244 = svcmla_f32_x(pred_full, zero244, v787, v214, 90); - svfloat32_t v268; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v268) : "w"(v265), "w"(v55)); - svfloat32_t zero297; - asm volatile("mov %0.s, #0" : "=w"(zero297)); - svfloat32_t v297 = svcmla_f32_x(pred_full, zero297, v787, v267, 90); - svfloat32_t v358 = svmla_f32_x(pred_full, v321, v318, v784); - svfloat32_t v361; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v361) : "w"(v343), "w"(v350)); - svfloat32_t v362 = svcmla_f32_x(pred_full, v350, v788, v317, 90); - svfloat32_t zero381; - asm volatile("mov %0.s, #0" : "=w"(zero381)); - svfloat32_t v381 = svcmla_f32_x(pred_full, zero381, v789, v374, 90); - svfloat32_t v414 = svnmls_f32_x(pred_full, v405, v368, v792); - svfloat32_t v415 = svmla_f32_x(pred_full, v410, v373, v793); - svfloat32_t v252 = svmla_f32_x(pred_full, v215, v212, v784); - svfloat32_t v255; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v255) : 
"w"(v237), "w"(v244)); - svfloat32_t v256 = svcmla_f32_x(pred_full, v244, v788, v211, 90); - svfloat32_t v305 = svmla_f32_x(pred_full, v268, v265, v784); - svfloat32_t v308; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v290), "w"(v297)); - svfloat32_t v309 = svcmla_f32_x(pred_full, v297, v788, v264, 90); - svfloat32_t v359 = svmla_f32_x(pred_full, v358, v319, v785); - svfloat32_t v360 = svmls_f32_x(pred_full, v358, v319, v785); - svfloat32_t v411 = svcmla_f32_x(pred_full, v381, v790, v371, 90); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v321), "w"(v381)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v321), "w"(v381)); - svint16_t v424 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v215, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v440 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v268, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v253 = svmla_f32_x(pred_full, v252, v213, v785); - svfloat32_t v254 = svmls_f32_x(pred_full, v252, v213, v785); - svfloat32_t v306 = svmla_f32_x(pred_full, v305, v266, v785); - svfloat32_t v307 = svmls_f32_x(pred_full, v305, v266, v785); - svfloat32_t v363; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v363) : "w"(v359), "w"(v361)); - svfloat32_t v364; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v359), "w"(v361)); - svfloat32_t v365; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v360), "w"(v362)); - svfloat32_t v366; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v366) : "w"(v360), "w"(v362)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v411), "w"(v395)); - svfloat32_t v413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v411), "w"(v395)); - svint16_t v432 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - 
pred_full, svmul_n_f32_x(pred_full, v421, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v448 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v420, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v802), svreinterpret_u64_s16(v424)); - svst1w_u64(pred_full, (unsigned *)(v820), svreinterpret_u64_s16(v440)); - svfloat32_t v257; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v253), "w"(v255)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v253), "w"(v255)); - svfloat32_t v259; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v254), "w"(v256)); - svfloat32_t v260; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v260) : "w"(v254), "w"(v256)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v306), "w"(v308)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v306), "w"(v308)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v307), "w"(v309)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v313) : "w"(v307), "w"(v309)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v412), "w"(v414)); - svfloat32_t v417; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v412), "w"(v414)); - svfloat32_t v418; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v413), "w"(v415)); - svfloat32_t v419; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v413), "w"(v415)); - svst1w_u64(pred_full, (unsigned *)(v811), svreinterpret_u64_s16(v432)); - svst1w_u64(pred_full, (unsigned *)(v829), svreinterpret_u64_s16(v448)); - svfloat32_t v454; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v364), "w"(v417)); - svfloat32_t v455; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v364), "w"(v417)); - svint16_t v458 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, 
svmul_n_f32_x(pred_full, v258, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v474 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v311, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v488; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v366), "w"(v419)); - svfloat32_t v489; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v489) : "w"(v366), "w"(v419)); - svint16_t v492 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v260, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v508 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v522; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v522) : "w"(v365), "w"(v418)); - svfloat32_t v523; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v365), "w"(v418)); - svint16_t v526 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v259, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v542 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v312, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v556; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v363), "w"(v416)); - svfloat32_t v557; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v363), "w"(v416)); - svint16_t v560 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v257, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v576 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v310, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v466 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v455, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v482 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v454, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v500 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v489, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v516 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v488, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v534 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v523, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v550 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v522, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v568 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v557, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v584 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, 
svmul_n_f32_x(pred_full, v556, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v838), svreinterpret_u64_s16(v458)); - svst1w_u64(pred_full, (unsigned *)(v856), svreinterpret_u64_s16(v474)); - svst1w_u64(pred_full, (unsigned *)(v874), svreinterpret_u64_s16(v492)); - svst1w_u64(pred_full, (unsigned *)(v892), svreinterpret_u64_s16(v508)); - svst1w_u64(pred_full, (unsigned *)(v910), svreinterpret_u64_s16(v526)); - svst1w_u64(pred_full, (unsigned *)(v928), svreinterpret_u64_s16(v542)); - svst1w_u64(pred_full, (unsigned *)(v946), svreinterpret_u64_s16(v560)); - svst1w_u64(pred_full, (unsigned *)(v964), svreinterpret_u64_s16(v576)); - svst1w_u64(pred_full, (unsigned *)(v847), svreinterpret_u64_s16(v466)); - svst1w_u64(pred_full, (unsigned *)(v865), svreinterpret_u64_s16(v482)); - svst1w_u64(pred_full, (unsigned *)(v883), svreinterpret_u64_s16(v500)); - svst1w_u64(pred_full, (unsigned *)(v901), svreinterpret_u64_s16(v516)); - svst1w_u64(pred_full, (unsigned *)(v919), svreinterpret_u64_s16(v534)); - svst1w_u64(pred_full, (unsigned *)(v937), svreinterpret_u64_s16(v550)); - svst1w_u64(pred_full, (unsigned *)(v955), svreinterpret_u64_s16(v568)); - svst1w_u64(pred_full, (unsigned *)(v973), svreinterpret_u64_s16(v584)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v681 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v233 = -1.1666666666666665e+00F; - float v238 = 7.9015646852540022e-01F; - float v243 = 5.5854267289647742e-02F; - float v248 = 7.3430220123575241e-01F; - float v252 = 4.4095855184409838e-01F; - float v253 = -4.4095855184409838e-01F; 
- float v260 = 3.4087293062393137e-01F; - float v261 = -3.4087293062393137e-01F; - float v268 = -5.3396936033772524e-01F; - float v269 = 5.3396936033772524e-01F; - float v276 = 8.7484229096165667e-01F; - float v277 = -8.7484229096165667e-01F; - float v321 = -1.4999999999999998e+00F; - float v326 = 1.7499999999999996e+00F; - float v331 = -1.1852347027881001e+00F; - float v336 = -8.3781400934471603e-02F; - float v341 = -1.1014533018536286e+00F; - float v345 = -6.6143782776614746e-01F; - float v346 = 6.6143782776614746e-01F; - float v353 = -5.1130939593589697e-01F; - float v354 = 5.1130939593589697e-01F; - float v361 = 8.0095404050658769e-01F; - float v362 = -8.0095404050658769e-01F; - float v369 = -1.3122634364424848e+00F; - float v370 = 1.3122634364424848e+00F; - float v413 = 8.6602540378443871e-01F; - float v414 = -8.6602540378443871e-01F; - float v421 = -1.0103629710818451e+00F; - float v422 = 1.0103629710818451e+00F; - float v429 = 6.8429557470759583e-01F; - float v430 = -6.8429557470759583e-01F; - float v437 = 4.8371214382601155e-02F; - float v438 = -4.8371214382601155e-02F; - float v445 = 6.3592436032499466e-01F; - float v446 = -6.3592436032499466e-01F; - float32x2_t v448 = (float32x2_t){v4, v4}; - float v454 = -3.8188130791298663e-01F; - float v459 = -2.9520461738277515e-01F; - float v464 = 4.6243103089499693e-01F; - float v469 = -7.5763564827777208e-01F; - const int32_t *v1384 = &v5[istride]; - int32_t *v1475 = &v6[ostride]; - float32x2_t v234 = (float32x2_t){v233, v233}; - float32x2_t v239 = (float32x2_t){v238, v238}; - float32x2_t v244 = (float32x2_t){v243, v243}; - float32x2_t v249 = (float32x2_t){v248, v248}; - float32x2_t v254 = (float32x2_t){v252, v253}; - float32x2_t v262 = (float32x2_t){v260, v261}; - float32x2_t v270 = (float32x2_t){v268, v269}; - float32x2_t v278 = (float32x2_t){v276, v277}; - float32x2_t v322 = (float32x2_t){v321, v321}; - float32x2_t v327 = (float32x2_t){v326, v326}; - float32x2_t v332 = (float32x2_t){v331, v331}; - float32x2_t 
v337 = (float32x2_t){v336, v336}; - float32x2_t v342 = (float32x2_t){v341, v341}; - float32x2_t v347 = (float32x2_t){v345, v346}; - float32x2_t v355 = (float32x2_t){v353, v354}; - float32x2_t v363 = (float32x2_t){v361, v362}; - float32x2_t v371 = (float32x2_t){v369, v370}; - float32x2_t v415 = (float32x2_t){v413, v414}; - float32x2_t v423 = (float32x2_t){v421, v422}; - float32x2_t v431 = (float32x2_t){v429, v430}; - float32x2_t v439 = (float32x2_t){v437, v438}; - float32x2_t v447 = (float32x2_t){v445, v446}; - float32x2_t v455 = (float32x2_t){v454, v454}; - float32x2_t v460 = (float32x2_t){v459, v459}; - float32x2_t v465 = (float32x2_t){v464, v464}; - float32x2_t v470 = (float32x2_t){v469, v469}; - const int32_t *v1267 = &v5[0]; - int32_t *v1439 = &v6[0]; - int16x4_t v1653 = vld1_s16((const int16_t *)v1384); - float32x4_t v163 = vcvtq_n_f32_s32(vmovl_s16(v1653), 15); - float32x4_t v235 = vcombine_f32(v234, v234); - float32x4_t v240 = vcombine_f32(v239, v239); - float32x4_t v245 = vcombine_f32(v244, v244); - float32x4_t v250 = vcombine_f32(v249, v249); - float32x2_t v256 = vmul_f32(v448, v254); - float32x2_t v264 = vmul_f32(v448, v262); - float32x2_t v272 = vmul_f32(v448, v270); - float32x2_t v280 = vmul_f32(v448, v278); - float32x4_t v323 = vcombine_f32(v322, v322); - float32x4_t v328 = vcombine_f32(v327, v327); - float32x4_t v333 = vcombine_f32(v332, v332); - float32x4_t v338 = vcombine_f32(v337, v337); - float32x4_t v343 = vcombine_f32(v342, v342); - float32x2_t v349 = vmul_f32(v448, v347); - float32x2_t v357 = vmul_f32(v448, v355); - float32x2_t v365 = vmul_f32(v448, v363); - float32x2_t v373 = vmul_f32(v448, v371); - float32x2_t v417 = vmul_f32(v448, v415); - float32x2_t v425 = vmul_f32(v448, v423); - float32x2_t v433 = vmul_f32(v448, v431); - float32x2_t v441 = vmul_f32(v448, v439); - float32x2_t v449 = vmul_f32(v448, v447); - float32x4_t v456 = vcombine_f32(v455, v455); - float32x4_t v461 = vcombine_f32(v460, v460); - float32x4_t v466 = vcombine_f32(v465, 
v465); - float32x4_t v471 = vcombine_f32(v470, v470); - const int32_t *v1248 = &v5[istride * 7]; - const int32_t *v1257 = &v5[istride * 14]; - const int32_t *v1276 = &v5[istride * 10]; - const int32_t *v1285 = &v5[istride * 17]; - const int32_t *v1294 = &v5[istride * 3]; - const int32_t *v1303 = &v5[istride * 13]; - const int32_t *v1312 = &v5[istride * 20]; - const int32_t *v1321 = &v5[istride * 6]; - const int32_t *v1330 = &v5[istride * 16]; - const int32_t *v1339 = &v5[istride * 2]; - const int32_t *v1348 = &v5[istride * 9]; - const int32_t *v1357 = &v5[istride * 19]; - const int32_t *v1366 = &v5[istride * 5]; - const int32_t *v1375 = &v5[istride * 12]; - const int32_t *v1393 = &v5[istride * 8]; - const int32_t *v1402 = &v5[istride * 15]; - const int32_t *v1411 = &v5[istride * 4]; - const int32_t *v1420 = &v5[istride * 11]; - const int32_t *v1429 = &v5[istride * 18]; - int32_t *v1448 = &v6[ostride * 7]; - int32_t *v1457 = &v6[ostride * 14]; - int32_t *v1466 = &v6[ostride * 15]; - int32_t *v1484 = &v6[ostride * 8]; - int32_t *v1493 = &v6[ostride * 9]; - int32_t *v1502 = &v6[ostride * 16]; - int32_t *v1511 = &v6[ostride * 2]; - int32_t *v1520 = &v6[ostride * 3]; - int32_t *v1529 = &v6[ostride * 10]; - int32_t *v1538 = &v6[ostride * 17]; - int32_t *v1547 = &v6[ostride * 18]; - int32_t *v1556 = &v6[ostride * 4]; - int32_t *v1565 = &v6[ostride * 11]; - int32_t *v1574 = &v6[ostride * 12]; - int32_t *v1583 = &v6[ostride * 19]; - int32_t *v1592 = &v6[ostride * 5]; - int32_t *v1601 = &v6[ostride * 6]; - int32_t *v1610 = &v6[ostride * 13]; - int32_t *v1619 = &v6[ostride * 20]; - int16x4_t v1627 = vld1_s16((const int16_t *)v1267); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1627), 15); - float32x4_t v258 = vcombine_f32(v256, v256); - float32x4_t v266 = vcombine_f32(v264, v264); - float32x4_t v274 = vcombine_f32(v272, v272); - float32x4_t v282 = vcombine_f32(v280, v280); - float32x4_t v351 = vcombine_f32(v349, v349); - float32x4_t v359 = vcombine_f32(v357, v357); - 
float32x4_t v367 = vcombine_f32(v365, v365); - float32x4_t v375 = vcombine_f32(v373, v373); - float32x4_t v419 = vcombine_f32(v417, v417); - float32x4_t v427 = vcombine_f32(v425, v425); - float32x4_t v435 = vcombine_f32(v433, v433); - float32x4_t v443 = vcombine_f32(v441, v441); - float32x4_t v451 = vcombine_f32(v449, v449); - int16x4_t v1623 = vld1_s16((const int16_t *)v1248); - int16x4_t v1625 = vld1_s16((const int16_t *)v1257); - int16x4_t v1629 = vld1_s16((const int16_t *)v1276); - int16x4_t v1631 = vld1_s16((const int16_t *)v1285); - int16x4_t v1633 = vld1_s16((const int16_t *)v1294); - int16x4_t v1635 = vld1_s16((const int16_t *)v1303); - int16x4_t v1637 = vld1_s16((const int16_t *)v1312); - int16x4_t v1639 = vld1_s16((const int16_t *)v1321); - int16x4_t v1641 = vld1_s16((const int16_t *)v1330); - int16x4_t v1643 = vld1_s16((const int16_t *)v1339); - int16x4_t v1645 = vld1_s16((const int16_t *)v1348); - int16x4_t v1647 = vld1_s16((const int16_t *)v1357); - int16x4_t v1649 = vld1_s16((const int16_t *)v1366); - int16x4_t v1651 = vld1_s16((const int16_t *)v1375); - int16x4_t v1655 = vld1_s16((const int16_t *)v1393); - int16x4_t v1657 = vld1_s16((const int16_t *)v1402); - int16x4_t v1659 = vld1_s16((const int16_t *)v1411); - int16x4_t v1661 = vld1_s16((const int16_t *)v1420); - int16x4_t v1663 = vld1_s16((const int16_t *)v1429); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1623), 15); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1625), 15); - float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1629), 15); - float32x4_t v63 = vcvtq_n_f32_s32(vmovl_s16(v1631), 15); - float32x4_t v73 = vcvtq_n_f32_s32(vmovl_s16(v1633), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1635), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1637), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1639), 15); - float32x4_t v109 = vcvtq_n_f32_s32(vmovl_s16(v1641), 15); - float32x4_t v117 = vcvtq_n_f32_s32(vmovl_s16(v1643), 15); - float32x4_t v127 = 
vcvtq_n_f32_s32(vmovl_s16(v1645), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1647), 15); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1649), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1651), 15); - float32x4_t v171 = vcvtq_n_f32_s32(vmovl_s16(v1655), 15); - float32x4_t v181 = vcvtq_n_f32_s32(vmovl_s16(v1657), 15); - float32x4_t v190 = vcvtq_n_f32_s32(vmovl_s16(v1659), 15); - float32x4_t v198 = vcvtq_n_f32_s32(vmovl_s16(v1661), 15); - float32x4_t v208 = vcvtq_n_f32_s32(vmovl_s16(v1663), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v64 = vaddq_f32(v55, v63); - float32x4_t v65 = vsubq_f32(v55, v63); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v118 = vaddq_f32(v109, v117); - float32x4_t v119 = vsubq_f32(v109, v117); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v172 = vaddq_f32(v163, v171); - float32x4_t v173 = vsubq_f32(v163, v171); - float32x4_t v199 = vaddq_f32(v190, v198); - float32x4_t v200 = vsubq_f32(v190, v198); - float32x4_t v47 = vaddq_f32(v37, v46); - float32x4_t v74 = vaddq_f32(v64, v73); - float32x4_t v101 = vaddq_f32(v91, v100); - float32x4_t v128 = vaddq_f32(v118, v127); - float32x4_t v155 = vaddq_f32(v145, v154); - float32x4_t v182 = vaddq_f32(v172, v181); - float32x4_t v209 = vaddq_f32(v199, v208); - float32x4_t v303 = vaddq_f32(v64, v199); - float32x4_t v304 = vsubq_f32(v64, v199); - float32x4_t v305 = vaddq_f32(v145, v118); - float32x4_t v306 = vsubq_f32(v145, v118); - float32x4_t v307 = vaddq_f32(v91, v172); - float32x4_t v308 = vsubq_f32(v91, v172); - float32x4_t v396 = vaddq_f32(v65, v200); - float32x4_t v397 = vsubq_f32(v65, v200); - float32x4_t v398 = vaddq_f32(v146, v119); - float32x4_t v399 = vsubq_f32(v146, v119); - float32x4_t v400 = vaddq_f32(v92, v173); - float32x4_t v401 = vsubq_f32(v92, v173); - float32x4_t v210 = vaddq_f32(v74, v209); - float32x4_t v211 
= vsubq_f32(v74, v209); - float32x4_t v212 = vaddq_f32(v155, v128); - float32x4_t v213 = vsubq_f32(v155, v128); - float32x4_t v214 = vaddq_f32(v101, v182); - float32x4_t v215 = vsubq_f32(v101, v182); - float32x4_t v309 = vaddq_f32(v303, v305); - float32x4_t v312 = vsubq_f32(v303, v305); - float32x4_t v313 = vsubq_f32(v305, v307); - float32x4_t v314 = vsubq_f32(v307, v303); - float32x4_t v315 = vaddq_f32(v304, v306); - float32x4_t v317 = vsubq_f32(v304, v306); - float32x4_t v318 = vsubq_f32(v306, v308); - float32x4_t v319 = vsubq_f32(v308, v304); - float32x4_t v402 = vaddq_f32(v396, v398); - float32x4_t v405 = vsubq_f32(v396, v398); - float32x4_t v406 = vsubq_f32(v398, v400); - float32x4_t v407 = vsubq_f32(v400, v396); - float32x4_t v408 = vaddq_f32(v397, v399); - float32x4_t v410 = vsubq_f32(v397, v399); - float32x4_t v411 = vsubq_f32(v399, v401); - float32x4_t v412 = vsubq_f32(v401, v397); - float32x4_t v216 = vaddq_f32(v210, v212); - float32x4_t v219 = vsubq_f32(v210, v212); - float32x4_t v220 = vsubq_f32(v212, v214); - float32x4_t v221 = vsubq_f32(v214, v210); - float32x4_t v222 = vaddq_f32(v211, v213); - float32x4_t v224 = vsubq_f32(v211, v213); - float32x4_t v225 = vsubq_f32(v213, v215); - float32x4_t v226 = vsubq_f32(v215, v211); - float32x4_t v310 = vaddq_f32(v309, v307); - float32x4_t v316 = vaddq_f32(v315, v308); - float32x4_t v334 = vmulq_f32(v312, v333); - float32x4_t v339 = vmulq_f32(v313, v338); - float32x4_t v344 = vmulq_f32(v314, v343); - float32x4_t v358 = vrev64q_f32(v317); - float32x4_t v366 = vrev64q_f32(v318); - float32x4_t v374 = vrev64q_f32(v319); - float32x4_t v403 = vaddq_f32(v402, v400); - float32x4_t v409 = vaddq_f32(v408, v401); - float32x4_t v434 = vrev64q_f32(v405); - float32x4_t v442 = vrev64q_f32(v406); - float32x4_t v450 = vrev64q_f32(v407); - float32x4_t v462 = vmulq_f32(v410, v461); - float32x4_t v467 = vmulq_f32(v411, v466); - float32x4_t v472 = vmulq_f32(v412, v471); - float32x4_t v217 = vaddq_f32(v216, v214); - float32x4_t v223 
= vaddq_f32(v222, v215); - float32x4_t v241 = vmulq_f32(v219, v240); - float32x4_t v246 = vmulq_f32(v220, v245); - float32x4_t v251 = vmulq_f32(v221, v250); - float32x4_t v265 = vrev64q_f32(v224); - float32x4_t v273 = vrev64q_f32(v225); - float32x4_t v281 = vrev64q_f32(v226); - float32x4_t v311 = vaddq_f32(v310, v37); - float32x4_t v329 = vmulq_f32(v310, v328); - float32x4_t v350 = vrev64q_f32(v316); - float32x4_t v360 = vmulq_f32(v358, v359); - float32x4_t v368 = vmulq_f32(v366, v367); - float32x4_t v376 = vmulq_f32(v374, v375); - float32x4_t v404 = vaddq_f32(v403, v38); - float32x4_t v426 = vrev64q_f32(v403); - float32x4_t v436 = vmulq_f32(v434, v435); - float32x4_t v444 = vmulq_f32(v442, v443); - float32x4_t v452 = vmulq_f32(v450, v451); - float32x4_t v457 = vmulq_f32(v409, v456); - float32x4_t v218 = vaddq_f32(v217, v47); - float32x4_t v236 = vmulq_f32(v217, v235); - float32x4_t v257 = vrev64q_f32(v223); - float32x4_t v267 = vmulq_f32(v265, v266); - float32x4_t v275 = vmulq_f32(v273, v274); - float32x4_t v283 = vmulq_f32(v281, v282); - float32x4_t v324 = vmulq_f32(v311, v323); - float32x4_t v352 = vmulq_f32(v350, v351); - float32x4_t v418 = vrev64q_f32(v404); - float32x4_t v428 = vmulq_f32(v426, v427); - float32x4_t v480 = vaddq_f32(v457, v462); - float32x4_t v482 = vsubq_f32(v457, v462); - float32x4_t v484 = vsubq_f32(v457, v467); - float32x4_t v259 = vmulq_f32(v257, v258); - float32x4_t v284 = vaddq_f32(v218, v236); - float32x4_t v377 = vaddq_f32(v324, v329); - float32x4_t v384 = vaddq_f32(v352, v360); - float32x4_t v386 = vsubq_f32(v352, v360); - float32x4_t v388 = vsubq_f32(v352, v368); - float32x4_t v420 = vmulq_f32(v418, v419); - float32x4_t v481 = vaddq_f32(v480, v467); - float32x4_t v483 = vsubq_f32(v482, v472); - float32x4_t v485 = vaddq_f32(v484, v472); - float32x4_t v492 = vaddq_f32(v218, v324); - int16x4_t v497 = vqmovn_s32(vcvtq_n_s32_f32(v218, 15)); - float32x4_t v285 = vaddq_f32(v284, v241); - float32x4_t v287 = vsubq_f32(v284, v241); - 
float32x4_t v289 = vsubq_f32(v284, v246); - float32x4_t v291 = vaddq_f32(v259, v267); - float32x4_t v293 = vsubq_f32(v259, v267); - float32x4_t v295 = vsubq_f32(v259, v275); - float32x4_t v378 = vaddq_f32(v377, v334); - float32x4_t v380 = vsubq_f32(v377, v334); - float32x4_t v382 = vsubq_f32(v377, v339); - float32x4_t v385 = vaddq_f32(v384, v368); - float32x4_t v387 = vsubq_f32(v386, v376); - float32x4_t v389 = vaddq_f32(v388, v376); - float32x4_t v473 = vaddq_f32(v420, v428); - float32x4_t v493 = vaddq_f32(v492, v420); - float32x4_t v494 = vsubq_f32(v492, v420); - vst1_s16((int16_t *)v1439, v497); - float32x4_t v286 = vaddq_f32(v285, v246); - float32x4_t v288 = vsubq_f32(v287, v251); - float32x4_t v290 = vaddq_f32(v289, v251); - float32x4_t v292 = vaddq_f32(v291, v275); - float32x4_t v294 = vsubq_f32(v293, v283); - float32x4_t v296 = vaddq_f32(v295, v283); - float32x4_t v379 = vaddq_f32(v378, v339); - float32x4_t v381 = vsubq_f32(v380, v344); - float32x4_t v383 = vaddq_f32(v382, v344); - float32x4_t v474 = vaddq_f32(v473, v436); - float32x4_t v476 = vsubq_f32(v473, v436); - float32x4_t v478 = vsubq_f32(v473, v444); - int16x4_t v505 = vqmovn_s32(vcvtq_n_s32_f32(v494, 15)); - int16x4_t v513 = vqmovn_s32(vcvtq_n_s32_f32(v493, 15)); - float32x4_t v297 = vaddq_f32(v286, v292); - float32x4_t v298 = vsubq_f32(v286, v292); - float32x4_t v299 = vaddq_f32(v288, v294); - float32x4_t v300 = vsubq_f32(v288, v294); - float32x4_t v301 = vaddq_f32(v290, v296); - float32x4_t v302 = vsubq_f32(v290, v296); - float32x4_t v390 = vaddq_f32(v379, v385); - float32x4_t v391 = vsubq_f32(v379, v385); - float32x4_t v392 = vaddq_f32(v381, v387); - float32x4_t v393 = vsubq_f32(v381, v387); - float32x4_t v394 = vaddq_f32(v383, v389); - float32x4_t v395 = vsubq_f32(v383, v389); - float32x4_t v475 = vaddq_f32(v474, v444); - float32x4_t v477 = vsubq_f32(v476, v452); - float32x4_t v479 = vaddq_f32(v478, v452); - vst1_s16((int16_t *)v1448, v505); - vst1_s16((int16_t *)v1457, v513); - float32x4_t 
v486 = vaddq_f32(v475, v481); - float32x4_t v487 = vsubq_f32(v475, v481); - float32x4_t v488 = vaddq_f32(v477, v483); - float32x4_t v489 = vsubq_f32(v477, v483); - float32x4_t v490 = vaddq_f32(v479, v485); - float32x4_t v491 = vsubq_f32(v479, v485); - float32x4_t v519 = vaddq_f32(v298, v391); - int16x4_t v524 = vqmovn_s32(vcvtq_n_s32_f32(v298, 15)); - float32x4_t v546 = vaddq_f32(v300, v393); - int16x4_t v551 = vqmovn_s32(vcvtq_n_s32_f32(v300, 15)); - float32x4_t v573 = vaddq_f32(v301, v394); - int16x4_t v578 = vqmovn_s32(vcvtq_n_s32_f32(v301, 15)); - float32x4_t v600 = vaddq_f32(v302, v395); - int16x4_t v605 = vqmovn_s32(vcvtq_n_s32_f32(v302, 15)); - float32x4_t v627 = vaddq_f32(v299, v392); - int16x4_t v632 = vqmovn_s32(vcvtq_n_s32_f32(v299, 15)); - float32x4_t v654 = vaddq_f32(v297, v390); - int16x4_t v659 = vqmovn_s32(vcvtq_n_s32_f32(v297, 15)); - float32x4_t v520 = vaddq_f32(v519, v487); - float32x4_t v521 = vsubq_f32(v519, v487); - float32x4_t v547 = vaddq_f32(v546, v489); - float32x4_t v548 = vsubq_f32(v546, v489); - float32x4_t v574 = vaddq_f32(v573, v490); - float32x4_t v575 = vsubq_f32(v573, v490); - float32x4_t v601 = vaddq_f32(v600, v491); - float32x4_t v602 = vsubq_f32(v600, v491); - float32x4_t v628 = vaddq_f32(v627, v488); - float32x4_t v629 = vsubq_f32(v627, v488); - float32x4_t v655 = vaddq_f32(v654, v486); - float32x4_t v656 = vsubq_f32(v654, v486); - vst1_s16((int16_t *)v1466, v524); - vst1_s16((int16_t *)v1493, v551); - vst1_s16((int16_t *)v1520, v578); - vst1_s16((int16_t *)v1547, v605); - vst1_s16((int16_t *)v1574, v632); - vst1_s16((int16_t *)v1601, v659); - int16x4_t v532 = vqmovn_s32(vcvtq_n_s32_f32(v521, 15)); - int16x4_t v540 = vqmovn_s32(vcvtq_n_s32_f32(v520, 15)); - int16x4_t v559 = vqmovn_s32(vcvtq_n_s32_f32(v548, 15)); - int16x4_t v567 = vqmovn_s32(vcvtq_n_s32_f32(v547, 15)); - int16x4_t v586 = vqmovn_s32(vcvtq_n_s32_f32(v575, 15)); - int16x4_t v594 = vqmovn_s32(vcvtq_n_s32_f32(v574, 15)); - int16x4_t v613 = 
vqmovn_s32(vcvtq_n_s32_f32(v602, 15)); - int16x4_t v621 = vqmovn_s32(vcvtq_n_s32_f32(v601, 15)); - int16x4_t v640 = vqmovn_s32(vcvtq_n_s32_f32(v629, 15)); - int16x4_t v648 = vqmovn_s32(vcvtq_n_s32_f32(v628, 15)); - int16x4_t v667 = vqmovn_s32(vcvtq_n_s32_f32(v656, 15)); - int16x4_t v675 = vqmovn_s32(vcvtq_n_s32_f32(v655, 15)); - vst1_s16((int16_t *)v1475, v532); - vst1_s16((int16_t *)v1484, v540); - vst1_s16((int16_t *)v1502, v559); - vst1_s16((int16_t *)v1511, v567); - vst1_s16((int16_t *)v1529, v586); - vst1_s16((int16_t *)v1538, v594); - vst1_s16((int16_t *)v1556, v613); - vst1_s16((int16_t *)v1565, v621); - vst1_s16((int16_t *)v1583, v640); - vst1_s16((int16_t *)v1592, v648); - vst1_s16((int16_t *)v1610, v667); - vst1_s16((int16_t *)v1619, v675); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v681 * 2; j < howmany; j += 1) { - int16x4_t v798 = vld1s_s16(&v5[istride]); - float v858 = -1.1666666666666665e+00F; - float v862 = 7.9015646852540022e-01F; - float v866 = 5.5854267289647742e-02F; - float v870 = 7.3430220123575241e-01F; - float v873 = 4.4095855184409838e-01F; - float v874 = -4.4095855184409838e-01F; - float v880 = 3.4087293062393137e-01F; - float v881 = -3.4087293062393137e-01F; - float v887 = -5.3396936033772524e-01F; - float v888 = 5.3396936033772524e-01F; - float v894 = 8.7484229096165667e-01F; - float v895 = -8.7484229096165667e-01F; - float v938 = -1.4999999999999998e+00F; - float v942 = 1.7499999999999996e+00F; - float v946 = -1.1852347027881001e+00F; - float v950 = -8.3781400934471603e-02F; - float v954 = -1.1014533018536286e+00F; - float v957 = -6.6143782776614746e-01F; - float v958 = 6.6143782776614746e-01F; - float v964 = -5.1130939593589697e-01F; - float v965 = 5.1130939593589697e-01F; - float v971 = 8.0095404050658769e-01F; - float v972 = -8.0095404050658769e-01F; - float v978 = -1.3122634364424848e+00F; - float v979 = 1.3122634364424848e+00F; - float v1021 = 8.6602540378443871e-01F; - float v1022 = -8.6602540378443871e-01F; - float v1028 = 
-1.0103629710818451e+00F; - float v1029 = 1.0103629710818451e+00F; - float v1035 = 6.8429557470759583e-01F; - float v1036 = -6.8429557470759583e-01F; - float v1042 = 4.8371214382601155e-02F; - float v1043 = -4.8371214382601155e-02F; - float v1049 = 6.3592436032499466e-01F; - float v1050 = -6.3592436032499466e-01F; - float32x2_t v1052 = (float32x2_t){v4, v4}; - float v1057 = -3.8188130791298663e-01F; - float v1061 = -2.9520461738277515e-01F; - float v1065 = 4.6243103089499693e-01F; - float v1069 = -7.5763564827777208e-01F; - int16x4_t v707 = vld1s_s16(&v5[0]); - float32x2_t v799 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v798)), 15); - float32x2_t v859 = (float32x2_t){v858, v858}; - float32x2_t v863 = (float32x2_t){v862, v862}; - float32x2_t v867 = (float32x2_t){v866, v866}; - float32x2_t v871 = (float32x2_t){v870, v870}; - float32x2_t v875 = (float32x2_t){v873, v874}; - float32x2_t v882 = (float32x2_t){v880, v881}; - float32x2_t v889 = (float32x2_t){v887, v888}; - float32x2_t v896 = (float32x2_t){v894, v895}; - float32x2_t v939 = (float32x2_t){v938, v938}; - float32x2_t v943 = (float32x2_t){v942, v942}; - float32x2_t v947 = (float32x2_t){v946, v946}; - float32x2_t v951 = (float32x2_t){v950, v950}; - float32x2_t v955 = (float32x2_t){v954, v954}; - float32x2_t v959 = (float32x2_t){v957, v958}; - float32x2_t v966 = (float32x2_t){v964, v965}; - float32x2_t v973 = (float32x2_t){v971, v972}; - float32x2_t v980 = (float32x2_t){v978, v979}; - float32x2_t v1023 = (float32x2_t){v1021, v1022}; - float32x2_t v1030 = (float32x2_t){v1028, v1029}; - float32x2_t v1037 = (float32x2_t){v1035, v1036}; - float32x2_t v1044 = (float32x2_t){v1042, v1043}; - float32x2_t v1051 = (float32x2_t){v1049, v1050}; - float32x2_t v1058 = (float32x2_t){v1057, v1057}; - float32x2_t v1062 = (float32x2_t){v1061, v1061}; - float32x2_t v1066 = (float32x2_t){v1065, v1065}; - float32x2_t v1070 = (float32x2_t){v1069, v1069}; - int16x4_t v693 = vld1s_s16(&v5[istride * 7]); - int16x4_t v699 = 
vld1s_s16(&v5[istride * 14]); - float32x2_t v708 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v707)), 15); - int16x4_t v714 = vld1s_s16(&v5[istride * 10]); - int16x4_t v720 = vld1s_s16(&v5[istride * 17]); - int16x4_t v728 = vld1s_s16(&v5[istride * 3]); - int16x4_t v735 = vld1s_s16(&v5[istride * 13]); - int16x4_t v741 = vld1s_s16(&v5[istride * 20]); - int16x4_t v749 = vld1s_s16(&v5[istride * 6]); - int16x4_t v756 = vld1s_s16(&v5[istride * 16]); - int16x4_t v762 = vld1s_s16(&v5[istride * 2]); - int16x4_t v770 = vld1s_s16(&v5[istride * 9]); - int16x4_t v777 = vld1s_s16(&v5[istride * 19]); - int16x4_t v783 = vld1s_s16(&v5[istride * 5]); - int16x4_t v791 = vld1s_s16(&v5[istride * 12]); - int16x4_t v804 = vld1s_s16(&v5[istride * 8]); - int16x4_t v812 = vld1s_s16(&v5[istride * 15]); - int16x4_t v819 = vld1s_s16(&v5[istride * 4]); - int16x4_t v825 = vld1s_s16(&v5[istride * 11]); - int16x4_t v833 = vld1s_s16(&v5[istride * 18]); - float32x2_t v877 = vmul_f32(v1052, v875); - float32x2_t v884 = vmul_f32(v1052, v882); - float32x2_t v891 = vmul_f32(v1052, v889); - float32x2_t v898 = vmul_f32(v1052, v896); - float32x2_t v961 = vmul_f32(v1052, v959); - float32x2_t v968 = vmul_f32(v1052, v966); - float32x2_t v975 = vmul_f32(v1052, v973); - float32x2_t v982 = vmul_f32(v1052, v980); - float32x2_t v1025 = vmul_f32(v1052, v1023); - float32x2_t v1032 = vmul_f32(v1052, v1030); - float32x2_t v1039 = vmul_f32(v1052, v1037); - float32x2_t v1046 = vmul_f32(v1052, v1044); - float32x2_t v1053 = vmul_f32(v1052, v1051); - float32x2_t v694 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v693)), 15); - float32x2_t v700 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v699)), 15); - float32x2_t v715 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v714)), 15); - float32x2_t v721 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v720)), 15); - float32x2_t v729 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v728)), 15); - float32x2_t v736 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v735)), 15); - float32x2_t v742 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v741)), 15); - float32x2_t v750 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v749)), 15); - float32x2_t v757 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v756)), 15); - float32x2_t v763 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v762)), 15); - float32x2_t v771 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v770)), 15); - float32x2_t v778 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v777)), 15); - float32x2_t v784 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v783)), 15); - float32x2_t v792 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v791)), 15); - float32x2_t v805 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v804)), 15); - float32x2_t v813 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v812)), 15); - float32x2_t v820 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v819)), 15); - float32x2_t v826 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v825)), 15); - float32x2_t v834 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v833)), 15); - float32x2_t v701 = vadd_f32(v694, v700); - float32x2_t v702 = vsub_f32(v694, v700); - float32x2_t v722 = vadd_f32(v715, v721); - float32x2_t v723 = vsub_f32(v715, v721); - float32x2_t v743 = vadd_f32(v736, v742); - float32x2_t v744 = vsub_f32(v736, v742); - float32x2_t v764 = vadd_f32(v757, v763); - float32x2_t v765 = vsub_f32(v757, v763); - float32x2_t v785 = vadd_f32(v778, v784); - float32x2_t v786 = vsub_f32(v778, v784); - float32x2_t v806 = vadd_f32(v799, v805); - float32x2_t v807 = vsub_f32(v799, v805); - float32x2_t v827 = vadd_f32(v820, v826); - float32x2_t v828 = vsub_f32(v820, v826); - float32x2_t v709 = vadd_f32(v701, v708); - float32x2_t v730 = vadd_f32(v722, v729); - float32x2_t v751 = vadd_f32(v743, v750); - float32x2_t v772 = vadd_f32(v764, v771); - float32x2_t v793 = vadd_f32(v785, v792); - float32x2_t v814 = vadd_f32(v806, v813); - float32x2_t v835 = vadd_f32(v827, v834); - float32x2_t v920 = vadd_f32(v722, v827); - float32x2_t v921 = vsub_f32(v722, v827); - float32x2_t v922 = vadd_f32(v785, v764); - float32x2_t v923 = vsub_f32(v785, v764); - 
float32x2_t v924 = vadd_f32(v743, v806); - float32x2_t v925 = vsub_f32(v743, v806); - float32x2_t v1004 = vadd_f32(v723, v828); - float32x2_t v1005 = vsub_f32(v723, v828); - float32x2_t v1006 = vadd_f32(v786, v765); - float32x2_t v1007 = vsub_f32(v786, v765); - float32x2_t v1008 = vadd_f32(v744, v807); - float32x2_t v1009 = vsub_f32(v744, v807); - float32x2_t v836 = vadd_f32(v730, v835); - float32x2_t v837 = vsub_f32(v730, v835); - float32x2_t v838 = vadd_f32(v793, v772); - float32x2_t v839 = vsub_f32(v793, v772); - float32x2_t v840 = vadd_f32(v751, v814); - float32x2_t v841 = vsub_f32(v751, v814); - float32x2_t v926 = vadd_f32(v920, v922); - float32x2_t v929 = vsub_f32(v920, v922); - float32x2_t v930 = vsub_f32(v922, v924); - float32x2_t v931 = vsub_f32(v924, v920); - float32x2_t v932 = vadd_f32(v921, v923); - float32x2_t v934 = vsub_f32(v921, v923); - float32x2_t v935 = vsub_f32(v923, v925); - float32x2_t v936 = vsub_f32(v925, v921); - float32x2_t v1010 = vadd_f32(v1004, v1006); - float32x2_t v1013 = vsub_f32(v1004, v1006); - float32x2_t v1014 = vsub_f32(v1006, v1008); - float32x2_t v1015 = vsub_f32(v1008, v1004); - float32x2_t v1016 = vadd_f32(v1005, v1007); - float32x2_t v1018 = vsub_f32(v1005, v1007); - float32x2_t v1019 = vsub_f32(v1007, v1009); - float32x2_t v1020 = vsub_f32(v1009, v1005); - float32x2_t v842 = vadd_f32(v836, v838); - float32x2_t v845 = vsub_f32(v836, v838); - float32x2_t v846 = vsub_f32(v838, v840); - float32x2_t v847 = vsub_f32(v840, v836); - float32x2_t v848 = vadd_f32(v837, v839); - float32x2_t v850 = vsub_f32(v837, v839); - float32x2_t v851 = vsub_f32(v839, v841); - float32x2_t v852 = vsub_f32(v841, v837); - float32x2_t v927 = vadd_f32(v926, v924); - float32x2_t v933 = vadd_f32(v932, v925); - float32x2_t v948 = vmul_f32(v929, v947); - float32x2_t v952 = vmul_f32(v930, v951); - float32x2_t v956 = vmul_f32(v931, v955); - float32x2_t v969 = vrev64_f32(v934); - float32x2_t v976 = vrev64_f32(v935); - float32x2_t v983 = vrev64_f32(v936); - 
float32x2_t v1011 = vadd_f32(v1010, v1008); - float32x2_t v1017 = vadd_f32(v1016, v1009); - float32x2_t v1040 = vrev64_f32(v1013); - float32x2_t v1047 = vrev64_f32(v1014); - float32x2_t v1054 = vrev64_f32(v1015); - float32x2_t v1063 = vmul_f32(v1018, v1062); - float32x2_t v1067 = vmul_f32(v1019, v1066); - float32x2_t v1071 = vmul_f32(v1020, v1070); - float32x2_t v843 = vadd_f32(v842, v840); - float32x2_t v849 = vadd_f32(v848, v841); - float32x2_t v864 = vmul_f32(v845, v863); - float32x2_t v868 = vmul_f32(v846, v867); - float32x2_t v872 = vmul_f32(v847, v871); - float32x2_t v885 = vrev64_f32(v850); - float32x2_t v892 = vrev64_f32(v851); - float32x2_t v899 = vrev64_f32(v852); - float32x2_t v928 = vadd_f32(v927, v701); - float32x2_t v944 = vmul_f32(v927, v943); - float32x2_t v962 = vrev64_f32(v933); - float32x2_t v970 = vmul_f32(v969, v968); - float32x2_t v977 = vmul_f32(v976, v975); - float32x2_t v984 = vmul_f32(v983, v982); - float32x2_t v1012 = vadd_f32(v1011, v702); - float32x2_t v1033 = vrev64_f32(v1011); - float32x2_t v1041 = vmul_f32(v1040, v1039); - float32x2_t v1048 = vmul_f32(v1047, v1046); - float32x2_t v1055 = vmul_f32(v1054, v1053); - float32x2_t v1059 = vmul_f32(v1017, v1058); - float32x2_t v844 = vadd_f32(v843, v709); - float32x2_t v860 = vmul_f32(v843, v859); - float32x2_t v878 = vrev64_f32(v849); - float32x2_t v886 = vmul_f32(v885, v884); - float32x2_t v893 = vmul_f32(v892, v891); - float32x2_t v900 = vmul_f32(v899, v898); - float32x2_t v940 = vmul_f32(v928, v939); - float32x2_t v963 = vmul_f32(v962, v961); - float32x2_t v1026 = vrev64_f32(v1012); - float32x2_t v1034 = vmul_f32(v1033, v1032); - float32x2_t v1079 = vadd_f32(v1059, v1063); - float32x2_t v1081 = vsub_f32(v1059, v1063); - float32x2_t v1083 = vsub_f32(v1059, v1067); - float32x2_t v879 = vmul_f32(v878, v877); - float32x2_t v901 = vadd_f32(v844, v860); - float32x2_t v985 = vadd_f32(v940, v944); - float32x2_t v992 = vadd_f32(v963, v970); - float32x2_t v994 = vsub_f32(v963, v970); - 
float32x2_t v996 = vsub_f32(v963, v977); - float32x2_t v1027 = vmul_f32(v1026, v1025); - float32x2_t v1080 = vadd_f32(v1079, v1067); - float32x2_t v1082 = vsub_f32(v1081, v1071); - float32x2_t v1084 = vadd_f32(v1083, v1071); - float32x2_t v1091 = vadd_f32(v844, v940); - int16x4_t v1096 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v844, 15), (int32x2_t){0, 0})); - float32x2_t v902 = vadd_f32(v901, v864); - float32x2_t v904 = vsub_f32(v901, v864); - float32x2_t v906 = vsub_f32(v901, v868); - float32x2_t v908 = vadd_f32(v879, v886); - float32x2_t v910 = vsub_f32(v879, v886); - float32x2_t v912 = vsub_f32(v879, v893); - float32x2_t v986 = vadd_f32(v985, v948); - float32x2_t v988 = vsub_f32(v985, v948); - float32x2_t v990 = vsub_f32(v985, v952); - float32x2_t v993 = vadd_f32(v992, v977); - float32x2_t v995 = vsub_f32(v994, v984); - float32x2_t v997 = vadd_f32(v996, v984); - float32x2_t v1072 = vadd_f32(v1027, v1034); - float32x2_t v1092 = vadd_f32(v1091, v1027); - float32x2_t v1093 = vsub_f32(v1091, v1027); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1096), 0); - float32x2_t v903 = vadd_f32(v902, v868); - float32x2_t v905 = vsub_f32(v904, v872); - float32x2_t v907 = vadd_f32(v906, v872); - float32x2_t v909 = vadd_f32(v908, v893); - float32x2_t v911 = vsub_f32(v910, v900); - float32x2_t v913 = vadd_f32(v912, v900); - float32x2_t v987 = vadd_f32(v986, v952); - float32x2_t v989 = vsub_f32(v988, v956); - float32x2_t v991 = vadd_f32(v990, v956); - float32x2_t v1073 = vadd_f32(v1072, v1041); - float32x2_t v1075 = vsub_f32(v1072, v1041); - float32x2_t v1077 = vsub_f32(v1072, v1048); - int16x4_t v1102 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1093, 15), (int32x2_t){0, 0})); - int16x4_t v1108 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1092, 15), (int32x2_t){0, 0})); - float32x2_t v914 = vadd_f32(v903, v909); - float32x2_t v915 = vsub_f32(v903, v909); - float32x2_t v916 = vadd_f32(v905, v911); - float32x2_t v917 = vsub_f32(v905, v911); - float32x2_t v918 = vadd_f32(v907, 
v913); - float32x2_t v919 = vsub_f32(v907, v913); - float32x2_t v998 = vadd_f32(v987, v993); - float32x2_t v999 = vsub_f32(v987, v993); - float32x2_t v1000 = vadd_f32(v989, v995); - float32x2_t v1001 = vsub_f32(v989, v995); - float32x2_t v1002 = vadd_f32(v991, v997); - float32x2_t v1003 = vsub_f32(v991, v997); - float32x2_t v1074 = vadd_f32(v1073, v1048); - float32x2_t v1076 = vsub_f32(v1075, v1055); - float32x2_t v1078 = vadd_f32(v1077, v1055); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1102), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1108), 0); - float32x2_t v1085 = vadd_f32(v1074, v1080); - float32x2_t v1086 = vsub_f32(v1074, v1080); - float32x2_t v1087 = vadd_f32(v1076, v1082); - float32x2_t v1088 = vsub_f32(v1076, v1082); - float32x2_t v1089 = vadd_f32(v1078, v1084); - float32x2_t v1090 = vsub_f32(v1078, v1084); - float32x2_t v1112 = vadd_f32(v915, v999); - int16x4_t v1117 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v915, 15), (int32x2_t){0, 0})); - float32x2_t v1133 = vadd_f32(v917, v1001); - int16x4_t v1138 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v917, 15), (int32x2_t){0, 0})); - float32x2_t v1154 = vadd_f32(v918, v1002); - int16x4_t v1159 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v918, 15), (int32x2_t){0, 0})); - float32x2_t v1175 = vadd_f32(v919, v1003); - int16x4_t v1180 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v919, 15), (int32x2_t){0, 0})); - float32x2_t v1196 = vadd_f32(v916, v1000); - int16x4_t v1201 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v916, 15), (int32x2_t){0, 0})); - float32x2_t v1217 = vadd_f32(v914, v998); - int16x4_t v1222 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v914, 15), (int32x2_t){0, 0})); - float32x2_t v1113 = vadd_f32(v1112, v1086); - float32x2_t v1114 = vsub_f32(v1112, v1086); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1117), 0); - float32x2_t v1134 = vadd_f32(v1133, v1088); - float32x2_t v1135 = vsub_f32(v1133, v1088); - v6[ostride * 9] = 
vget_lane_s32(vreinterpret_s32_s16(v1138), 0); - float32x2_t v1155 = vadd_f32(v1154, v1089); - float32x2_t v1156 = vsub_f32(v1154, v1089); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1159), 0); - float32x2_t v1176 = vadd_f32(v1175, v1090); - float32x2_t v1177 = vsub_f32(v1175, v1090); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1180), 0); - float32x2_t v1197 = vadd_f32(v1196, v1087); - float32x2_t v1198 = vsub_f32(v1196, v1087); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1201), 0); - float32x2_t v1218 = vadd_f32(v1217, v1085); - float32x2_t v1219 = vsub_f32(v1217, v1085); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1222), 0); - int16x4_t v1123 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1114, 15), (int32x2_t){0, 0})); - int16x4_t v1129 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1113, 15), (int32x2_t){0, 0})); - int16x4_t v1144 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1135, 15), (int32x2_t){0, 0})); - int16x4_t v1150 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1134, 15), (int32x2_t){0, 0})); - int16x4_t v1165 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1156, 15), (int32x2_t){0, 0})); - int16x4_t v1171 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1155, 15), (int32x2_t){0, 0})); - int16x4_t v1186 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1177, 15), (int32x2_t){0, 0})); - int16x4_t v1192 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1176, 15), (int32x2_t){0, 0})); - int16x4_t v1207 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1198, 15), (int32x2_t){0, 0})); - int16x4_t v1213 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1197, 15), (int32x2_t){0, 0})); - int16x4_t v1228 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1219, 15), (int32x2_t){0, 0})); - int16x4_t v1234 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1218, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1123), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1129), 0); - v6[ostride * 16] = 
vget_lane_s32(vreinterpret_s32_s16(v1144), 0); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1150), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1165), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1171), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1186), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1192), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1207), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1213), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1228), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1234), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu21(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v230 = -1.1666666666666665e+00F; - float v235 = 7.9015646852540022e-01F; - float v240 = 5.5854267289647742e-02F; - float v245 = 7.3430220123575241e-01F; - float v250 = -4.4095855184409838e-01F; - float v257 = -3.4087293062393137e-01F; - float v264 = 5.3396936033772524e-01F; - float v271 = -8.7484229096165667e-01F; - float v314 = -1.4999999999999998e+00F; - float v319 = 1.7499999999999996e+00F; - float v324 = -1.1852347027881001e+00F; - float v329 = -8.3781400934471603e-02F; - float v334 = -1.1014533018536286e+00F; - float v339 = 6.6143782776614746e-01F; - float v346 = 5.1130939593589697e-01F; - float v353 = -8.0095404050658769e-01F; - float v360 = 1.3122634364424848e+00F; - float v403 = -8.6602540378443871e-01F; - float v410 = 
1.0103629710818451e+00F; - float v417 = -6.8429557470759583e-01F; - float v424 = -4.8371214382601155e-02F; - float v431 = -6.3592436032499466e-01F; - float v438 = -3.8188130791298663e-01F; - float v443 = -2.9520461738277515e-01F; - float v448 = 4.6243103089499693e-01F; - float v453 = -7.5763564827777208e-01F; - const int32_t *v807 = &v5[v0]; - int32_t *v925 = &v6[v2]; - int64_t v19 = v0 * 7; - int64_t v27 = v0 * 14; - int64_t v46 = v0 * 10; - int64_t v54 = v0 * 17; - int64_t v64 = v0 * 3; - int64_t v73 = v0 * 13; - int64_t v81 = v0 * 20; - int64_t v91 = v0 * 6; - int64_t v100 = v0 * 16; - int64_t v108 = v0 * 2; - int64_t v118 = v0 * 9; - int64_t v127 = v0 * 19; - int64_t v135 = v0 * 5; - int64_t v145 = v0 * 12; - int64_t v162 = v0 * 8; - int64_t v172 = v0 * 15; - int64_t v181 = v0 * 4; - int64_t v189 = v0 * 11; - int64_t v199 = v0 * 18; - float v253 = v4 * v250; - float v260 = v4 * v257; - float v267 = v4 * v264; - float v274 = v4 * v271; - float v342 = v4 * v339; - float v349 = v4 * v346; - float v356 = v4 * v353; - float v363 = v4 * v360; - float v406 = v4 * v403; - float v413 = v4 * v410; - float v420 = v4 * v417; - float v427 = v4 * v424; - float v434 = v4 * v431; - int64_t v488 = v2 * 7; - int64_t v496 = v2 * 14; - int64_t v507 = v2 * 15; - int64_t v523 = v2 * 8; - int64_t v534 = v2 * 9; - int64_t v542 = v2 * 16; - int64_t v550 = v2 * 2; - int64_t v561 = v2 * 3; - int64_t v569 = v2 * 10; - int64_t v577 = v2 * 17; - int64_t v588 = v2 * 18; - int64_t v596 = v2 * 4; - int64_t v604 = v2 * 11; - int64_t v615 = v2 * 12; - int64_t v623 = v2 * 19; - int64_t v631 = v2 * 5; - int64_t v642 = v2 * 6; - int64_t v650 = v2 * 13; - int64_t v658 = v2 * 20; - const int32_t *v690 = &v5[0]; - svfloat32_t v856 = svdup_n_f32(v230); - svfloat32_t v857 = svdup_n_f32(v235); - svfloat32_t v858 = svdup_n_f32(v240); - svfloat32_t v859 = svdup_n_f32(v245); - svfloat32_t v864 = svdup_n_f32(v314); - svfloat32_t v865 = svdup_n_f32(v319); - svfloat32_t v866 = svdup_n_f32(v324); - svfloat32_t 
v867 = svdup_n_f32(v329); - svfloat32_t v868 = svdup_n_f32(v334); - svfloat32_t v878 = svdup_n_f32(v438); - svfloat32_t v879 = svdup_n_f32(v443); - svfloat32_t v880 = svdup_n_f32(v448); - svfloat32_t v881 = svdup_n_f32(v453); - int32_t *v889 = &v6[0]; - svfloat32_t v160 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v807[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v671 = &v5[v19]; - const int32_t *v680 = &v5[v27]; - const int32_t *v699 = &v5[v46]; - const int32_t *v708 = &v5[v54]; - const int32_t *v717 = &v5[v64]; - const int32_t *v726 = &v5[v73]; - const int32_t *v735 = &v5[v81]; - const int32_t *v744 = &v5[v91]; - const int32_t *v753 = &v5[v100]; - const int32_t *v762 = &v5[v108]; - const int32_t *v771 = &v5[v118]; - const int32_t *v780 = &v5[v127]; - const int32_t *v789 = &v5[v135]; - const int32_t *v798 = &v5[v145]; - const int32_t *v816 = &v5[v162]; - const int32_t *v825 = &v5[v172]; - const int32_t *v834 = &v5[v181]; - const int32_t *v843 = &v5[v189]; - const int32_t *v852 = &v5[v199]; - svfloat32_t v860 = svdup_n_f32(v253); - svfloat32_t v861 = svdup_n_f32(v260); - svfloat32_t v862 = svdup_n_f32(v267); - svfloat32_t v863 = svdup_n_f32(v274); - svfloat32_t v869 = svdup_n_f32(v342); - svfloat32_t v870 = svdup_n_f32(v349); - svfloat32_t v871 = svdup_n_f32(v356); - svfloat32_t v872 = svdup_n_f32(v363); - svfloat32_t v873 = svdup_n_f32(v406); - svfloat32_t v874 = svdup_n_f32(v413); - svfloat32_t v875 = svdup_n_f32(v420); - svfloat32_t v876 = svdup_n_f32(v427); - svfloat32_t v877 = svdup_n_f32(v434); - int32_t *v898 = &v6[v488]; - int32_t *v907 = &v6[v496]; - int32_t *v916 = &v6[v507]; - int32_t *v934 = &v6[v523]; - int32_t *v943 = &v6[v534]; - int32_t *v952 = &v6[v542]; - int32_t *v961 = &v6[v550]; - int32_t *v970 = &v6[v561]; - int32_t *v979 = &v6[v569]; - int32_t *v988 = &v6[v577]; - int32_t *v997 = &v6[v588]; - int32_t *v1006 = &v6[v596]; - int32_t *v1015 = &v6[v604]; - int32_t *v1024 = &v6[v615]; - 
int32_t *v1033 = &v6[v623]; - int32_t *v1042 = &v6[v631]; - int32_t *v1051 = &v6[v642]; - int32_t *v1060 = &v6[v650]; - int32_t *v1069 = &v6[v658]; - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v690[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v671[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v680[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v52 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v699[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v60 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v708[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v70 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v717[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v726[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v735[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v744[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v106 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v753[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v114 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v762[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v124 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t 
*)&v771[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v780[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v789[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v798[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v168 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v816[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v178 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v825[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v187 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v834[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v195 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v843[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v205 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v852[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v60)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v52), "w"(v60)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v106), "w"(v114)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v114)); - svfloat32_t 
v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v160), "w"(v168)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v160), "w"(v168)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v187), "w"(v195)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v187), "w"(v195)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t v71; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v70)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v88), "w"(v97)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v124)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v142), "w"(v151)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v178)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v196), "w"(v205)); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v61), "w"(v196)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v61), "w"(v196)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v142), "w"(v115)); - svfloat32_t v299; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v142), "w"(v115)); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v88), "w"(v169)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v88), "w"(v169)); - svfloat32_t v385; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v62), "w"(v197)); - svfloat32_t v386; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v62), "w"(v197)); - svfloat32_t v387; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v143), "w"(v116)); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v143), "w"(v116)); - 
svfloat32_t v389; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v89), "w"(v170)); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v89), "w"(v170)); - svfloat32_t v207; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v207) : "w"(v71), "w"(v206)); - svfloat32_t v208; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v208) : "w"(v71), "w"(v206)); - svfloat32_t v209; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v209) : "w"(v152), "w"(v125)); - svfloat32_t v210; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v210) : "w"(v152), "w"(v125)); - svfloat32_t v211; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v211) : "w"(v98), "w"(v179)); - svfloat32_t v212; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v212) : "w"(v98), "w"(v179)); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : "w"(v296), "w"(v298)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v296), "w"(v298)); - svfloat32_t v306; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v298), "w"(v300)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v300), "w"(v296)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v297), "w"(v299)); - svfloat32_t v310; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v297), "w"(v299)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v299), "w"(v301)); - svfloat32_t v312; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v301), "w"(v297)); - svfloat32_t v391; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v385), "w"(v387)); - svfloat32_t v394; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v385), "w"(v387)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v387), "w"(v389)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v389), "w"(v385)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v386), "w"(v388)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v386), "w"(v388)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : 
"w"(v388), "w"(v390)); - svfloat32_t v401; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v390), "w"(v386)); - svfloat32_t v213; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v213) : "w"(v207), "w"(v209)); - svfloat32_t v216; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v207), "w"(v209)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v209), "w"(v211)); - svfloat32_t v218; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v211), "w"(v207)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v208), "w"(v210)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v208), "w"(v210)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v210), "w"(v212)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v212), "w"(v208)); - svfloat32_t v303; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v302), "w"(v300)); - svfloat32_t v309; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v308), "w"(v301)); - svfloat32_t zero351; - asm volatile("mov %0.s, #0" : "=w"(zero351)); - svfloat32_t v351 = svcmla_f32_x(pred_full, zero351, v870, v310, 90); - svfloat32_t zero358; - asm volatile("mov %0.s, #0" : "=w"(zero358)); - svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v871, v311, 90); - svfloat32_t zero365; - asm volatile("mov %0.s, #0" : "=w"(zero365)); - svfloat32_t v365 = svcmla_f32_x(pred_full, zero365, v872, v312, 90); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v391), "w"(v389)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v397), "w"(v390)); - svfloat32_t zero422; - asm volatile("mov %0.s, #0" : "=w"(zero422)); - svfloat32_t v422 = svcmla_f32_x(pred_full, zero422, v875, v394, 90); - svfloat32_t zero429; - asm volatile("mov %0.s, #0" : "=w"(zero429)); - svfloat32_t v429 = svcmla_f32_x(pred_full, zero429, v876, v395, 90); - svfloat32_t zero436; - asm volatile("mov %0.s, #0" : "=w"(zero436)); - svfloat32_t v436 = svcmla_f32_x(pred_full, 
zero436, v877, v396, 90); - svfloat32_t v446; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v399), "w"(v879)); - svfloat32_t v451; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v400), "w"(v880)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v213), "w"(v211)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v219), "w"(v212)); - svfloat32_t zero262; - asm volatile("mov %0.s, #0" : "=w"(zero262)); - svfloat32_t v262 = svcmla_f32_x(pred_full, zero262, v861, v221, 90); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); - svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v862, v222, 90); - svfloat32_t zero276; - asm volatile("mov %0.s, #0" : "=w"(zero276)); - svfloat32_t v276 = svcmla_f32_x(pred_full, zero276, v863, v223, 90); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v303), "w"(v34)); - svfloat32_t v322; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v322) : "w"(v303), "w"(v865)); - svfloat32_t zero344; - asm volatile("mov %0.s, #0" : "=w"(zero344)); - svfloat32_t v344 = svcmla_f32_x(pred_full, zero344, v869, v309, 90); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v392), "w"(v35)); - svfloat32_t v215; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v214), "w"(v44)); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); - svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v860, v220, 90); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v344), "w"(v351)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v344), "w"(v351)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v344), "w"(v358)); - svfloat32_t zero408; - asm volatile("mov %0.s, #0" : "=w"(zero408)); - svfloat32_t v408 = svcmla_f32_x(pred_full, zero408, v873, v393, 90); - svfloat32_t v464 = svmla_f32_x(pred_full, v446, v398, v878); - svfloat32_t v466 = svnmls_f32_x(pred_full, v446, v398, v878); - svfloat32_t v468 = 
svnmls_f32_x(pred_full, v451, v398, v878); - svfloat32_t v277 = svmla_f32_x(pred_full, v215, v214, v856); - svfloat32_t v284; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v255), "w"(v262)); - svfloat32_t v286; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v255), "w"(v262)); - svfloat32_t v288; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v288) : "w"(v255), "w"(v269)); - svfloat32_t v366 = svmla_f32_x(pred_full, v322, v304, v864); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v373), "w"(v358)); - svfloat32_t v376; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v375), "w"(v365)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v377), "w"(v365)); - svfloat32_t v457 = svcmla_f32_x(pred_full, v408, v874, v392, 90); - svfloat32_t v465 = svmla_f32_x(pred_full, v464, v400, v880); - svfloat32_t v467 = svmls_f32_x(pred_full, v466, v401, v881); - svfloat32_t v469 = svmla_f32_x(pred_full, v468, v401, v881); - svfloat32_t v476 = svmla_f32_x(pred_full, v215, v304, v864); - svint16_t v481 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v215, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v278 = svmla_f32_x(pred_full, v277, v216, v857); - svfloat32_t v280 = svmls_f32_x(pred_full, v277, v216, v857); - svfloat32_t v282 = svmls_f32_x(pred_full, v277, v217, v858); - svfloat32_t v285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v285) : "w"(v284), "w"(v269)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v286), "w"(v276)); - svfloat32_t v289; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v289) : "w"(v288), "w"(v276)); - svfloat32_t v367 = svmla_f32_x(pred_full, v366, v305, v866); - svfloat32_t v369 = svmls_f32_x(pred_full, v366, v305, v866); - svfloat32_t v371 = svmls_f32_x(pred_full, v366, v306, v867); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v457), "w"(v422)); - svfloat32_t v460; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v457), "w"(v422)); - svfloat32_t v462; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v462) : "w"(v457), "w"(v429)); - svfloat32_t v477; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v476), "w"(v408)); - svfloat32_t v478; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v478) : "w"(v476), "w"(v408)); - svst1w_u64(pred_full, (unsigned *)(v889), svreinterpret_u64_s16(v481)); - svfloat32_t v279 = svmla_f32_x(pred_full, v278, v217, v858); - svfloat32_t v281 = svmls_f32_x(pred_full, v280, v218, v859); - svfloat32_t v283 = svmla_f32_x(pred_full, v282, v218, v859); - svfloat32_t v368 = svmla_f32_x(pred_full, v367, v306, v867); - svfloat32_t v370 = svmls_f32_x(pred_full, v369, v307, v868); - svfloat32_t v372 = svmla_f32_x(pred_full, v371, v307, v868); - svfloat32_t v459; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v458), "w"(v429)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v460), "w"(v436)); - svfloat32_t v463; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v463) : "w"(v462), "w"(v436)); - svint16_t v489 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v478, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v497 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v477, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v290; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v290) : "w"(v279), "w"(v285)); - svfloat32_t v291; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v279), "w"(v285)); - svfloat32_t v292; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v281), "w"(v287)); - svfloat32_t v293; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v293) : "w"(v281), "w"(v287)); - svfloat32_t v294; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v283), "w"(v289)); - svfloat32_t v295; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v295) : "w"(v283), "w"(v289)); - svfloat32_t v379; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v368), "w"(v374)); - svfloat32_t v380; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v368), "w"(v374)); - svfloat32_t v381; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v370), "w"(v376)); - svfloat32_t v382; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v370), "w"(v376)); - svfloat32_t v383; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v372), "w"(v378)); - svfloat32_t v384; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v372), "w"(v378)); - svfloat32_t v470; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v459), "w"(v465)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v459), "w"(v465)); - svfloat32_t v472; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v461), "w"(v467)); - svfloat32_t v473; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v473) : "w"(v461), "w"(v467)); - svfloat32_t v474; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v474) : "w"(v463), "w"(v469)); - svfloat32_t v475; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v475) : "w"(v463), "w"(v469)); - svst1w_u64(pred_full, (unsigned *)(v898), svreinterpret_u64_s16(v489)); - svst1w_u64(pred_full, (unsigned *)(v907), svreinterpret_u64_s16(v497)); - svfloat32_t v503; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v291), "w"(v380)); - svint16_t v508 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v291, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v530; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v530) : "w"(v293), "w"(v382)); - svint16_t v535 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v293, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v557; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v294), "w"(v383)); - svint16_t v562 = svtbl_s16( - 
svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v294, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v584; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v584) : "w"(v295), "w"(v384)); - svint16_t v589 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v295, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v611; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v292), "w"(v381)); - svint16_t v616 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v292, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v638; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v638) : "w"(v290), "w"(v379)); - svint16_t v643 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v290, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v504; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v504) : "w"(v503), "w"(v471)); - svfloat32_t v505; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v505) : "w"(v503), "w"(v471)); - svfloat32_t v531; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v531) : "w"(v530), "w"(v473)); - svfloat32_t v532; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v532) : "w"(v530), "w"(v473)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v557), "w"(v474)); - svfloat32_t v559; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v559) : "w"(v557), "w"(v474)); - svfloat32_t v585; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v585) : "w"(v584), "w"(v475)); - svfloat32_t v586; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v586) : "w"(v584), "w"(v475)); - svfloat32_t v612; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v611), "w"(v472)); - svfloat32_t v613; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v613) : "w"(v611), "w"(v472)); - svfloat32_t v639; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v639) : "w"(v638), "w"(v470)); - svfloat32_t v640; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v640) : "w"(v638), "w"(v470)); - svst1w_u64(pred_full, (unsigned *)(v916), svreinterpret_u64_s16(v508)); - svst1w_u64(pred_full, (unsigned *)(v943), svreinterpret_u64_s16(v535)); - svst1w_u64(pred_full, (unsigned *)(v970), svreinterpret_u64_s16(v562)); - svst1w_u64(pred_full, (unsigned *)(v997), svreinterpret_u64_s16(v589)); - svst1w_u64(pred_full, (unsigned *)(v1024), svreinterpret_u64_s16(v616)); - svst1w_u64(pred_full, (unsigned *)(v1051), svreinterpret_u64_s16(v643)); - svint16_t v516 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v505, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v524 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v504, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v543 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v532, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v551 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v531, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v570 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v559, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v578 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v558, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v597 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v586, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v605 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v585, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v624 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v613, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v632 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v612, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v651 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v640, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v659 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v639, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v925), svreinterpret_u64_s16(v516)); - svst1w_u64(pred_full, (unsigned *)(v934), svreinterpret_u64_s16(v524)); - svst1w_u64(pred_full, (unsigned *)(v952), svreinterpret_u64_s16(v543)); - svst1w_u64(pred_full, (unsigned *)(v961), svreinterpret_u64_s16(v551)); - svst1w_u64(pred_full, (unsigned *)(v979), svreinterpret_u64_s16(v570)); - svst1w_u64(pred_full, (unsigned *)(v988), svreinterpret_u64_s16(v578)); - svst1w_u64(pred_full, (unsigned *)(v1006), svreinterpret_u64_s16(v597)); - svst1w_u64(pred_full, (unsigned 
*)(v1015), svreinterpret_u64_s16(v605)); - svst1w_u64(pred_full, (unsigned *)(v1033), svreinterpret_u64_s16(v624)); - svst1w_u64(pred_full, (unsigned *)(v1042), svreinterpret_u64_s16(v632)); - svst1w_u64(pred_full, (unsigned *)(v1060), svreinterpret_u64_s16(v651)); - svst1w_u64(pred_full, (unsigned *)(v1069), svreinterpret_u64_s16(v659)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v833 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v481 = 1.1000000000000001e+00F; - float v485 = 3.3166247903554003e-01F; - float v486 = -3.3166247903554003e-01F; - float v494 = 5.1541501300188641e-01F; - float v499 = 9.4125353283118118e-01F; - float v504 = 1.4143537075597825e+00F; - float v509 = 8.5949297361449750e-01F; - float v514 = 4.2314838273285138e-02F; - float v519 = 3.8639279888589606e-01F; - float v524 = 5.1254589567200015e-01F; - float v529 = 1.0702757469471715e+00F; - float v534 = 5.5486073394528512e-01F; - float v538 = 1.2412944743900585e+00F; - float v539 = -1.2412944743900585e+00F; - float v546 = 2.0897833842005756e-01F; - float v547 = -2.0897833842005756e-01F; - float v554 = 3.7415717312460811e-01F; - float v555 = -3.7415717312460811e-01F; - float v562 = 4.9929922194110327e-02F; - float v563 = -4.9929922194110327e-02F; - float v570 = 6.5815896284539266e-01F; - float v571 = -6.5815896284539266e-01F; - float v578 = 6.3306543373877577e-01F; - float v579 = -6.3306543373877577e-01F; - float v586 = 1.0822460581641109e+00F; - float v587 = -1.0822460581641109e+00F; - float v594 = 8.1720737907134022e-01F; - float v595 = -8.1720737907134022e-01F; - float v602 = 4.2408709531871824e-01F; - float v603 = -4.2408709531871824e-01F; - float32x2_t v605 = 
(float32x2_t){v4, v4}; - const int32_t *v1651 = &v5[istride]; - int32_t *v1760 = &v6[ostride]; - float32x2_t v482 = (float32x2_t){v481, v481}; - float32x2_t v487 = (float32x2_t){v485, v486}; - float32x2_t v495 = (float32x2_t){v494, v494}; - float32x2_t v500 = (float32x2_t){v499, v499}; - float32x2_t v505 = (float32x2_t){v504, v504}; - float32x2_t v510 = (float32x2_t){v509, v509}; - float32x2_t v515 = (float32x2_t){v514, v514}; - float32x2_t v520 = (float32x2_t){v519, v519}; - float32x2_t v525 = (float32x2_t){v524, v524}; - float32x2_t v530 = (float32x2_t){v529, v529}; - float32x2_t v535 = (float32x2_t){v534, v534}; - float32x2_t v540 = (float32x2_t){v538, v539}; - float32x2_t v548 = (float32x2_t){v546, v547}; - float32x2_t v556 = (float32x2_t){v554, v555}; - float32x2_t v564 = (float32x2_t){v562, v563}; - float32x2_t v572 = (float32x2_t){v570, v571}; - float32x2_t v580 = (float32x2_t){v578, v579}; - float32x2_t v588 = (float32x2_t){v586, v587}; - float32x2_t v596 = (float32x2_t){v594, v595}; - float32x2_t v604 = (float32x2_t){v602, v603}; - const int32_t *v1534 = &v5[0]; - int32_t *v1733 = &v6[0]; - int16x4_t v1952 = vld1_s16((const int16_t *)v1651); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1952), 15); - float32x4_t v483 = vcombine_f32(v482, v482); - float32x2_t v489 = vmul_f32(v605, v487); - float32x4_t v496 = vcombine_f32(v495, v495); - float32x4_t v501 = vcombine_f32(v500, v500); - float32x4_t v506 = vcombine_f32(v505, v505); - float32x4_t v511 = vcombine_f32(v510, v510); - float32x4_t v516 = vcombine_f32(v515, v515); - float32x4_t v521 = vcombine_f32(v520, v520); - float32x4_t v526 = vcombine_f32(v525, v525); - float32x4_t v531 = vcombine_f32(v530, v530); - float32x4_t v536 = vcombine_f32(v535, v535); - float32x2_t v542 = vmul_f32(v605, v540); - float32x2_t v550 = vmul_f32(v605, v548); - float32x2_t v558 = vmul_f32(v605, v556); - float32x2_t v566 = vmul_f32(v605, v564); - float32x2_t v574 = vmul_f32(v605, v572); - float32x2_t v582 = vmul_f32(v605, v580); 
- float32x2_t v590 = vmul_f32(v605, v588); - float32x2_t v598 = vmul_f32(v605, v596); - float32x2_t v606 = vmul_f32(v605, v604); - const int32_t *v1543 = &v5[istride * 11]; - const int32_t *v1552 = &v5[istride * 2]; - const int32_t *v1561 = &v5[istride * 13]; - const int32_t *v1570 = &v5[istride * 4]; - const int32_t *v1579 = &v5[istride * 15]; - const int32_t *v1588 = &v5[istride * 6]; - const int32_t *v1597 = &v5[istride * 17]; - const int32_t *v1606 = &v5[istride * 8]; - const int32_t *v1615 = &v5[istride * 19]; - const int32_t *v1624 = &v5[istride * 10]; - const int32_t *v1633 = &v5[istride * 21]; - const int32_t *v1642 = &v5[istride * 12]; - const int32_t *v1660 = &v5[istride * 14]; - const int32_t *v1669 = &v5[istride * 3]; - const int32_t *v1678 = &v5[istride * 16]; - const int32_t *v1687 = &v5[istride * 5]; - const int32_t *v1696 = &v5[istride * 18]; - const int32_t *v1705 = &v5[istride * 7]; - const int32_t *v1714 = &v5[istride * 20]; - const int32_t *v1723 = &v5[istride * 9]; - int32_t *v1742 = &v6[ostride * 11]; - int32_t *v1751 = &v6[ostride * 12]; - int32_t *v1769 = &v6[ostride * 2]; - int32_t *v1778 = &v6[ostride * 13]; - int32_t *v1787 = &v6[ostride * 14]; - int32_t *v1796 = &v6[ostride * 3]; - int32_t *v1805 = &v6[ostride * 4]; - int32_t *v1814 = &v6[ostride * 15]; - int32_t *v1823 = &v6[ostride * 16]; - int32_t *v1832 = &v6[ostride * 5]; - int32_t *v1841 = &v6[ostride * 6]; - int32_t *v1850 = &v6[ostride * 17]; - int32_t *v1859 = &v6[ostride * 18]; - int32_t *v1868 = &v6[ostride * 7]; - int32_t *v1877 = &v6[ostride * 8]; - int32_t *v1886 = &v6[ostride * 19]; - int32_t *v1895 = &v6[ostride * 20]; - int32_t *v1904 = &v6[ostride * 9]; - int32_t *v1913 = &v6[ostride * 10]; - int32_t *v1922 = &v6[ostride * 21]; - int16x4_t v1926 = vld1_s16((const int16_t *)v1534); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1926), 15); - float32x4_t v491 = vcombine_f32(v489, v489); - float32x4_t v544 = vcombine_f32(v542, v542); - float32x4_t v552 = vcombine_f32(v550, 
v550); - float32x4_t v560 = vcombine_f32(v558, v558); - float32x4_t v568 = vcombine_f32(v566, v566); - float32x4_t v576 = vcombine_f32(v574, v574); - float32x4_t v584 = vcombine_f32(v582, v582); - float32x4_t v592 = vcombine_f32(v590, v590); - float32x4_t v600 = vcombine_f32(v598, v598); - float32x4_t v608 = vcombine_f32(v606, v606); - int16x4_t v1928 = vld1_s16((const int16_t *)v1543); - int16x4_t v1930 = vld1_s16((const int16_t *)v1552); - int16x4_t v1932 = vld1_s16((const int16_t *)v1561); - int16x4_t v1934 = vld1_s16((const int16_t *)v1570); - int16x4_t v1936 = vld1_s16((const int16_t *)v1579); - int16x4_t v1938 = vld1_s16((const int16_t *)v1588); - int16x4_t v1940 = vld1_s16((const int16_t *)v1597); - int16x4_t v1942 = vld1_s16((const int16_t *)v1606); - int16x4_t v1944 = vld1_s16((const int16_t *)v1615); - int16x4_t v1946 = vld1_s16((const int16_t *)v1624); - int16x4_t v1948 = vld1_s16((const int16_t *)v1633); - int16x4_t v1950 = vld1_s16((const int16_t *)v1642); - int16x4_t v1954 = vld1_s16((const int16_t *)v1660); - int16x4_t v1956 = vld1_s16((const int16_t *)v1669); - int16x4_t v1958 = vld1_s16((const int16_t *)v1678); - int16x4_t v1960 = vld1_s16((const int16_t *)v1687); - int16x4_t v1962 = vld1_s16((const int16_t *)v1696); - int16x4_t v1964 = vld1_s16((const int16_t *)v1705); - int16x4_t v1966 = vld1_s16((const int16_t *)v1714); - int16x4_t v1968 = vld1_s16((const int16_t *)v1723); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1928), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1930), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v1932), 15); - float32x4_t v64 = vcvtq_n_f32_s32(vmovl_s16(v1934), 15); - float32x4_t v72 = vcvtq_n_f32_s32(vmovl_s16(v1936), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1938), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1940), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1942), 15); - float32x4_t v108 = vcvtq_n_f32_s32(vmovl_s16(v1944), 15); - float32x4_t v118 = 
vcvtq_n_f32_s32(vmovl_s16(v1946), 15); - float32x4_t v126 = vcvtq_n_f32_s32(vmovl_s16(v1948), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1950), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1954), 15); - float32x4_t v162 = vcvtq_n_f32_s32(vmovl_s16(v1956), 15); - float32x4_t v172 = vcvtq_n_f32_s32(vmovl_s16(v1958), 15); - float32x4_t v180 = vcvtq_n_f32_s32(vmovl_s16(v1960), 15); - float32x4_t v190 = vcvtq_n_f32_s32(vmovl_s16(v1962), 15); - float32x4_t v198 = vcvtq_n_f32_s32(vmovl_s16(v1964), 15); - float32x4_t v208 = vcvtq_n_f32_s32(vmovl_s16(v1966), 15); - float32x4_t v216 = vcvtq_n_f32_s32(vmovl_s16(v1968), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v73 = vaddq_f32(v64, v72); - float32x4_t v74 = vsubq_f32(v64, v72); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v109 = vaddq_f32(v100, v108); - float32x4_t v110 = vsubq_f32(v100, v108); - float32x4_t v127 = vaddq_f32(v118, v126); - float32x4_t v128 = vsubq_f32(v118, v126); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v163 = vaddq_f32(v154, v162); - float32x4_t v164 = vsubq_f32(v154, v162); - float32x4_t v181 = vaddq_f32(v172, v180); - float32x4_t v182 = vsubq_f32(v172, v180); - float32x4_t v199 = vaddq_f32(v190, v198); - float32x4_t v200 = vsubq_f32(v190, v198); - float32x4_t v217 = vaddq_f32(v208, v216); - float32x4_t v218 = vsubq_f32(v208, v216); - float32x4_t v219 = vaddq_f32(v55, v217); - float32x4_t v220 = vaddq_f32(v73, v199); - float32x4_t v221 = vaddq_f32(v91, v181); - float32x4_t v222 = vaddq_f32(v109, v163); - float32x4_t v223 = vaddq_f32(v127, v145); - float32x4_t v224 = vsubq_f32(v55, v217); - float32x4_t v225 = vsubq_f32(v73, v199); - float32x4_t v226 = vsubq_f32(v91, v181); - float32x4_t v227 = vsubq_f32(v109, v163); - float32x4_t v228 = 
vsubq_f32(v127, v145); - float32x4_t v438 = vaddq_f32(v56, v218); - float32x4_t v439 = vaddq_f32(v74, v200); - float32x4_t v440 = vaddq_f32(v92, v182); - float32x4_t v441 = vaddq_f32(v110, v164); - float32x4_t v442 = vaddq_f32(v128, v146); - float32x4_t v443 = vsubq_f32(v56, v218); - float32x4_t v444 = vsubq_f32(v74, v200); - float32x4_t v445 = vsubq_f32(v92, v182); - float32x4_t v446 = vsubq_f32(v110, v164); - float32x4_t v447 = vsubq_f32(v128, v146); - float32x4_t v229 = vaddq_f32(v219, v220); - float32x4_t v230 = vaddq_f32(v221, v223); - float32x4_t v232 = vsubq_f32(v225, v226); - float32x4_t v233 = vaddq_f32(v224, v228); - float32x4_t v238 = vsubq_f32(v220, v222); - float32x4_t v239 = vsubq_f32(v219, v222); - float32x4_t v240 = vsubq_f32(v220, v219); - float32x4_t v241 = vsubq_f32(v223, v222); - float32x4_t v242 = vsubq_f32(v221, v222); - float32x4_t v243 = vsubq_f32(v223, v221); - float32x4_t v244 = vsubq_f32(v220, v223); - float32x4_t v245 = vsubq_f32(v219, v221); - float32x4_t v247 = vaddq_f32(v225, v227); - float32x4_t v248 = vsubq_f32(v224, v227); - float32x4_t v249 = vaddq_f32(v224, v225); - float32x4_t v250 = vsubq_f32(v227, v228); - float32x4_t v251 = vsubq_f32(v226, v227); - float32x4_t v252 = vsubq_f32(v226, v228); - float32x4_t v253 = vaddq_f32(v225, v228); - float32x4_t v254 = vsubq_f32(v224, v226); - float32x4_t v448 = vaddq_f32(v438, v439); - float32x4_t v449 = vaddq_f32(v440, v442); - float32x4_t v451 = vsubq_f32(v444, v445); - float32x4_t v452 = vaddq_f32(v443, v447); - float32x4_t v457 = vsubq_f32(v439, v441); - float32x4_t v458 = vsubq_f32(v438, v441); - float32x4_t v459 = vsubq_f32(v439, v438); - float32x4_t v460 = vsubq_f32(v442, v441); - float32x4_t v461 = vsubq_f32(v440, v441); - float32x4_t v462 = vsubq_f32(v442, v440); - float32x4_t v463 = vsubq_f32(v439, v442); - float32x4_t v464 = vsubq_f32(v438, v440); - float32x4_t v466 = vaddq_f32(v444, v446); - float32x4_t v467 = vsubq_f32(v443, v446); - float32x4_t v468 = vaddq_f32(v443, v444); - 
float32x4_t v469 = vsubq_f32(v446, v447); - float32x4_t v470 = vsubq_f32(v445, v446); - float32x4_t v471 = vsubq_f32(v445, v447); - float32x4_t v472 = vaddq_f32(v444, v447); - float32x4_t v473 = vsubq_f32(v443, v445); - float32x4_t v231 = vaddq_f32(v222, v229); - float32x4_t v236 = vsubq_f32(v232, v233); - float32x4_t v246 = vsubq_f32(v230, v229); - float32x4_t v255 = vaddq_f32(v232, v233); - float32x4_t v278 = vmulq_f32(v238, v496); - float32x4_t v283 = vmulq_f32(v239, v501); - float32x4_t v288 = vmulq_f32(v240, v506); - float32x4_t v293 = vmulq_f32(v241, v511); - float32x4_t v298 = vmulq_f32(v242, v516); - float32x4_t v303 = vmulq_f32(v243, v521); - float32x4_t v308 = vmulq_f32(v244, v526); - float32x4_t v313 = vmulq_f32(v245, v531); - float32x4_t v324 = vrev64q_f32(v247); - float32x4_t v332 = vrev64q_f32(v248); - float32x4_t v340 = vrev64q_f32(v249); - float32x4_t v348 = vrev64q_f32(v250); - float32x4_t v356 = vrev64q_f32(v251); - float32x4_t v364 = vrev64q_f32(v252); - float32x4_t v372 = vrev64q_f32(v253); - float32x4_t v380 = vrev64q_f32(v254); - float32x4_t v450 = vaddq_f32(v441, v448); - float32x4_t v455 = vsubq_f32(v451, v452); - float32x4_t v465 = vsubq_f32(v449, v448); - float32x4_t v474 = vaddq_f32(v451, v452); - float32x4_t v497 = vmulq_f32(v457, v496); - float32x4_t v502 = vmulq_f32(v458, v501); - float32x4_t v507 = vmulq_f32(v459, v506); - float32x4_t v512 = vmulq_f32(v460, v511); - float32x4_t v517 = vmulq_f32(v461, v516); - float32x4_t v522 = vmulq_f32(v462, v521); - float32x4_t v527 = vmulq_f32(v463, v526); - float32x4_t v532 = vmulq_f32(v464, v531); - float32x4_t v543 = vrev64q_f32(v466); - float32x4_t v551 = vrev64q_f32(v467); - float32x4_t v559 = vrev64q_f32(v468); - float32x4_t v567 = vrev64q_f32(v469); - float32x4_t v575 = vrev64q_f32(v470); - float32x4_t v583 = vrev64q_f32(v471); - float32x4_t v591 = vrev64q_f32(v472); - float32x4_t v599 = vrev64q_f32(v473); - float32x4_t v234 = vaddq_f32(v231, v230); - float32x4_t v237 = vsubq_f32(v236, 
v227); - float32x4_t v318 = vmulq_f32(v246, v536); - float32x4_t v326 = vmulq_f32(v324, v544); - float32x4_t v334 = vmulq_f32(v332, v552); - float32x4_t v342 = vmulq_f32(v340, v560); - float32x4_t v350 = vmulq_f32(v348, v568); - float32x4_t v358 = vmulq_f32(v356, v576); - float32x4_t v366 = vmulq_f32(v364, v584); - float32x4_t v374 = vmulq_f32(v372, v592); - float32x4_t v382 = vmulq_f32(v380, v600); - float32x4_t v388 = vrev64q_f32(v255); - float32x4_t v392 = vaddq_f32(v278, v283); - float32x4_t v393 = vaddq_f32(v283, v288); - float32x4_t v394 = vsubq_f32(v278, v288); - float32x4_t v395 = vaddq_f32(v293, v298); - float32x4_t v396 = vaddq_f32(v298, v303); - float32x4_t v397 = vsubq_f32(v293, v303); - float32x4_t v453 = vaddq_f32(v450, v449); - float32x4_t v456 = vsubq_f32(v455, v446); - float32x4_t v537 = vmulq_f32(v465, v536); - float32x4_t v545 = vmulq_f32(v543, v544); - float32x4_t v553 = vmulq_f32(v551, v552); - float32x4_t v561 = vmulq_f32(v559, v560); - float32x4_t v569 = vmulq_f32(v567, v568); - float32x4_t v577 = vmulq_f32(v575, v576); - float32x4_t v585 = vmulq_f32(v583, v584); - float32x4_t v593 = vmulq_f32(v591, v592); - float32x4_t v601 = vmulq_f32(v599, v600); - float32x4_t v607 = vrev64q_f32(v474); - float32x4_t v611 = vaddq_f32(v497, v502); - float32x4_t v612 = vaddq_f32(v502, v507); - float32x4_t v613 = vsubq_f32(v497, v507); - float32x4_t v614 = vaddq_f32(v512, v517); - float32x4_t v615 = vaddq_f32(v517, v522); - float32x4_t v616 = vsubq_f32(v512, v522); - float32x4_t v235 = vaddq_f32(v37, v234); - float32x4_t v265 = vmulq_f32(v234, v483); - float32x4_t v271 = vrev64q_f32(v237); - float32x4_t v390 = vmulq_f32(v388, v608); - float32x4_t v398 = vaddq_f32(v313, v318); - float32x4_t v399 = vaddq_f32(v308, v318); - float32x4_t v400 = vaddq_f32(v334, v342); - float32x4_t v401 = vsubq_f32(v326, v342); - float32x4_t v402 = vaddq_f32(v358, v366); - float32x4_t v403 = vsubq_f32(v350, v366); - float32x4_t v454 = vaddq_f32(v38, v453); - float32x4_t v484 = 
vmulq_f32(v453, v483); - float32x4_t v490 = vrev64q_f32(v456); - float32x4_t v609 = vmulq_f32(v607, v608); - float32x4_t v617 = vaddq_f32(v532, v537); - float32x4_t v618 = vaddq_f32(v527, v537); - float32x4_t v619 = vaddq_f32(v553, v561); - float32x4_t v620 = vsubq_f32(v545, v561); - float32x4_t v621 = vaddq_f32(v577, v585); - float32x4_t v622 = vsubq_f32(v569, v585); - float32x4_t v273 = vmulq_f32(v271, v491); - float32x4_t v391 = vsubq_f32(v235, v265); - float32x4_t v404 = vaddq_f32(v382, v390); - float32x4_t v405 = vsubq_f32(v374, v390); - float32x4_t v406 = vaddq_f32(v396, v398); - float32x4_t v424 = vaddq_f32(v400, v401); - float32x4_t v492 = vmulq_f32(v490, v491); - float32x4_t v610 = vsubq_f32(v454, v484); - float32x4_t v623 = vaddq_f32(v601, v609); - float32x4_t v624 = vsubq_f32(v593, v609); - float32x4_t v625 = vaddq_f32(v615, v617); - float32x4_t v643 = vaddq_f32(v619, v620); - int16x4_t v659 = vqmovn_s32(vcvtq_n_s32_f32(v235, 15)); - int16x4_t v667 = vqmovn_s32(vcvtq_n_s32_f32(v454, 15)); - float32x4_t v407 = vaddq_f32(v406, v391); - float32x4_t v408 = vsubq_f32(v391, v393); - float32x4_t v410 = vaddq_f32(v391, v397); - float32x4_t v412 = vsubq_f32(v391, v394); - float32x4_t v414 = vaddq_f32(v391, v392); - float32x4_t v416 = vaddq_f32(v273, v402); - float32x4_t v418 = vsubq_f32(v404, v400); - float32x4_t v420 = vaddq_f32(v273, v405); - float32x4_t v422 = vsubq_f32(v405, v401); - float32x4_t v425 = vaddq_f32(v424, v402); - float32x4_t v626 = vaddq_f32(v625, v610); - float32x4_t v627 = vsubq_f32(v610, v612); - float32x4_t v629 = vaddq_f32(v610, v616); - float32x4_t v631 = vsubq_f32(v610, v613); - float32x4_t v633 = vaddq_f32(v610, v611); - float32x4_t v635 = vaddq_f32(v492, v621); - float32x4_t v637 = vsubq_f32(v623, v619); - float32x4_t v639 = vaddq_f32(v492, v624); - float32x4_t v641 = vsubq_f32(v624, v620); - float32x4_t v644 = vaddq_f32(v643, v621); - vst1_s16((int16_t *)v1733, v659); - vst1_s16((int16_t *)v1742, v667); - float32x4_t v409 = 
vsubq_f32(v408, v398); - float32x4_t v411 = vaddq_f32(v410, v399); - float32x4_t v413 = vsubq_f32(v412, v399); - float32x4_t v415 = vsubq_f32(v414, v395); - float32x4_t v417 = vaddq_f32(v416, v404); - float32x4_t v419 = vsubq_f32(v418, v273); - float32x4_t v421 = vaddq_f32(v420, v403); - float32x4_t v423 = vsubq_f32(v422, v273); - float32x4_t v426 = vaddq_f32(v425, v403); - float32x4_t v628 = vsubq_f32(v627, v617); - float32x4_t v630 = vaddq_f32(v629, v618); - float32x4_t v632 = vsubq_f32(v631, v618); - float32x4_t v634 = vsubq_f32(v633, v614); - float32x4_t v636 = vaddq_f32(v635, v623); - float32x4_t v638 = vsubq_f32(v637, v492); - float32x4_t v640 = vaddq_f32(v639, v622); - float32x4_t v642 = vsubq_f32(v641, v492); - float32x4_t v645 = vaddq_f32(v644, v622); - float32x4_t v427 = vsubq_f32(v426, v273); - float32x4_t v429 = vaddq_f32(v407, v417); - float32x4_t v430 = vaddq_f32(v409, v419); - float32x4_t v431 = vsubq_f32(v411, v421); - float32x4_t v432 = vaddq_f32(v413, v423); - float32x4_t v433 = vsubq_f32(v413, v423); - float32x4_t v434 = vaddq_f32(v411, v421); - float32x4_t v435 = vsubq_f32(v409, v419); - float32x4_t v436 = vsubq_f32(v407, v417); - float32x4_t v646 = vsubq_f32(v645, v492); - float32x4_t v648 = vaddq_f32(v626, v636); - float32x4_t v649 = vaddq_f32(v628, v638); - float32x4_t v650 = vsubq_f32(v630, v640); - float32x4_t v651 = vaddq_f32(v632, v642); - float32x4_t v652 = vsubq_f32(v632, v642); - float32x4_t v653 = vaddq_f32(v630, v640); - float32x4_t v654 = vsubq_f32(v628, v638); - float32x4_t v655 = vsubq_f32(v626, v636); - float32x4_t v428 = vaddq_f32(v415, v427); - float32x4_t v437 = vsubq_f32(v415, v427); - float32x4_t v647 = vaddq_f32(v634, v646); - float32x4_t v656 = vsubq_f32(v634, v646); - int16x4_t v691 = vqmovn_s32(vcvtq_n_s32_f32(v436, 15)); - int16x4_t v699 = vqmovn_s32(vcvtq_n_s32_f32(v655, 15)); - int16x4_t v707 = vqmovn_s32(vcvtq_n_s32_f32(v435, 15)); - int16x4_t v715 = vqmovn_s32(vcvtq_n_s32_f32(v654, 15)); - int16x4_t v723 = 
vqmovn_s32(vcvtq_n_s32_f32(v434, 15)); - int16x4_t v731 = vqmovn_s32(vcvtq_n_s32_f32(v653, 15)); - int16x4_t v739 = vqmovn_s32(vcvtq_n_s32_f32(v433, 15)); - int16x4_t v747 = vqmovn_s32(vcvtq_n_s32_f32(v652, 15)); - int16x4_t v755 = vqmovn_s32(vcvtq_n_s32_f32(v432, 15)); - int16x4_t v763 = vqmovn_s32(vcvtq_n_s32_f32(v651, 15)); - int16x4_t v771 = vqmovn_s32(vcvtq_n_s32_f32(v431, 15)); - int16x4_t v779 = vqmovn_s32(vcvtq_n_s32_f32(v650, 15)); - int16x4_t v787 = vqmovn_s32(vcvtq_n_s32_f32(v430, 15)); - int16x4_t v795 = vqmovn_s32(vcvtq_n_s32_f32(v649, 15)); - int16x4_t v803 = vqmovn_s32(vcvtq_n_s32_f32(v429, 15)); - int16x4_t v811 = vqmovn_s32(vcvtq_n_s32_f32(v648, 15)); - int16x4_t v675 = vqmovn_s32(vcvtq_n_s32_f32(v437, 15)); - int16x4_t v683 = vqmovn_s32(vcvtq_n_s32_f32(v656, 15)); - int16x4_t v819 = vqmovn_s32(vcvtq_n_s32_f32(v428, 15)); - int16x4_t v827 = vqmovn_s32(vcvtq_n_s32_f32(v647, 15)); - vst1_s16((int16_t *)v1769, v691); - vst1_s16((int16_t *)v1778, v699); - vst1_s16((int16_t *)v1787, v707); - vst1_s16((int16_t *)v1796, v715); - vst1_s16((int16_t *)v1805, v723); - vst1_s16((int16_t *)v1814, v731); - vst1_s16((int16_t *)v1823, v739); - vst1_s16((int16_t *)v1832, v747); - vst1_s16((int16_t *)v1841, v755); - vst1_s16((int16_t *)v1850, v763); - vst1_s16((int16_t *)v1859, v771); - vst1_s16((int16_t *)v1868, v779); - vst1_s16((int16_t *)v1877, v787); - vst1_s16((int16_t *)v1886, v795); - vst1_s16((int16_t *)v1895, v803); - vst1_s16((int16_t *)v1904, v811); - vst1_s16((int16_t *)v1751, v675); - vst1_s16((int16_t *)v1760, v683); - vst1_s16((int16_t *)v1913, v819); - vst1_s16((int16_t *)v1922, v827); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v833 * 2; j < howmany; j += 1) { - int16x4_t v935 = vld1s_s16(&v5[istride]); - float v1235 = 1.1000000000000001e+00F; - float v1238 = 3.3166247903554003e-01F; - float v1239 = -3.3166247903554003e-01F; - float v1246 = 5.1541501300188641e-01F; - float v1250 = 9.4125353283118118e-01F; - float v1254 = 
1.4143537075597825e+00F; - float v1258 = 8.5949297361449750e-01F; - float v1262 = 4.2314838273285138e-02F; - float v1266 = 3.8639279888589606e-01F; - float v1270 = 5.1254589567200015e-01F; - float v1274 = 1.0702757469471715e+00F; - float v1278 = 5.5486073394528512e-01F; - float v1281 = 1.2412944743900585e+00F; - float v1282 = -1.2412944743900585e+00F; - float v1288 = 2.0897833842005756e-01F; - float v1289 = -2.0897833842005756e-01F; - float v1295 = 3.7415717312460811e-01F; - float v1296 = -3.7415717312460811e-01F; - float v1302 = 4.9929922194110327e-02F; - float v1303 = -4.9929922194110327e-02F; - float v1309 = 6.5815896284539266e-01F; - float v1310 = -6.5815896284539266e-01F; - float v1316 = 6.3306543373877577e-01F; - float v1317 = -6.3306543373877577e-01F; - float v1323 = 1.0822460581641109e+00F; - float v1324 = -1.0822460581641109e+00F; - float v1330 = 8.1720737907134022e-01F; - float v1331 = -8.1720737907134022e-01F; - float v1337 = 4.2408709531871824e-01F; - float v1338 = -4.2408709531871824e-01F; - float32x2_t v1340 = (float32x2_t){v4, v4}; - int16x4_t v845 = vld1s_s16(&v5[0]); - float32x2_t v936 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v935)), 15); - float32x2_t v1236 = (float32x2_t){v1235, v1235}; - float32x2_t v1240 = (float32x2_t){v1238, v1239}; - float32x2_t v1247 = (float32x2_t){v1246, v1246}; - float32x2_t v1251 = (float32x2_t){v1250, v1250}; - float32x2_t v1255 = (float32x2_t){v1254, v1254}; - float32x2_t v1259 = (float32x2_t){v1258, v1258}; - float32x2_t v1263 = (float32x2_t){v1262, v1262}; - float32x2_t v1267 = (float32x2_t){v1266, v1266}; - float32x2_t v1271 = (float32x2_t){v1270, v1270}; - float32x2_t v1275 = (float32x2_t){v1274, v1274}; - float32x2_t v1279 = (float32x2_t){v1278, v1278}; - float32x2_t v1283 = (float32x2_t){v1281, v1282}; - float32x2_t v1290 = (float32x2_t){v1288, v1289}; - float32x2_t v1297 = (float32x2_t){v1295, v1296}; - float32x2_t v1304 = (float32x2_t){v1302, v1303}; - float32x2_t v1311 = (float32x2_t){v1309, v1310}; - 
float32x2_t v1318 = (float32x2_t){v1316, v1317}; - float32x2_t v1325 = (float32x2_t){v1323, v1324}; - float32x2_t v1332 = (float32x2_t){v1330, v1331}; - float32x2_t v1339 = (float32x2_t){v1337, v1338}; - float32x2_t v846 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v845)), 15); - int16x4_t v851 = vld1s_s16(&v5[istride * 11]); - int16x4_t v859 = vld1s_s16(&v5[istride * 2]); - int16x4_t v865 = vld1s_s16(&v5[istride * 13]); - int16x4_t v873 = vld1s_s16(&v5[istride * 4]); - int16x4_t v879 = vld1s_s16(&v5[istride * 15]); - int16x4_t v887 = vld1s_s16(&v5[istride * 6]); - int16x4_t v893 = vld1s_s16(&v5[istride * 17]); - int16x4_t v901 = vld1s_s16(&v5[istride * 8]); - int16x4_t v907 = vld1s_s16(&v5[istride * 19]); - int16x4_t v915 = vld1s_s16(&v5[istride * 10]); - int16x4_t v921 = vld1s_s16(&v5[istride * 21]); - int16x4_t v929 = vld1s_s16(&v5[istride * 12]); - int16x4_t v943 = vld1s_s16(&v5[istride * 14]); - int16x4_t v949 = vld1s_s16(&v5[istride * 3]); - int16x4_t v957 = vld1s_s16(&v5[istride * 16]); - int16x4_t v963 = vld1s_s16(&v5[istride * 5]); - int16x4_t v971 = vld1s_s16(&v5[istride * 18]); - int16x4_t v977 = vld1s_s16(&v5[istride * 7]); - int16x4_t v985 = vld1s_s16(&v5[istride * 20]); - int16x4_t v991 = vld1s_s16(&v5[istride * 9]); - float32x2_t v1242 = vmul_f32(v1340, v1240); - float32x2_t v1285 = vmul_f32(v1340, v1283); - float32x2_t v1292 = vmul_f32(v1340, v1290); - float32x2_t v1299 = vmul_f32(v1340, v1297); - float32x2_t v1306 = vmul_f32(v1340, v1304); - float32x2_t v1313 = vmul_f32(v1340, v1311); - float32x2_t v1320 = vmul_f32(v1340, v1318); - float32x2_t v1327 = vmul_f32(v1340, v1325); - float32x2_t v1334 = vmul_f32(v1340, v1332); - float32x2_t v1341 = vmul_f32(v1340, v1339); - float32x2_t v852 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v851)), 15); - float32x2_t v860 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v859)), 15); - float32x2_t v866 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v865)), 15); - float32x2_t v874 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v873)), 15); - 
float32x2_t v880 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v879)), 15); - float32x2_t v888 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v887)), 15); - float32x2_t v894 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v893)), 15); - float32x2_t v902 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v901)), 15); - float32x2_t v908 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v907)), 15); - float32x2_t v916 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v915)), 15); - float32x2_t v922 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v921)), 15); - float32x2_t v930 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v929)), 15); - float32x2_t v944 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v943)), 15); - float32x2_t v950 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v949)), 15); - float32x2_t v958 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v957)), 15); - float32x2_t v964 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v963)), 15); - float32x2_t v972 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v971)), 15); - float32x2_t v978 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v977)), 15); - float32x2_t v986 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v985)), 15); - float32x2_t v992 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v991)), 15); - float32x2_t v853 = vadd_f32(v846, v852); - float32x2_t v854 = vsub_f32(v846, v852); - float32x2_t v867 = vadd_f32(v860, v866); - float32x2_t v868 = vsub_f32(v860, v866); - float32x2_t v881 = vadd_f32(v874, v880); - float32x2_t v882 = vsub_f32(v874, v880); - float32x2_t v895 = vadd_f32(v888, v894); - float32x2_t v896 = vsub_f32(v888, v894); - float32x2_t v909 = vadd_f32(v902, v908); - float32x2_t v910 = vsub_f32(v902, v908); - float32x2_t v923 = vadd_f32(v916, v922); - float32x2_t v924 = vsub_f32(v916, v922); - float32x2_t v937 = vadd_f32(v930, v936); - float32x2_t v938 = vsub_f32(v930, v936); - float32x2_t v951 = vadd_f32(v944, v950); - float32x2_t v952 = vsub_f32(v944, v950); - float32x2_t v965 = vadd_f32(v958, v964); - float32x2_t v966 = vsub_f32(v958, v964); - float32x2_t v979 = vadd_f32(v972, v978); - float32x2_t v980 = 
vsub_f32(v972, v978); - float32x2_t v993 = vadd_f32(v986, v992); - float32x2_t v994 = vsub_f32(v986, v992); - float32x2_t v995 = vadd_f32(v867, v993); - float32x2_t v996 = vadd_f32(v881, v979); - float32x2_t v997 = vadd_f32(v895, v965); - float32x2_t v998 = vadd_f32(v909, v951); - float32x2_t v999 = vadd_f32(v923, v937); - float32x2_t v1000 = vsub_f32(v867, v993); - float32x2_t v1001 = vsub_f32(v881, v979); - float32x2_t v1002 = vsub_f32(v895, v965); - float32x2_t v1003 = vsub_f32(v909, v951); - float32x2_t v1004 = vsub_f32(v923, v937); - float32x2_t v1193 = vadd_f32(v868, v994); - float32x2_t v1194 = vadd_f32(v882, v980); - float32x2_t v1195 = vadd_f32(v896, v966); - float32x2_t v1196 = vadd_f32(v910, v952); - float32x2_t v1197 = vadd_f32(v924, v938); - float32x2_t v1198 = vsub_f32(v868, v994); - float32x2_t v1199 = vsub_f32(v882, v980); - float32x2_t v1200 = vsub_f32(v896, v966); - float32x2_t v1201 = vsub_f32(v910, v952); - float32x2_t v1202 = vsub_f32(v924, v938); - float32x2_t v1005 = vadd_f32(v995, v996); - float32x2_t v1006 = vadd_f32(v997, v999); - float32x2_t v1008 = vsub_f32(v1001, v1002); - float32x2_t v1009 = vadd_f32(v1000, v1004); - float32x2_t v1014 = vsub_f32(v996, v998); - float32x2_t v1015 = vsub_f32(v995, v998); - float32x2_t v1016 = vsub_f32(v996, v995); - float32x2_t v1017 = vsub_f32(v999, v998); - float32x2_t v1018 = vsub_f32(v997, v998); - float32x2_t v1019 = vsub_f32(v999, v997); - float32x2_t v1020 = vsub_f32(v996, v999); - float32x2_t v1021 = vsub_f32(v995, v997); - float32x2_t v1023 = vadd_f32(v1001, v1003); - float32x2_t v1024 = vsub_f32(v1000, v1003); - float32x2_t v1025 = vadd_f32(v1000, v1001); - float32x2_t v1026 = vsub_f32(v1003, v1004); - float32x2_t v1027 = vsub_f32(v1002, v1003); - float32x2_t v1028 = vsub_f32(v1002, v1004); - float32x2_t v1029 = vadd_f32(v1001, v1004); - float32x2_t v1030 = vsub_f32(v1000, v1002); - float32x2_t v1203 = vadd_f32(v1193, v1194); - float32x2_t v1204 = vadd_f32(v1195, v1197); - float32x2_t v1206 = 
vsub_f32(v1199, v1200); - float32x2_t v1207 = vadd_f32(v1198, v1202); - float32x2_t v1212 = vsub_f32(v1194, v1196); - float32x2_t v1213 = vsub_f32(v1193, v1196); - float32x2_t v1214 = vsub_f32(v1194, v1193); - float32x2_t v1215 = vsub_f32(v1197, v1196); - float32x2_t v1216 = vsub_f32(v1195, v1196); - float32x2_t v1217 = vsub_f32(v1197, v1195); - float32x2_t v1218 = vsub_f32(v1194, v1197); - float32x2_t v1219 = vsub_f32(v1193, v1195); - float32x2_t v1221 = vadd_f32(v1199, v1201); - float32x2_t v1222 = vsub_f32(v1198, v1201); - float32x2_t v1223 = vadd_f32(v1198, v1199); - float32x2_t v1224 = vsub_f32(v1201, v1202); - float32x2_t v1225 = vsub_f32(v1200, v1201); - float32x2_t v1226 = vsub_f32(v1200, v1202); - float32x2_t v1227 = vadd_f32(v1199, v1202); - float32x2_t v1228 = vsub_f32(v1198, v1200); - float32x2_t v1007 = vadd_f32(v998, v1005); - float32x2_t v1012 = vsub_f32(v1008, v1009); - float32x2_t v1022 = vsub_f32(v1006, v1005); - float32x2_t v1031 = vadd_f32(v1008, v1009); - float32x2_t v1050 = vmul_f32(v1014, v1247); - float32x2_t v1054 = vmul_f32(v1015, v1251); - float32x2_t v1058 = vmul_f32(v1016, v1255); - float32x2_t v1062 = vmul_f32(v1017, v1259); - float32x2_t v1066 = vmul_f32(v1018, v1263); - float32x2_t v1070 = vmul_f32(v1019, v1267); - float32x2_t v1074 = vmul_f32(v1020, v1271); - float32x2_t v1078 = vmul_f32(v1021, v1275); - float32x2_t v1088 = vrev64_f32(v1023); - float32x2_t v1095 = vrev64_f32(v1024); - float32x2_t v1102 = vrev64_f32(v1025); - float32x2_t v1109 = vrev64_f32(v1026); - float32x2_t v1116 = vrev64_f32(v1027); - float32x2_t v1123 = vrev64_f32(v1028); - float32x2_t v1130 = vrev64_f32(v1029); - float32x2_t v1137 = vrev64_f32(v1030); - float32x2_t v1205 = vadd_f32(v1196, v1203); - float32x2_t v1210 = vsub_f32(v1206, v1207); - float32x2_t v1220 = vsub_f32(v1204, v1203); - float32x2_t v1229 = vadd_f32(v1206, v1207); - float32x2_t v1248 = vmul_f32(v1212, v1247); - float32x2_t v1252 = vmul_f32(v1213, v1251); - float32x2_t v1256 = vmul_f32(v1214, 
v1255); - float32x2_t v1260 = vmul_f32(v1215, v1259); - float32x2_t v1264 = vmul_f32(v1216, v1263); - float32x2_t v1268 = vmul_f32(v1217, v1267); - float32x2_t v1272 = vmul_f32(v1218, v1271); - float32x2_t v1276 = vmul_f32(v1219, v1275); - float32x2_t v1286 = vrev64_f32(v1221); - float32x2_t v1293 = vrev64_f32(v1222); - float32x2_t v1300 = vrev64_f32(v1223); - float32x2_t v1307 = vrev64_f32(v1224); - float32x2_t v1314 = vrev64_f32(v1225); - float32x2_t v1321 = vrev64_f32(v1226); - float32x2_t v1328 = vrev64_f32(v1227); - float32x2_t v1335 = vrev64_f32(v1228); - float32x2_t v1010 = vadd_f32(v1007, v1006); - float32x2_t v1013 = vsub_f32(v1012, v1003); - float32x2_t v1082 = vmul_f32(v1022, v1279); - float32x2_t v1089 = vmul_f32(v1088, v1285); - float32x2_t v1096 = vmul_f32(v1095, v1292); - float32x2_t v1103 = vmul_f32(v1102, v1299); - float32x2_t v1110 = vmul_f32(v1109, v1306); - float32x2_t v1117 = vmul_f32(v1116, v1313); - float32x2_t v1124 = vmul_f32(v1123, v1320); - float32x2_t v1131 = vmul_f32(v1130, v1327); - float32x2_t v1138 = vmul_f32(v1137, v1334); - float32x2_t v1144 = vrev64_f32(v1031); - float32x2_t v1147 = vadd_f32(v1050, v1054); - float32x2_t v1148 = vadd_f32(v1054, v1058); - float32x2_t v1149 = vsub_f32(v1050, v1058); - float32x2_t v1150 = vadd_f32(v1062, v1066); - float32x2_t v1151 = vadd_f32(v1066, v1070); - float32x2_t v1152 = vsub_f32(v1062, v1070); - float32x2_t v1208 = vadd_f32(v1205, v1204); - float32x2_t v1211 = vsub_f32(v1210, v1201); - float32x2_t v1280 = vmul_f32(v1220, v1279); - float32x2_t v1287 = vmul_f32(v1286, v1285); - float32x2_t v1294 = vmul_f32(v1293, v1292); - float32x2_t v1301 = vmul_f32(v1300, v1299); - float32x2_t v1308 = vmul_f32(v1307, v1306); - float32x2_t v1315 = vmul_f32(v1314, v1313); - float32x2_t v1322 = vmul_f32(v1321, v1320); - float32x2_t v1329 = vmul_f32(v1328, v1327); - float32x2_t v1336 = vmul_f32(v1335, v1334); - float32x2_t v1342 = vrev64_f32(v1229); - float32x2_t v1345 = vadd_f32(v1248, v1252); - float32x2_t 
v1346 = vadd_f32(v1252, v1256); - float32x2_t v1347 = vsub_f32(v1248, v1256); - float32x2_t v1348 = vadd_f32(v1260, v1264); - float32x2_t v1349 = vadd_f32(v1264, v1268); - float32x2_t v1350 = vsub_f32(v1260, v1268); - float32x2_t v1011 = vadd_f32(v853, v1010); - float32x2_t v1039 = vmul_f32(v1010, v1236); - float32x2_t v1045 = vrev64_f32(v1013); - float32x2_t v1145 = vmul_f32(v1144, v1341); - float32x2_t v1153 = vadd_f32(v1078, v1082); - float32x2_t v1154 = vadd_f32(v1074, v1082); - float32x2_t v1155 = vadd_f32(v1096, v1103); - float32x2_t v1156 = vsub_f32(v1089, v1103); - float32x2_t v1157 = vadd_f32(v1117, v1124); - float32x2_t v1158 = vsub_f32(v1110, v1124); - float32x2_t v1209 = vadd_f32(v854, v1208); - float32x2_t v1237 = vmul_f32(v1208, v1236); - float32x2_t v1243 = vrev64_f32(v1211); - float32x2_t v1343 = vmul_f32(v1342, v1341); - float32x2_t v1351 = vadd_f32(v1276, v1280); - float32x2_t v1352 = vadd_f32(v1272, v1280); - float32x2_t v1353 = vadd_f32(v1294, v1301); - float32x2_t v1354 = vsub_f32(v1287, v1301); - float32x2_t v1355 = vadd_f32(v1315, v1322); - float32x2_t v1356 = vsub_f32(v1308, v1322); - float32x2_t v1046 = vmul_f32(v1045, v1242); - float32x2_t v1146 = vsub_f32(v1011, v1039); - float32x2_t v1159 = vadd_f32(v1138, v1145); - float32x2_t v1160 = vsub_f32(v1131, v1145); - float32x2_t v1161 = vadd_f32(v1151, v1153); - float32x2_t v1179 = vadd_f32(v1155, v1156); - float32x2_t v1244 = vmul_f32(v1243, v1242); - float32x2_t v1344 = vsub_f32(v1209, v1237); - float32x2_t v1357 = vadd_f32(v1336, v1343); - float32x2_t v1358 = vsub_f32(v1329, v1343); - float32x2_t v1359 = vadd_f32(v1349, v1351); - float32x2_t v1377 = vadd_f32(v1353, v1354); - int16x4_t v1393 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1011, 15), (int32x2_t){0, 0})); - int16x4_t v1399 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1209, 15), (int32x2_t){0, 0})); - float32x2_t v1162 = vadd_f32(v1161, v1146); - float32x2_t v1163 = vsub_f32(v1146, v1148); - float32x2_t v1165 = vadd_f32(v1146, 
v1152); - float32x2_t v1167 = vsub_f32(v1146, v1149); - float32x2_t v1169 = vadd_f32(v1146, v1147); - float32x2_t v1171 = vadd_f32(v1046, v1157); - float32x2_t v1173 = vsub_f32(v1159, v1155); - float32x2_t v1175 = vadd_f32(v1046, v1160); - float32x2_t v1177 = vsub_f32(v1160, v1156); - float32x2_t v1180 = vadd_f32(v1179, v1157); - float32x2_t v1360 = vadd_f32(v1359, v1344); - float32x2_t v1361 = vsub_f32(v1344, v1346); - float32x2_t v1363 = vadd_f32(v1344, v1350); - float32x2_t v1365 = vsub_f32(v1344, v1347); - float32x2_t v1367 = vadd_f32(v1344, v1345); - float32x2_t v1369 = vadd_f32(v1244, v1355); - float32x2_t v1371 = vsub_f32(v1357, v1353); - float32x2_t v1373 = vadd_f32(v1244, v1358); - float32x2_t v1375 = vsub_f32(v1358, v1354); - float32x2_t v1378 = vadd_f32(v1377, v1355); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1393), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1399), 0); - float32x2_t v1164 = vsub_f32(v1163, v1153); - float32x2_t v1166 = vadd_f32(v1165, v1154); - float32x2_t v1168 = vsub_f32(v1167, v1154); - float32x2_t v1170 = vsub_f32(v1169, v1150); - float32x2_t v1172 = vadd_f32(v1171, v1159); - float32x2_t v1174 = vsub_f32(v1173, v1046); - float32x2_t v1176 = vadd_f32(v1175, v1158); - float32x2_t v1178 = vsub_f32(v1177, v1046); - float32x2_t v1181 = vadd_f32(v1180, v1158); - float32x2_t v1362 = vsub_f32(v1361, v1351); - float32x2_t v1364 = vadd_f32(v1363, v1352); - float32x2_t v1366 = vsub_f32(v1365, v1352); - float32x2_t v1368 = vsub_f32(v1367, v1348); - float32x2_t v1370 = vadd_f32(v1369, v1357); - float32x2_t v1372 = vsub_f32(v1371, v1244); - float32x2_t v1374 = vadd_f32(v1373, v1356); - float32x2_t v1376 = vsub_f32(v1375, v1244); - float32x2_t v1379 = vadd_f32(v1378, v1356); - float32x2_t v1182 = vsub_f32(v1181, v1046); - float32x2_t v1184 = vadd_f32(v1162, v1172); - float32x2_t v1185 = vadd_f32(v1164, v1174); - float32x2_t v1186 = vsub_f32(v1166, v1176); - float32x2_t v1187 = vadd_f32(v1168, v1178); - float32x2_t v1188 = 
vsub_f32(v1168, v1178); - float32x2_t v1189 = vadd_f32(v1166, v1176); - float32x2_t v1190 = vsub_f32(v1164, v1174); - float32x2_t v1191 = vsub_f32(v1162, v1172); - float32x2_t v1380 = vsub_f32(v1379, v1244); - float32x2_t v1382 = vadd_f32(v1360, v1370); - float32x2_t v1383 = vadd_f32(v1362, v1372); - float32x2_t v1384 = vsub_f32(v1364, v1374); - float32x2_t v1385 = vadd_f32(v1366, v1376); - float32x2_t v1386 = vsub_f32(v1366, v1376); - float32x2_t v1387 = vadd_f32(v1364, v1374); - float32x2_t v1388 = vsub_f32(v1362, v1372); - float32x2_t v1389 = vsub_f32(v1360, v1370); - float32x2_t v1183 = vadd_f32(v1170, v1182); - float32x2_t v1192 = vsub_f32(v1170, v1182); - float32x2_t v1381 = vadd_f32(v1368, v1380); - float32x2_t v1390 = vsub_f32(v1368, v1380); - int16x4_t v1417 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1191, 15), (int32x2_t){0, 0})); - int16x4_t v1423 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1389, 15), (int32x2_t){0, 0})); - int16x4_t v1429 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1190, 15), (int32x2_t){0, 0})); - int16x4_t v1435 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1388, 15), (int32x2_t){0, 0})); - int16x4_t v1441 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1189, 15), (int32x2_t){0, 0})); - int16x4_t v1447 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1387, 15), (int32x2_t){0, 0})); - int16x4_t v1453 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1188, 15), (int32x2_t){0, 0})); - int16x4_t v1459 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1386, 15), (int32x2_t){0, 0})); - int16x4_t v1465 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1187, 15), (int32x2_t){0, 0})); - int16x4_t v1471 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1385, 15), (int32x2_t){0, 0})); - int16x4_t v1477 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1186, 15), (int32x2_t){0, 0})); - int16x4_t v1483 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1384, 15), (int32x2_t){0, 0})); - int16x4_t v1489 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1185, 15), (int32x2_t){0, 0})); - int16x4_t v1495 
= - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1383, 15), (int32x2_t){0, 0})); - int16x4_t v1501 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1184, 15), (int32x2_t){0, 0})); - int16x4_t v1507 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1382, 15), (int32x2_t){0, 0})); - int16x4_t v1405 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1192, 15), (int32x2_t){0, 0})); - int16x4_t v1411 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1390, 15), (int32x2_t){0, 0})); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1417), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1423), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1429), 0); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1435), 0); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1441), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1447), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1453), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1459), 0); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1465), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1471), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1477), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1483), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1489), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1495), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1501), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1507), 0); - int16x4_t v1513 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1183, 15), (int32x2_t){0, 0})); - int16x4_t v1519 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1381, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1405), 0); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1411), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1513), 0); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1519), 0); - v5 += 1 * 
1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu22(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v468 = 1.1000000000000001e+00F; - float v473 = -3.3166247903554003e-01F; - float v480 = 5.1541501300188641e-01F; - float v485 = 9.4125353283118118e-01F; - float v490 = 1.4143537075597825e+00F; - float v495 = 8.5949297361449750e-01F; - float v500 = 4.2314838273285138e-02F; - float v505 = 3.8639279888589606e-01F; - float v510 = 5.1254589567200015e-01F; - float v515 = 1.0702757469471715e+00F; - float v520 = 5.5486073394528512e-01F; - float v525 = -1.2412944743900585e+00F; - float v532 = -2.0897833842005756e-01F; - float v539 = -3.7415717312460811e-01F; - float v546 = -4.9929922194110327e-02F; - float v553 = -6.5815896284539266e-01F; - float v560 = -6.3306543373877577e-01F; - float v567 = -1.0822460581641109e+00F; - float v574 = -8.1720737907134022e-01F; - float v581 = -4.2408709531871824e-01F; - const int32_t *v934 = &v5[v0]; - int32_t *v1085 = &v6[v2]; - int64_t v27 = v0 * 11; - int64_t v37 = v0 * 2; - int64_t v45 = v0 * 13; - int64_t v55 = v0 * 4; - int64_t v63 = v0 * 15; - int64_t v73 = v0 * 6; - int64_t v81 = v0 * 17; - int64_t v91 = v0 * 8; - int64_t v99 = v0 * 19; - int64_t v109 = v0 * 10; - int64_t v117 = v0 * 21; - int64_t v127 = v0 * 12; - int64_t v145 = v0 * 14; - int64_t v153 = v0 * 3; - int64_t v163 = v0 * 16; - int64_t v171 = v0 * 5; - int64_t v181 = v0 * 18; - int64_t v189 = v0 * 7; - int64_t v199 = v0 * 20; - int64_t v207 = v0 * 9; - float v476 = v4 * v473; - float v528 = v4 * v525; - 
float v535 = v4 * v532; - float v542 = v4 * v539; - float v549 = v4 * v546; - float v556 = v4 * v553; - float v563 = v4 * v560; - float v570 = v4 * v567; - float v577 = v4 * v574; - float v584 = v4 * v581; - int64_t v643 = v2 * 11; - int64_t v651 = v2 * 12; - int64_t v667 = v2 * 2; - int64_t v675 = v2 * 13; - int64_t v683 = v2 * 14; - int64_t v691 = v2 * 3; - int64_t v699 = v2 * 4; - int64_t v707 = v2 * 15; - int64_t v715 = v2 * 16; - int64_t v723 = v2 * 5; - int64_t v731 = v2 * 6; - int64_t v739 = v2 * 17; - int64_t v747 = v2 * 18; - int64_t v755 = v2 * 7; - int64_t v763 = v2 * 8; - int64_t v771 = v2 * 19; - int64_t v779 = v2 * 20; - int64_t v787 = v2 * 9; - int64_t v795 = v2 * 10; - int64_t v803 = v2 * 21; - const int32_t *v817 = &v5[0]; - svfloat32_t v1031 = svdup_n_f32(v468); - svfloat32_t v1033 = svdup_n_f32(v480); - svfloat32_t v1034 = svdup_n_f32(v485); - svfloat32_t v1035 = svdup_n_f32(v490); - svfloat32_t v1036 = svdup_n_f32(v495); - svfloat32_t v1037 = svdup_n_f32(v500); - svfloat32_t v1038 = svdup_n_f32(v505); - svfloat32_t v1039 = svdup_n_f32(v510); - svfloat32_t v1040 = svdup_n_f32(v515); - svfloat32_t v1041 = svdup_n_f32(v520); - int32_t *v1058 = &v6[0]; - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v934[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v826 = &v5[v27]; - const int32_t *v835 = &v5[v37]; - const int32_t *v844 = &v5[v45]; - const int32_t *v853 = &v5[v55]; - const int32_t *v862 = &v5[v63]; - const int32_t *v871 = &v5[v73]; - const int32_t *v880 = &v5[v81]; - const int32_t *v889 = &v5[v91]; - const int32_t *v898 = &v5[v99]; - const int32_t *v907 = &v5[v109]; - const int32_t *v916 = &v5[v117]; - const int32_t *v925 = &v5[v127]; - const int32_t *v943 = &v5[v145]; - const int32_t *v952 = &v5[v153]; - const int32_t *v961 = &v5[v163]; - const int32_t *v970 = &v5[v171]; - const int32_t *v979 = &v5[v181]; - const int32_t *v988 = &v5[v189]; - const int32_t *v997 = 
&v5[v199]; - const int32_t *v1006 = &v5[v207]; - svfloat32_t v1032 = svdup_n_f32(v476); - svfloat32_t v1042 = svdup_n_f32(v528); - svfloat32_t v1043 = svdup_n_f32(v535); - svfloat32_t v1044 = svdup_n_f32(v542); - svfloat32_t v1045 = svdup_n_f32(v549); - svfloat32_t v1046 = svdup_n_f32(v556); - svfloat32_t v1047 = svdup_n_f32(v563); - svfloat32_t v1048 = svdup_n_f32(v570); - svfloat32_t v1049 = svdup_n_f32(v577); - svfloat32_t v1050 = svdup_n_f32(v584); - int32_t *v1067 = &v6[v643]; - int32_t *v1076 = &v6[v651]; - int32_t *v1094 = &v6[v667]; - int32_t *v1103 = &v6[v675]; - int32_t *v1112 = &v6[v683]; - int32_t *v1121 = &v6[v691]; - int32_t *v1130 = &v6[v699]; - int32_t *v1139 = &v6[v707]; - int32_t *v1148 = &v6[v715]; - int32_t *v1157 = &v6[v723]; - int32_t *v1166 = &v6[v731]; - int32_t *v1175 = &v6[v739]; - int32_t *v1184 = &v6[v747]; - int32_t *v1193 = &v6[v755]; - int32_t *v1202 = &v6[v763]; - int32_t *v1211 = &v6[v771]; - int32_t *v1220 = &v6[v779]; - int32_t *v1229 = &v6[v787]; - int32_t *v1238 = &v6[v795]; - int32_t *v1247 = &v6[v803]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v817[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v826[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v835[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v844[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v61 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v853[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v69 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v862[0])), - 1.F / (1ULL << 15ULL)); 
- svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v871[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v880[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v889[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v105 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v898[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v115 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v907[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v123 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v916[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v925[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v943[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v159 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v952[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v169 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v961[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v177 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v970[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v187 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v979[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v195 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, 
(const int16_t *)&v988[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v205 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v997[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v213 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1006[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v70; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v70) : "w"(v61), "w"(v69)); - svfloat32_t v71; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v69)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v106; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v106) : "w"(v97), "w"(v105)); - svfloat32_t v107; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v107) : "w"(v97), "w"(v105)); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v115), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v123)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v160; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v160) : "w"(v151), "w"(v159)); - svfloat32_t v161; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v161) : "w"(v151), "w"(v159)); - svfloat32_t v178; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v178) : "w"(v169), "w"(v177)); - svfloat32_t v179; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v177)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v187), 
"w"(v195)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v187), "w"(v195)); - svfloat32_t v214; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v214) : "w"(v205), "w"(v213)); - svfloat32_t v215; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v215) : "w"(v205), "w"(v213)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v52), "w"(v214)); - svfloat32_t v217; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v70), "w"(v196)); - svfloat32_t v218; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v218) : "w"(v88), "w"(v178)); - svfloat32_t v219; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v219) : "w"(v106), "w"(v160)); - svfloat32_t v220; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v220) : "w"(v124), "w"(v142)); - svfloat32_t v221; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v221) : "w"(v52), "w"(v214)); - svfloat32_t v222; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v222) : "w"(v70), "w"(v196)); - svfloat32_t v223; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v88), "w"(v178)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v106), "w"(v160)); - svfloat32_t v225; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v225) : "w"(v124), "w"(v142)); - svfloat32_t v425; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v425) : "w"(v53), "w"(v215)); - svfloat32_t v426; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v426) : "w"(v71), "w"(v197)); - svfloat32_t v427; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v427) : "w"(v89), "w"(v179)); - svfloat32_t v428; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v428) : "w"(v107), "w"(v161)); - svfloat32_t v429; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v429) : "w"(v125), "w"(v143)); - svfloat32_t v430; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v430) : "w"(v53), "w"(v215)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v71), "w"(v197)); - svfloat32_t v432; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v432) : "w"(v89), "w"(v179)); - svfloat32_t v433; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v433) : "w"(v107), "w"(v161)); - svfloat32_t v434; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v434) 
: "w"(v125), "w"(v143)); - svfloat32_t v226; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v226) : "w"(v216), "w"(v217)); - svfloat32_t v227; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v227) : "w"(v218), "w"(v220)); - svfloat32_t v229; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v229) : "w"(v222), "w"(v223)); - svfloat32_t v230; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v230) : "w"(v221), "w"(v225)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v217), "w"(v219)); - svfloat32_t v236; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v216), "w"(v219)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v217), "w"(v216)); - svfloat32_t v238; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v220), "w"(v219)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v218), "w"(v219)); - svfloat32_t v240; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v220), "w"(v218)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v217), "w"(v220)); - svfloat32_t v242; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v216), "w"(v218)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v222), "w"(v224)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v221), "w"(v224)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v221), "w"(v222)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v224), "w"(v225)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v223), "w"(v224)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v223), "w"(v225)); - svfloat32_t v250; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v250) : "w"(v222), "w"(v225)); - svfloat32_t v251; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v251) : "w"(v221), "w"(v223)); - svfloat32_t v435; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v435) : "w"(v425), "w"(v426)); - svfloat32_t v436; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v427), "w"(v429)); - svfloat32_t v438; - asm("fsub %0.s, 
%1.s, %2.s" : "=w"(v438) : "w"(v431), "w"(v432)); - svfloat32_t v439; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v439) : "w"(v430), "w"(v434)); - svfloat32_t v444; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v444) : "w"(v426), "w"(v428)); - svfloat32_t v445; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v445) : "w"(v425), "w"(v428)); - svfloat32_t v446; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v426), "w"(v425)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v447) : "w"(v429), "w"(v428)); - svfloat32_t v448; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v448) : "w"(v427), "w"(v428)); - svfloat32_t v449; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v449) : "w"(v429), "w"(v427)); - svfloat32_t v450; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v426), "w"(v429)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v425), "w"(v427)); - svfloat32_t v453; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v431), "w"(v433)); - svfloat32_t v454; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v454) : "w"(v430), "w"(v433)); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v430), "w"(v431)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v433), "w"(v434)); - svfloat32_t v457; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v457) : "w"(v432), "w"(v433)); - svfloat32_t v458; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v432), "w"(v434)); - svfloat32_t v459; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v431), "w"(v434)); - svfloat32_t v460; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v430), "w"(v432)); - svfloat32_t v228; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v228) : "w"(v219), "w"(v226)); - svfloat32_t v233; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v229), "w"(v230)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v227), "w"(v226)); - svfloat32_t v252; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v252) : "w"(v229), "w"(v230)); - svfloat32_t v279; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v279) : "w"(v236), "w"(v1034)); - 
svfloat32_t v284; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v237), "w"(v1035)); - svfloat32_t v294; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v294) : "w"(v239), "w"(v1037)); - svfloat32_t v299; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v299) : "w"(v240), "w"(v1038)); - svfloat32_t zero321; - asm volatile("mov %0.s, #0" : "=w"(zero321)); - svfloat32_t v321 = svcmla_f32_x(pred_full, zero321, v1042, v244, 90); - svfloat32_t zero335; - asm volatile("mov %0.s, #0" : "=w"(zero335)); - svfloat32_t v335 = svcmla_f32_x(pred_full, zero335, v1044, v246, 90); - svfloat32_t zero342; - asm volatile("mov %0.s, #0" : "=w"(zero342)); - svfloat32_t v342 = svcmla_f32_x(pred_full, zero342, v1045, v247, 90); - svfloat32_t zero356; - asm volatile("mov %0.s, #0" : "=w"(zero356)); - svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v1047, v249, 90); - svfloat32_t zero363; - asm volatile("mov %0.s, #0" : "=w"(zero363)); - svfloat32_t v363 = svcmla_f32_x(pred_full, zero363, v1048, v250, 90); - svfloat32_t v437; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v437) : "w"(v428), "w"(v435)); - svfloat32_t v442; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v442) : "w"(v438), "w"(v439)); - svfloat32_t v452; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v436), "w"(v435)); - svfloat32_t v461; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v438), "w"(v439)); - svfloat32_t v488; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v488) : "w"(v445), "w"(v1034)); - svfloat32_t v493; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v493) : "w"(v446), "w"(v1035)); - svfloat32_t v503; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v503) : "w"(v448), "w"(v1037)); - svfloat32_t v508; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v449), "w"(v1038)); - svfloat32_t zero530; - asm volatile("mov %0.s, #0" : "=w"(zero530)); - svfloat32_t v530 = svcmla_f32_x(pred_full, zero530, v1042, v453, 90); - svfloat32_t zero544; - asm volatile("mov %0.s, #0" : "=w"(zero544)); - svfloat32_t v544 = svcmla_f32_x(pred_full, zero544, v1044, v455, 90); - svfloat32_t zero551; - 
asm volatile("mov %0.s, #0" : "=w"(zero551)); - svfloat32_t v551 = svcmla_f32_x(pred_full, zero551, v1045, v456, 90); - svfloat32_t zero565; - asm volatile("mov %0.s, #0" : "=w"(zero565)); - svfloat32_t v565 = svcmla_f32_x(pred_full, zero565, v1047, v458, 90); - svfloat32_t zero572; - asm volatile("mov %0.s, #0" : "=w"(zero572)); - svfloat32_t v572 = svcmla_f32_x(pred_full, zero572, v1048, v459, 90); - svfloat32_t v231; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v231) : "w"(v228), "w"(v227)); - svfloat32_t v234; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v233), "w"(v224)); - svfloat32_t v314; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v243), "w"(v1041)); - svfloat32_t zero377; - asm volatile("mov %0.s, #0" : "=w"(zero377)); - svfloat32_t v377 = svcmla_f32_x(pred_full, zero377, v1050, v252, 90); - svfloat32_t v379 = svmla_f32_x(pred_full, v279, v235, v1033); - svfloat32_t v380 = svmla_f32_x(pred_full, v284, v236, v1034); - svfloat32_t v381 = svnmls_f32_x(pred_full, v284, v235, v1033); - svfloat32_t v382 = svmla_f32_x(pred_full, v294, v238, v1036); - svfloat32_t v383 = svmla_f32_x(pred_full, v299, v239, v1037); - svfloat32_t v384 = svnmls_f32_x(pred_full, v299, v238, v1036); - svfloat32_t v387 = svcmla_f32_x(pred_full, v335, v1043, v245, 90); - svfloat32_t v388; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v321), "w"(v335)); - svfloat32_t v389 = svcmla_f32_x(pred_full, v356, v1046, v248, 90); - svfloat32_t v390; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v342), "w"(v356)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v437), "w"(v436)); - svfloat32_t v443; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v443) : "w"(v442), "w"(v433)); - svfloat32_t v523; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v523) : "w"(v452), "w"(v1041)); - svfloat32_t zero586; - asm volatile("mov %0.s, #0" : "=w"(zero586)); - svfloat32_t v586 = svcmla_f32_x(pred_full, zero586, v1050, v461, 90); - svfloat32_t v588 = svmla_f32_x(pred_full, v488, v444, v1033); - svfloat32_t 
v589 = svmla_f32_x(pred_full, v493, v445, v1034); - svfloat32_t v590 = svnmls_f32_x(pred_full, v493, v444, v1033); - svfloat32_t v591 = svmla_f32_x(pred_full, v503, v447, v1036); - svfloat32_t v592 = svmla_f32_x(pred_full, v508, v448, v1037); - svfloat32_t v593 = svnmls_f32_x(pred_full, v508, v447, v1036); - svfloat32_t v596 = svcmla_f32_x(pred_full, v544, v1043, v454, 90); - svfloat32_t v597; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v597) : "w"(v530), "w"(v544)); - svfloat32_t v598 = svcmla_f32_x(pred_full, v565, v1046, v457, 90); - svfloat32_t v599; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v599) : "w"(v551), "w"(v565)); - svfloat32_t v232; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v232) : "w"(v34), "w"(v231)); - svfloat32_t zero269; - asm volatile("mov %0.s, #0" : "=w"(zero269)); - svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v1032, v234, 90); - svfloat32_t v385 = svmla_f32_x(pred_full, v314, v242, v1040); - svfloat32_t v386 = svmla_f32_x(pred_full, v314, v241, v1039); - svfloat32_t v391 = svcmla_f32_x(pred_full, v377, v1049, v251, 90); - svfloat32_t v392; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v363), "w"(v377)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v387), "w"(v388)); - svfloat32_t v441; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v35), "w"(v440)); - svfloat32_t zero478; - asm volatile("mov %0.s, #0" : "=w"(zero478)); - svfloat32_t v478 = svcmla_f32_x(pred_full, zero478, v1032, v443, 90); - svfloat32_t v594 = svmla_f32_x(pred_full, v523, v451, v1040); - svfloat32_t v595 = svmla_f32_x(pred_full, v523, v450, v1039); - svfloat32_t v600 = svcmla_f32_x(pred_full, v586, v1049, v460, 90); - svfloat32_t v601; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v601) : "w"(v572), "w"(v586)); - svfloat32_t v620; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v620) : "w"(v596), "w"(v597)); - svfloat32_t v378 = svmls_f32_x(pred_full, v232, v231, v1031); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v383), "w"(v385)); - svfloat32_t v403; 
- asm("fadd %0.s, %1.s, %2.s" : "=w"(v403) : "w"(v269), "w"(v389)); - svfloat32_t v405; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v405) : "w"(v391), "w"(v387)); - svfloat32_t v407; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v407) : "w"(v269), "w"(v392)); - svfloat32_t v409; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v409) : "w"(v392), "w"(v388)); - svfloat32_t v412; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v411), "w"(v389)); - svfloat32_t v587 = svmls_f32_x(pred_full, v441, v440, v1031); - svfloat32_t v602; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v602) : "w"(v592), "w"(v594)); - svfloat32_t v612; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v478), "w"(v598)); - svfloat32_t v614; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v614) : "w"(v600), "w"(v596)); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v478), "w"(v601)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v601), "w"(v597)); - svfloat32_t v621; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v621) : "w"(v620), "w"(v598)); - svint16_t v636 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v232, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v644 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v441, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v394; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v393), "w"(v378)); - svfloat32_t v395; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v395) : "w"(v378), "w"(v380)); - svfloat32_t v397; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v397) : "w"(v378), "w"(v384)); - svfloat32_t v399; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v399) : "w"(v378), "w"(v381)); - svfloat32_t v401; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v401) : "w"(v378), "w"(v379)); - svfloat32_t v404; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v404) : "w"(v403), "w"(v391)); 
- svfloat32_t v406; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v406) : "w"(v405), "w"(v269)); - svfloat32_t v408; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v408) : "w"(v407), "w"(v390)); - svfloat32_t v410; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v410) : "w"(v409), "w"(v269)); - svfloat32_t v413; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v413) : "w"(v412), "w"(v390)); - svfloat32_t v603; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v603) : "w"(v602), "w"(v587)); - svfloat32_t v604; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v587), "w"(v589)); - svfloat32_t v606; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v587), "w"(v593)); - svfloat32_t v608; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v608) : "w"(v587), "w"(v590)); - svfloat32_t v610; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v610) : "w"(v587), "w"(v588)); - svfloat32_t v613; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v613) : "w"(v612), "w"(v600)); - svfloat32_t v615; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v615) : "w"(v614), "w"(v478)); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v616), "w"(v599)); - svfloat32_t v619; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v619) : "w"(v618), "w"(v478)); - svfloat32_t v622; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v622) : "w"(v621), "w"(v599)); - svst1w_u64(pred_full, (unsigned *)(v1058), svreinterpret_u64_s16(v636)); - svst1w_u64(pred_full, (unsigned *)(v1067), svreinterpret_u64_s16(v644)); - svfloat32_t v396; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v396) : "w"(v395), "w"(v385)); - svfloat32_t v398; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v398) : "w"(v397), "w"(v386)); - svfloat32_t v400; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v400) : "w"(v399), "w"(v386)); - svfloat32_t v402; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v402) : "w"(v401), "w"(v382)); - svfloat32_t v414; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v414) : "w"(v413), "w"(v269)); - svfloat32_t v416; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v416) : "w"(v394), "w"(v404)); - svfloat32_t v423; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v394), "w"(v404)); - 
svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v604), "w"(v594)); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v606), "w"(v595)); - svfloat32_t v609; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v609) : "w"(v608), "w"(v595)); - svfloat32_t v611; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v611) : "w"(v610), "w"(v591)); - svfloat32_t v623; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v623) : "w"(v622), "w"(v478)); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v603), "w"(v613)); - svfloat32_t v632; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v632) : "w"(v603), "w"(v613)); - svfloat32_t v415; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v415) : "w"(v402), "w"(v414)); - svfloat32_t v417; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v417) : "w"(v396), "w"(v406)); - svfloat32_t v418; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v418) : "w"(v398), "w"(v408)); - svfloat32_t v419; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v419) : "w"(v400), "w"(v410)); - svfloat32_t v420; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v400), "w"(v410)); - svfloat32_t v421; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v398), "w"(v408)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v396), "w"(v406)); - svfloat32_t v424; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v424) : "w"(v402), "w"(v414)); - svfloat32_t v624; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v611), "w"(v623)); - svfloat32_t v626; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v605), "w"(v615)); - svfloat32_t v627; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v627) : "w"(v607), "w"(v617)); - svfloat32_t v628; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v628) : "w"(v609), "w"(v619)); - svfloat32_t v629; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v629) : "w"(v609), "w"(v619)); - svfloat32_t v630; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v630) : "w"(v607), "w"(v617)); - svfloat32_t v631; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v631) : "w"(v605), "w"(v615)); - svfloat32_t v633; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v633) : 
"w"(v611), "w"(v623)); - svint16_t v668 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v423, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v676 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v632, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v780 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v416, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v788 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v625, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v652 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v424, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v660 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v633, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v684 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v422, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v692 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v631, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v700 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v421, (float)(1ULL << 
31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v708 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v630, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v716 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v420, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v724 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v629, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v732 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v419, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v740 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v628, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v748 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v418, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v756 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v627, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v764 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v417, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v772 = svtbl_s16( - 
svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v626, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v796 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v415, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v804 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v1094), svreinterpret_u64_s16(v668)); - svst1w_u64(pred_full, (unsigned *)(v1103), svreinterpret_u64_s16(v676)); - svst1w_u64(pred_full, (unsigned *)(v1220), svreinterpret_u64_s16(v780)); - svst1w_u64(pred_full, (unsigned *)(v1229), svreinterpret_u64_s16(v788)); - svst1w_u64(pred_full, (unsigned *)(v1076), svreinterpret_u64_s16(v652)); - svst1w_u64(pred_full, (unsigned *)(v1085), svreinterpret_u64_s16(v660)); - svst1w_u64(pred_full, (unsigned *)(v1112), svreinterpret_u64_s16(v684)); - svst1w_u64(pred_full, (unsigned *)(v1121), svreinterpret_u64_s16(v692)); - svst1w_u64(pred_full, (unsigned *)(v1130), svreinterpret_u64_s16(v700)); - svst1w_u64(pred_full, (unsigned *)(v1139), svreinterpret_u64_s16(v708)); - svst1w_u64(pred_full, (unsigned *)(v1148), svreinterpret_u64_s16(v716)); - svst1w_u64(pred_full, (unsigned *)(v1157), svreinterpret_u64_s16(v724)); - svst1w_u64(pred_full, (unsigned *)(v1166), svreinterpret_u64_s16(v732)); - svst1w_u64(pred_full, (unsigned *)(v1175), svreinterpret_u64_s16(v740)); - svst1w_u64(pred_full, (unsigned *)(v1184), svreinterpret_u64_s16(v748)); - svst1w_u64(pred_full, (unsigned *)(v1193), svreinterpret_u64_s16(v756)); - svst1w_u64(pred_full, (unsigned *)(v1202), svreinterpret_u64_s16(v764)); - svst1w_u64(pred_full, (unsigned *)(v1211), 
svreinterpret_u64_s16(v772)); - svst1w_u64(pred_full, (unsigned *)(v1238), svreinterpret_u64_s16(v796)); - svst1w_u64(pred_full, (unsigned *)(v1247), svreinterpret_u64_s16(v804)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v684 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v281 = 1.0000000000000000e+00F; - float v282 = -1.0000000000000000e+00F; - float v290 = -7.0710678118654746e-01F; - float v298 = 7.0710678118654757e-01F; - float v356 = -1.4999999999999998e+00F; - float v357 = 1.4999999999999998e+00F; - float v365 = 1.0606601717798210e+00F; - float v373 = -1.0606601717798212e+00F; - float v432 = 8.6602540378443871e-01F; - float v441 = -8.6602540378443871e-01F; - float v450 = 6.1237243569579458e-01F; - float v451 = -6.1237243569579458e-01F; - float32x2_t v453 = (float32x2_t){v4, v4}; - const int32_t *v1336 = &v5[istride]; - int32_t *v1499 = &v6[ostride]; - float32x2_t v283 = (float32x2_t){v281, v282}; - float32x2_t v291 = (float32x2_t){v298, v290}; - float32x2_t v299 = (float32x2_t){v298, v298}; - float32x2_t v353 = (float32x2_t){v356, v356}; - float32x2_t v358 = (float32x2_t){v356, v357}; - float32x2_t v366 = (float32x2_t){v373, v365}; - float32x2_t v374 = (float32x2_t){v373, v373}; - float32x2_t v434 = (float32x2_t){v432, v441}; - float32x2_t v442 = (float32x2_t){v441, v441}; - float32x2_t v447 = (float32x2_t){v451, v451}; - float32x2_t v452 = (float32x2_t){v450, v451}; - const int32_t *v1264 = &v5[0]; - int32_t *v1463 = &v6[0]; - int16x4_t v1694 = vld1_s16((const int16_t *)v1336); - float32x4_t v117 = vcvtq_n_f32_s32(vmovl_s16(v1694), 15); - float32x2_t v285 = vmul_f32(v453, v283); - float32x2_t v293 = vmul_f32(v453, v291); - 
float32x4_t v300 = vcombine_f32(v299, v299); - float32x4_t v354 = vcombine_f32(v353, v353); - float32x2_t v360 = vmul_f32(v453, v358); - float32x2_t v368 = vmul_f32(v453, v366); - float32x4_t v375 = vcombine_f32(v374, v374); - float32x2_t v436 = vmul_f32(v453, v434); - float32x4_t v443 = vcombine_f32(v442, v442); - float32x4_t v448 = vcombine_f32(v447, v447); - float32x2_t v454 = vmul_f32(v453, v452); - const int32_t *v1245 = &v5[istride * 8]; - const int32_t *v1254 = &v5[istride * 16]; - const int32_t *v1273 = &v5[istride * 11]; - const int32_t *v1282 = &v5[istride * 19]; - const int32_t *v1291 = &v5[istride * 3]; - const int32_t *v1300 = &v5[istride * 14]; - const int32_t *v1309 = &v5[istride * 22]; - const int32_t *v1318 = &v5[istride * 6]; - const int32_t *v1327 = &v5[istride * 17]; - const int32_t *v1345 = &v5[istride * 9]; - const int32_t *v1354 = &v5[istride * 20]; - const int32_t *v1363 = &v5[istride * 4]; - const int32_t *v1372 = &v5[istride * 12]; - const int32_t *v1381 = &v5[istride * 23]; - const int32_t *v1390 = &v5[istride * 7]; - const int32_t *v1399 = &v5[istride * 15]; - const int32_t *v1408 = &v5[istride * 2]; - const int32_t *v1417 = &v5[istride * 10]; - const int32_t *v1426 = &v5[istride * 18]; - const int32_t *v1435 = &v5[istride * 5]; - const int32_t *v1444 = &v5[istride * 13]; - const int32_t *v1453 = &v5[istride * 21]; - int32_t *v1472 = &v6[ostride * 16]; - int32_t *v1481 = &v6[ostride * 8]; - int32_t *v1490 = &v6[ostride * 9]; - int32_t *v1508 = &v6[ostride * 17]; - int32_t *v1517 = &v6[ostride * 18]; - int32_t *v1526 = &v6[ostride * 10]; - int32_t *v1535 = &v6[ostride * 2]; - int32_t *v1544 = &v6[ostride * 3]; - int32_t *v1553 = &v6[ostride * 19]; - int32_t *v1562 = &v6[ostride * 11]; - int32_t *v1571 = &v6[ostride * 12]; - int32_t *v1580 = &v6[ostride * 4]; - int32_t *v1589 = &v6[ostride * 20]; - int32_t *v1598 = &v6[ostride * 21]; - int32_t *v1607 = &v6[ostride * 13]; - int32_t *v1616 = &v6[ostride * 5]; - int32_t *v1625 = &v6[ostride * 
6]; - int32_t *v1634 = &v6[ostride * 22]; - int32_t *v1643 = &v6[ostride * 14]; - int32_t *v1652 = &v6[ostride * 15]; - int32_t *v1661 = &v6[ostride * 7]; - int32_t *v1670 = &v6[ostride * 23]; - int16x4_t v1678 = vld1_s16((const int16_t *)v1264); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v1678), 15); - float32x4_t v287 = vcombine_f32(v285, v285); - float32x4_t v295 = vcombine_f32(v293, v293); - float32x4_t v362 = vcombine_f32(v360, v360); - float32x4_t v370 = vcombine_f32(v368, v368); - float32x4_t v438 = vcombine_f32(v436, v436); - float32x4_t v456 = vcombine_f32(v454, v454); - int16x4_t v1674 = vld1_s16((const int16_t *)v1245); - int16x4_t v1676 = vld1_s16((const int16_t *)v1254); - int16x4_t v1680 = vld1_s16((const int16_t *)v1273); - int16x4_t v1682 = vld1_s16((const int16_t *)v1282); - int16x4_t v1684 = vld1_s16((const int16_t *)v1291); - int16x4_t v1686 = vld1_s16((const int16_t *)v1300); - int16x4_t v1688 = vld1_s16((const int16_t *)v1309); - int16x4_t v1690 = vld1_s16((const int16_t *)v1318); - int16x4_t v1692 = vld1_s16((const int16_t *)v1327); - int16x4_t v1696 = vld1_s16((const int16_t *)v1345); - int16x4_t v1698 = vld1_s16((const int16_t *)v1354); - int16x4_t v1700 = vld1_s16((const int16_t *)v1363); - int16x4_t v1702 = vld1_s16((const int16_t *)v1372); - int16x4_t v1704 = vld1_s16((const int16_t *)v1381); - int16x4_t v1706 = vld1_s16((const int16_t *)v1390); - int16x4_t v1708 = vld1_s16((const int16_t *)v1399); - int16x4_t v1710 = vld1_s16((const int16_t *)v1408); - int16x4_t v1712 = vld1_s16((const int16_t *)v1417); - int16x4_t v1714 = vld1_s16((const int16_t *)v1426); - int16x4_t v1716 = vld1_s16((const int16_t *)v1435); - int16x4_t v1718 = vld1_s16((const int16_t *)v1444); - int16x4_t v1720 = vld1_s16((const int16_t *)v1453); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v1674), 15); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v1676), 15); - float32x4_t v55 = vcvtq_n_f32_s32(vmovl_s16(v1680), 15); - float32x4_t v63 = 
vcvtq_n_f32_s32(vmovl_s16(v1682), 15); - float32x4_t v73 = vcvtq_n_f32_s32(vmovl_s16(v1684), 15); - float32x4_t v82 = vcvtq_n_f32_s32(vmovl_s16(v1686), 15); - float32x4_t v90 = vcvtq_n_f32_s32(vmovl_s16(v1688), 15); - float32x4_t v100 = vcvtq_n_f32_s32(vmovl_s16(v1690), 15); - float32x4_t v109 = vcvtq_n_f32_s32(vmovl_s16(v1692), 15); - float32x4_t v127 = vcvtq_n_f32_s32(vmovl_s16(v1696), 15); - float32x4_t v136 = vcvtq_n_f32_s32(vmovl_s16(v1698), 15); - float32x4_t v144 = vcvtq_n_f32_s32(vmovl_s16(v1700), 15); - float32x4_t v154 = vcvtq_n_f32_s32(vmovl_s16(v1702), 15); - float32x4_t v163 = vcvtq_n_f32_s32(vmovl_s16(v1704), 15); - float32x4_t v171 = vcvtq_n_f32_s32(vmovl_s16(v1706), 15); - float32x4_t v181 = vcvtq_n_f32_s32(vmovl_s16(v1708), 15); - float32x4_t v190 = vcvtq_n_f32_s32(vmovl_s16(v1710), 15); - float32x4_t v198 = vcvtq_n_f32_s32(vmovl_s16(v1712), 15); - float32x4_t v208 = vcvtq_n_f32_s32(vmovl_s16(v1714), 15); - float32x4_t v217 = vcvtq_n_f32_s32(vmovl_s16(v1716), 15); - float32x4_t v225 = vcvtq_n_f32_s32(vmovl_s16(v1718), 15); - float32x4_t v235 = vcvtq_n_f32_s32(vmovl_s16(v1720), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v64 = vaddq_f32(v55, v63); - float32x4_t v65 = vsubq_f32(v55, v63); - float32x4_t v91 = vaddq_f32(v82, v90); - float32x4_t v92 = vsubq_f32(v82, v90); - float32x4_t v118 = vaddq_f32(v109, v117); - float32x4_t v119 = vsubq_f32(v109, v117); - float32x4_t v145 = vaddq_f32(v136, v144); - float32x4_t v146 = vsubq_f32(v136, v144); - float32x4_t v172 = vaddq_f32(v163, v171); - float32x4_t v173 = vsubq_f32(v163, v171); - float32x4_t v199 = vaddq_f32(v190, v198); - float32x4_t v200 = vsubq_f32(v190, v198); - float32x4_t v226 = vaddq_f32(v217, v225); - float32x4_t v227 = vsubq_f32(v217, v225); - float32x4_t v47 = vaddq_f32(v37, v46); - float32x4_t v74 = vaddq_f32(v64, v73); - float32x4_t v101 = vaddq_f32(v91, v100); - float32x4_t v128 = vaddq_f32(v118, v127); - float32x4_t v155 = 
vaddq_f32(v145, v154); - float32x4_t v182 = vaddq_f32(v172, v181); - float32x4_t v209 = vaddq_f32(v199, v208); - float32x4_t v236 = vaddq_f32(v226, v235); - float32x4_t v312 = vaddq_f32(v37, v145); - float32x4_t v313 = vsubq_f32(v37, v145); - float32x4_t v314 = vaddq_f32(v91, v199); - float32x4_t v315 = vsubq_f32(v91, v199); - float32x4_t v316 = vaddq_f32(v64, v172); - float32x4_t v317 = vsubq_f32(v64, v172); - float32x4_t v318 = vaddq_f32(v118, v226); - float32x4_t v319 = vsubq_f32(v118, v226); - float32x4_t v387 = vaddq_f32(v38, v146); - float32x4_t v388 = vsubq_f32(v38, v146); - float32x4_t v389 = vaddq_f32(v92, v200); - float32x4_t v390 = vsubq_f32(v92, v200); - float32x4_t v391 = vaddq_f32(v65, v173); - float32x4_t v392 = vsubq_f32(v65, v173); - float32x4_t v393 = vaddq_f32(v119, v227); - float32x4_t v394 = vsubq_f32(v119, v227); - float32x4_t v237 = vaddq_f32(v47, v155); - float32x4_t v238 = vsubq_f32(v47, v155); - float32x4_t v239 = vaddq_f32(v101, v209); - float32x4_t v240 = vsubq_f32(v101, v209); - float32x4_t v241 = vaddq_f32(v74, v182); - float32x4_t v242 = vsubq_f32(v74, v182); - float32x4_t v243 = vaddq_f32(v128, v236); - float32x4_t v244 = vsubq_f32(v128, v236); - float32x4_t v320 = vaddq_f32(v312, v314); - float32x4_t v321 = vsubq_f32(v312, v314); - float32x4_t v322 = vaddq_f32(v316, v318); - float32x4_t v323 = vsubq_f32(v316, v318); - float32x4_t v326 = vaddq_f32(v317, v319); - float32x4_t v327 = vsubq_f32(v317, v319); - float32x4_t v355 = vmulq_f32(v313, v354); - float32x4_t v361 = vrev64q_f32(v315); - float32x4_t v395 = vaddq_f32(v387, v389); - float32x4_t v396 = vsubq_f32(v387, v389); - float32x4_t v397 = vaddq_f32(v391, v393); - float32x4_t v398 = vsubq_f32(v391, v393); - float32x4_t v401 = vaddq_f32(v392, v394); - float32x4_t v402 = vsubq_f32(v392, v394); - float32x4_t v437 = vrev64q_f32(v388); - float32x4_t v444 = vmulq_f32(v390, v443); - float32x4_t v245 = vaddq_f32(v237, v239); - float32x4_t v246 = vsubq_f32(v237, v239); - float32x4_t v247 = 
vaddq_f32(v241, v243); - float32x4_t v248 = vsubq_f32(v241, v243); - float32x4_t v251 = vaddq_f32(v242, v244); - float32x4_t v252 = vsubq_f32(v242, v244); - float32x4_t v286 = vrev64q_f32(v240); - float32x4_t v324 = vaddq_f32(v320, v322); - float32x4_t v325 = vsubq_f32(v320, v322); - float32x4_t v342 = vmulq_f32(v321, v354); - float32x4_t v348 = vrev64q_f32(v323); - float32x4_t v363 = vmulq_f32(v361, v362); - float32x4_t v369 = vrev64q_f32(v326); - float32x4_t v376 = vmulq_f32(v327, v375); - float32x4_t v399 = vaddq_f32(v395, v397); - float32x4_t v400 = vsubq_f32(v395, v397); - float32x4_t v424 = vrev64q_f32(v396); - float32x4_t v431 = vmulq_f32(v398, v443); - float32x4_t v439 = vmulq_f32(v437, v438); - float32x4_t v449 = vmulq_f32(v401, v448); - float32x4_t v455 = vrev64q_f32(v402); - float32x4_t v249 = vaddq_f32(v245, v247); - float32x4_t v250 = vsubq_f32(v245, v247); - float32x4_t v273 = vrev64q_f32(v248); - float32x4_t v288 = vmulq_f32(v286, v287); - float32x4_t v294 = vrev64q_f32(v251); - float32x4_t v301 = vmulq_f32(v252, v300); - float32x4_t v332 = vmulq_f32(v324, v354); - float32x4_t v337 = vmulq_f32(v325, v354); - float32x4_t v350 = vmulq_f32(v348, v362); - float32x4_t v371 = vmulq_f32(v369, v370); - float32x4_t v379 = vaddq_f32(v355, v376); - float32x4_t v380 = vsubq_f32(v355, v376); - float32x4_t v408 = vrev64q_f32(v399); - float32x4_t v416 = vrev64q_f32(v400); - float32x4_t v426 = vmulq_f32(v424, v438); - float32x4_t v457 = vmulq_f32(v455, v456); - float32x4_t v462 = vaddq_f32(v444, v449); - float32x4_t v463 = vsubq_f32(v444, v449); - float32x4_t v275 = vmulq_f32(v273, v287); - float32x4_t v296 = vmulq_f32(v294, v295); - float32x4_t v304 = vaddq_f32(v238, v301); - float32x4_t v305 = vsubq_f32(v238, v301); - float32x4_t v377 = vaddq_f32(v342, v350); - float32x4_t v378 = vsubq_f32(v342, v350); - float32x4_t v381 = vaddq_f32(v363, v371); - float32x4_t v382 = vsubq_f32(v363, v371); - float32x4_t v410 = vmulq_f32(v408, v438); - float32x4_t v418 = 
vmulq_f32(v416, v438); - float32x4_t v458 = vaddq_f32(v426, v431); - float32x4_t v459 = vsubq_f32(v426, v431); - float32x4_t v460 = vaddq_f32(v439, v457); - float32x4_t v461 = vsubq_f32(v439, v457); - float32x4_t v468 = vaddq_f32(v249, v332); - int16x4_t v473 = vqmovn_s32(vcvtq_n_s32_f32(v249, 15)); - float32x4_t v576 = vaddq_f32(v250, v337); - int16x4_t v581 = vqmovn_s32(vcvtq_n_s32_f32(v250, 15)); - float32x4_t v302 = vaddq_f32(v246, v275); - float32x4_t v303 = vsubq_f32(v246, v275); - float32x4_t v306 = vaddq_f32(v288, v296); - float32x4_t v307 = vsubq_f32(v288, v296); - float32x4_t v383 = vaddq_f32(v379, v381); - float32x4_t v384 = vsubq_f32(v379, v381); - float32x4_t v385 = vaddq_f32(v380, v382); - float32x4_t v386 = vsubq_f32(v380, v382); - float32x4_t v464 = vaddq_f32(v460, v462); - float32x4_t v465 = vsubq_f32(v460, v462); - float32x4_t v466 = vaddq_f32(v461, v463); - float32x4_t v467 = vsubq_f32(v461, v463); - float32x4_t v469 = vaddq_f32(v468, v410); - float32x4_t v470 = vsubq_f32(v468, v410); - float32x4_t v577 = vaddq_f32(v576, v418); - float32x4_t v578 = vsubq_f32(v576, v418); - vst1_s16((int16_t *)v1463, v473); - vst1_s16((int16_t *)v1571, v581); - float32x4_t v308 = vaddq_f32(v304, v306); - float32x4_t v309 = vsubq_f32(v304, v306); - float32x4_t v310 = vaddq_f32(v305, v307); - float32x4_t v311 = vsubq_f32(v305, v307); - int16x4_t v481 = vqmovn_s32(vcvtq_n_s32_f32(v470, 15)); - int16x4_t v489 = vqmovn_s32(vcvtq_n_s32_f32(v469, 15)); - float32x4_t v522 = vaddq_f32(v303, v378); - int16x4_t v527 = vqmovn_s32(vcvtq_n_s32_f32(v303, 15)); - int16x4_t v589 = vqmovn_s32(vcvtq_n_s32_f32(v578, 15)); - int16x4_t v597 = vqmovn_s32(vcvtq_n_s32_f32(v577, 15)); - float32x4_t v630 = vaddq_f32(v302, v377); - int16x4_t v635 = vqmovn_s32(vcvtq_n_s32_f32(v302, 15)); - float32x4_t v495 = vaddq_f32(v309, v384); - int16x4_t v500 = vqmovn_s32(vcvtq_n_s32_f32(v309, 15)); - float32x4_t v523 = vaddq_f32(v522, v459); - float32x4_t v524 = vsubq_f32(v522, v459); - float32x4_t v549 
= vaddq_f32(v310, v385); - int16x4_t v554 = vqmovn_s32(vcvtq_n_s32_f32(v310, 15)); - float32x4_t v603 = vaddq_f32(v311, v386); - int16x4_t v608 = vqmovn_s32(vcvtq_n_s32_f32(v311, 15)); - float32x4_t v631 = vaddq_f32(v630, v458); - float32x4_t v632 = vsubq_f32(v630, v458); - float32x4_t v657 = vaddq_f32(v308, v383); - int16x4_t v662 = vqmovn_s32(vcvtq_n_s32_f32(v308, 15)); - vst1_s16((int16_t *)v1472, v481); - vst1_s16((int16_t *)v1481, v489); - vst1_s16((int16_t *)v1517, v527); - vst1_s16((int16_t *)v1580, v589); - vst1_s16((int16_t *)v1589, v597); - vst1_s16((int16_t *)v1625, v635); - float32x4_t v496 = vaddq_f32(v495, v465); - float32x4_t v497 = vsubq_f32(v495, v465); - int16x4_t v535 = vqmovn_s32(vcvtq_n_s32_f32(v524, 15)); - int16x4_t v543 = vqmovn_s32(vcvtq_n_s32_f32(v523, 15)); - float32x4_t v550 = vaddq_f32(v549, v466); - float32x4_t v551 = vsubq_f32(v549, v466); - float32x4_t v604 = vaddq_f32(v603, v467); - float32x4_t v605 = vsubq_f32(v603, v467); - int16x4_t v643 = vqmovn_s32(vcvtq_n_s32_f32(v632, 15)); - int16x4_t v651 = vqmovn_s32(vcvtq_n_s32_f32(v631, 15)); - float32x4_t v658 = vaddq_f32(v657, v464); - float32x4_t v659 = vsubq_f32(v657, v464); - vst1_s16((int16_t *)v1490, v500); - vst1_s16((int16_t *)v1544, v554); - vst1_s16((int16_t *)v1598, v608); - vst1_s16((int16_t *)v1652, v662); - int16x4_t v508 = vqmovn_s32(vcvtq_n_s32_f32(v497, 15)); - int16x4_t v516 = vqmovn_s32(vcvtq_n_s32_f32(v496, 15)); - int16x4_t v562 = vqmovn_s32(vcvtq_n_s32_f32(v551, 15)); - int16x4_t v570 = vqmovn_s32(vcvtq_n_s32_f32(v550, 15)); - int16x4_t v616 = vqmovn_s32(vcvtq_n_s32_f32(v605, 15)); - int16x4_t v624 = vqmovn_s32(vcvtq_n_s32_f32(v604, 15)); - int16x4_t v670 = vqmovn_s32(vcvtq_n_s32_f32(v659, 15)); - int16x4_t v678 = vqmovn_s32(vcvtq_n_s32_f32(v658, 15)); - vst1_s16((int16_t *)v1526, v535); - vst1_s16((int16_t *)v1535, v543); - vst1_s16((int16_t *)v1634, v643); - vst1_s16((int16_t *)v1643, v651); - vst1_s16((int16_t *)v1499, v508); - vst1_s16((int16_t *)v1508, v516); 
- vst1_s16((int16_t *)v1553, v562); - vst1_s16((int16_t *)v1562, v570); - vst1_s16((int16_t *)v1607, v616); - vst1_s16((int16_t *)v1616, v624); - vst1_s16((int16_t *)v1661, v670); - vst1_s16((int16_t *)v1670, v678); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v684 * 2; j < howmany; j += 1) { - int16x4_t v765 = vld1s_s16(&v5[istride]); - float v899 = 1.0000000000000000e+00F; - float v900 = -1.0000000000000000e+00F; - float v907 = -7.0710678118654746e-01F; - float v914 = 7.0710678118654757e-01F; - float v966 = -1.4999999999999998e+00F; - float v967 = 1.4999999999999998e+00F; - float v974 = 1.0606601717798210e+00F; - float v981 = -1.0606601717798212e+00F; - float v1035 = 8.6602540378443871e-01F; - float v1043 = -8.6602540378443871e-01F; - float v1050 = 6.1237243569579458e-01F; - float v1051 = -6.1237243569579458e-01F; - float32x2_t v1053 = (float32x2_t){v4, v4}; - int16x4_t v710 = vld1s_s16(&v5[0]); - float32x2_t v766 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v765)), 15); - float32x2_t v901 = (float32x2_t){v899, v900}; - float32x2_t v908 = (float32x2_t){v914, v907}; - float32x2_t v915 = (float32x2_t){v914, v914}; - float32x2_t v964 = (float32x2_t){v966, v966}; - float32x2_t v968 = (float32x2_t){v966, v967}; - float32x2_t v975 = (float32x2_t){v981, v974}; - float32x2_t v982 = (float32x2_t){v981, v981}; - float32x2_t v1037 = (float32x2_t){v1035, v1043}; - float32x2_t v1044 = (float32x2_t){v1043, v1043}; - float32x2_t v1048 = (float32x2_t){v1051, v1051}; - float32x2_t v1052 = (float32x2_t){v1050, v1051}; - int16x4_t v696 = vld1s_s16(&v5[istride * 8]); - int16x4_t v702 = vld1s_s16(&v5[istride * 16]); - float32x2_t v711 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v710)), 15); - int16x4_t v717 = vld1s_s16(&v5[istride * 11]); - int16x4_t v723 = vld1s_s16(&v5[istride * 19]); - int16x4_t v731 = vld1s_s16(&v5[istride * 3]); - int16x4_t v738 = vld1s_s16(&v5[istride * 14]); - int16x4_t v744 = vld1s_s16(&v5[istride * 22]); - int16x4_t v752 = vld1s_s16(&v5[istride * 6]); - int16x4_t 
v759 = vld1s_s16(&v5[istride * 17]); - int16x4_t v773 = vld1s_s16(&v5[istride * 9]); - int16x4_t v780 = vld1s_s16(&v5[istride * 20]); - int16x4_t v786 = vld1s_s16(&v5[istride * 4]); - int16x4_t v794 = vld1s_s16(&v5[istride * 12]); - int16x4_t v801 = vld1s_s16(&v5[istride * 23]); - int16x4_t v807 = vld1s_s16(&v5[istride * 7]); - int16x4_t v815 = vld1s_s16(&v5[istride * 15]); - int16x4_t v822 = vld1s_s16(&v5[istride * 2]); - int16x4_t v828 = vld1s_s16(&v5[istride * 10]); - int16x4_t v836 = vld1s_s16(&v5[istride * 18]); - int16x4_t v843 = vld1s_s16(&v5[istride * 5]); - int16x4_t v849 = vld1s_s16(&v5[istride * 13]); - int16x4_t v857 = vld1s_s16(&v5[istride * 21]); - float32x2_t v903 = vmul_f32(v1053, v901); - float32x2_t v910 = vmul_f32(v1053, v908); - float32x2_t v970 = vmul_f32(v1053, v968); - float32x2_t v977 = vmul_f32(v1053, v975); - float32x2_t v1039 = vmul_f32(v1053, v1037); - float32x2_t v1054 = vmul_f32(v1053, v1052); - float32x2_t v697 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v696)), 15); - float32x2_t v703 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v702)), 15); - float32x2_t v718 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v717)), 15); - float32x2_t v724 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v723)), 15); - float32x2_t v732 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v731)), 15); - float32x2_t v739 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v738)), 15); - float32x2_t v745 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v744)), 15); - float32x2_t v753 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v752)), 15); - float32x2_t v760 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v759)), 15); - float32x2_t v774 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v773)), 15); - float32x2_t v781 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v780)), 15); - float32x2_t v787 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v786)), 15); - float32x2_t v795 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v794)), 15); - float32x2_t v802 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v801)), 15); - float32x2_t v808 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v807)), 15); - float32x2_t v816 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v815)), 15); - float32x2_t v823 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v822)), 15); - float32x2_t v829 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v828)), 15); - float32x2_t v837 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v836)), 15); - float32x2_t v844 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v843)), 15); - float32x2_t v850 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v849)), 15); - float32x2_t v858 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v857)), 15); - float32x2_t v704 = vadd_f32(v697, v703); - float32x2_t v705 = vsub_f32(v697, v703); - float32x2_t v725 = vadd_f32(v718, v724); - float32x2_t v726 = vsub_f32(v718, v724); - float32x2_t v746 = vadd_f32(v739, v745); - float32x2_t v747 = vsub_f32(v739, v745); - float32x2_t v767 = vadd_f32(v760, v766); - float32x2_t v768 = vsub_f32(v760, v766); - float32x2_t v788 = vadd_f32(v781, v787); - float32x2_t v789 = vsub_f32(v781, v787); - float32x2_t v809 = vadd_f32(v802, v808); - float32x2_t v810 = vsub_f32(v802, v808); - float32x2_t v830 = vadd_f32(v823, v829); - float32x2_t v831 = vsub_f32(v823, v829); - float32x2_t v851 = vadd_f32(v844, v850); - float32x2_t v852 = vsub_f32(v844, v850); - float32x2_t v712 = vadd_f32(v704, v711); - float32x2_t v733 = vadd_f32(v725, v732); - float32x2_t v754 = vadd_f32(v746, v753); - float32x2_t v775 = vadd_f32(v767, v774); - float32x2_t v796 = vadd_f32(v788, v795); - float32x2_t v817 = vadd_f32(v809, v816); - float32x2_t v838 = vadd_f32(v830, v837); - float32x2_t v859 = vadd_f32(v851, v858); - float32x2_t v927 = vadd_f32(v704, v788); - float32x2_t v928 = vsub_f32(v704, v788); - float32x2_t v929 = vadd_f32(v746, v830); - float32x2_t v930 = vsub_f32(v746, v830); - float32x2_t v931 = vadd_f32(v725, v809); - float32x2_t v932 = vsub_f32(v725, v809); - float32x2_t v933 = vadd_f32(v767, v851); - float32x2_t v934 = vsub_f32(v767, v851); - float32x2_t v994 = vadd_f32(v705, v789); - float32x2_t v995 = 
vsub_f32(v705, v789); - float32x2_t v996 = vadd_f32(v747, v831); - float32x2_t v997 = vsub_f32(v747, v831); - float32x2_t v998 = vadd_f32(v726, v810); - float32x2_t v999 = vsub_f32(v726, v810); - float32x2_t v1000 = vadd_f32(v768, v852); - float32x2_t v1001 = vsub_f32(v768, v852); - float32x2_t v860 = vadd_f32(v712, v796); - float32x2_t v861 = vsub_f32(v712, v796); - float32x2_t v862 = vadd_f32(v754, v838); - float32x2_t v863 = vsub_f32(v754, v838); - float32x2_t v864 = vadd_f32(v733, v817); - float32x2_t v865 = vsub_f32(v733, v817); - float32x2_t v866 = vadd_f32(v775, v859); - float32x2_t v867 = vsub_f32(v775, v859); - float32x2_t v935 = vadd_f32(v927, v929); - float32x2_t v936 = vsub_f32(v927, v929); - float32x2_t v937 = vadd_f32(v931, v933); - float32x2_t v938 = vsub_f32(v931, v933); - float32x2_t v941 = vadd_f32(v932, v934); - float32x2_t v942 = vsub_f32(v932, v934); - float32x2_t v965 = vmul_f32(v928, v964); - float32x2_t v971 = vrev64_f32(v930); - float32x2_t v1002 = vadd_f32(v994, v996); - float32x2_t v1003 = vsub_f32(v994, v996); - float32x2_t v1004 = vadd_f32(v998, v1000); - float32x2_t v1005 = vsub_f32(v998, v1000); - float32x2_t v1008 = vadd_f32(v999, v1001); - float32x2_t v1009 = vsub_f32(v999, v1001); - float32x2_t v1040 = vrev64_f32(v995); - float32x2_t v1045 = vmul_f32(v997, v1044); - float32x2_t v868 = vadd_f32(v860, v862); - float32x2_t v869 = vsub_f32(v860, v862); - float32x2_t v870 = vadd_f32(v864, v866); - float32x2_t v871 = vsub_f32(v864, v866); - float32x2_t v874 = vadd_f32(v865, v867); - float32x2_t v875 = vsub_f32(v865, v867); - float32x2_t v904 = vrev64_f32(v863); - float32x2_t v939 = vadd_f32(v935, v937); - float32x2_t v940 = vsub_f32(v935, v937); - float32x2_t v954 = vmul_f32(v936, v964); - float32x2_t v960 = vrev64_f32(v938); - float32x2_t v972 = vmul_f32(v971, v970); - float32x2_t v978 = vrev64_f32(v941); - float32x2_t v983 = vmul_f32(v942, v982); - float32x2_t v1006 = vadd_f32(v1002, v1004); - float32x2_t v1007 = vsub_f32(v1002, 
v1004); - float32x2_t v1029 = vrev64_f32(v1003); - float32x2_t v1034 = vmul_f32(v1005, v1044); - float32x2_t v1041 = vmul_f32(v1040, v1039); - float32x2_t v1049 = vmul_f32(v1008, v1048); - float32x2_t v1055 = vrev64_f32(v1009); - float32x2_t v872 = vadd_f32(v868, v870); - float32x2_t v873 = vsub_f32(v868, v870); - float32x2_t v893 = vrev64_f32(v871); - float32x2_t v905 = vmul_f32(v904, v903); - float32x2_t v911 = vrev64_f32(v874); - float32x2_t v916 = vmul_f32(v875, v915); - float32x2_t v946 = vmul_f32(v939, v964); - float32x2_t v950 = vmul_f32(v940, v964); - float32x2_t v961 = vmul_f32(v960, v970); - float32x2_t v979 = vmul_f32(v978, v977); - float32x2_t v986 = vadd_f32(v965, v983); - float32x2_t v987 = vsub_f32(v965, v983); - float32x2_t v1015 = vrev64_f32(v1006); - float32x2_t v1022 = vrev64_f32(v1007); - float32x2_t v1030 = vmul_f32(v1029, v1039); - float32x2_t v1056 = vmul_f32(v1055, v1054); - float32x2_t v1061 = vadd_f32(v1045, v1049); - float32x2_t v1062 = vsub_f32(v1045, v1049); - float32x2_t v894 = vmul_f32(v893, v903); - float32x2_t v912 = vmul_f32(v911, v910); - float32x2_t v919 = vadd_f32(v861, v916); - float32x2_t v920 = vsub_f32(v861, v916); - float32x2_t v984 = vadd_f32(v954, v961); - float32x2_t v985 = vsub_f32(v954, v961); - float32x2_t v988 = vadd_f32(v972, v979); - float32x2_t v989 = vsub_f32(v972, v979); - float32x2_t v1016 = vmul_f32(v1015, v1039); - float32x2_t v1023 = vmul_f32(v1022, v1039); - float32x2_t v1057 = vadd_f32(v1030, v1034); - float32x2_t v1058 = vsub_f32(v1030, v1034); - float32x2_t v1059 = vadd_f32(v1041, v1056); - float32x2_t v1060 = vsub_f32(v1041, v1056); - float32x2_t v1067 = vadd_f32(v872, v946); - int16x4_t v1072 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v872, 15), (int32x2_t){0, 0})); - float32x2_t v1151 = vadd_f32(v873, v950); - int16x4_t v1156 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v873, 15), (int32x2_t){0, 0})); - float32x2_t v917 = vadd_f32(v869, v894); - float32x2_t v918 = vsub_f32(v869, v894); - float32x2_t 
v921 = vadd_f32(v905, v912); - float32x2_t v922 = vsub_f32(v905, v912); - float32x2_t v990 = vadd_f32(v986, v988); - float32x2_t v991 = vsub_f32(v986, v988); - float32x2_t v992 = vadd_f32(v987, v989); - float32x2_t v993 = vsub_f32(v987, v989); - float32x2_t v1063 = vadd_f32(v1059, v1061); - float32x2_t v1064 = vsub_f32(v1059, v1061); - float32x2_t v1065 = vadd_f32(v1060, v1062); - float32x2_t v1066 = vsub_f32(v1060, v1062); - float32x2_t v1068 = vadd_f32(v1067, v1016); - float32x2_t v1069 = vsub_f32(v1067, v1016); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1072), 0); - float32x2_t v1152 = vadd_f32(v1151, v1023); - float32x2_t v1153 = vsub_f32(v1151, v1023); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1156), 0); - float32x2_t v923 = vadd_f32(v919, v921); - float32x2_t v924 = vsub_f32(v919, v921); - float32x2_t v925 = vadd_f32(v920, v922); - float32x2_t v926 = vsub_f32(v920, v922); - int16x4_t v1078 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1069, 15), (int32x2_t){0, 0})); - int16x4_t v1084 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1068, 15), (int32x2_t){0, 0})); - float32x2_t v1109 = vadd_f32(v918, v985); - int16x4_t v1114 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v918, 15), (int32x2_t){0, 0})); - int16x4_t v1162 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1153, 15), (int32x2_t){0, 0})); - int16x4_t v1168 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1152, 15), (int32x2_t){0, 0})); - float32x2_t v1193 = vadd_f32(v917, v984); - int16x4_t v1198 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v917, 15), (int32x2_t){0, 0})); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1078), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1084), 0); - float32x2_t v1088 = vadd_f32(v924, v991); - int16x4_t v1093 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v924, 15), (int32x2_t){0, 0})); - float32x2_t v1110 = vadd_f32(v1109, v1058); - float32x2_t v1111 = vsub_f32(v1109, v1058); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1114), 0); - 
float32x2_t v1130 = vadd_f32(v925, v992); - int16x4_t v1135 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v925, 15), (int32x2_t){0, 0})); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v1162), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v1168), 0); - float32x2_t v1172 = vadd_f32(v926, v993); - int16x4_t v1177 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v926, 15), (int32x2_t){0, 0})); - float32x2_t v1194 = vadd_f32(v1193, v1057); - float32x2_t v1195 = vsub_f32(v1193, v1057); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v1198), 0); - float32x2_t v1214 = vadd_f32(v923, v990); - int16x4_t v1219 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v923, 15), (int32x2_t){0, 0})); - float32x2_t v1089 = vadd_f32(v1088, v1064); - float32x2_t v1090 = vsub_f32(v1088, v1064); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1093), 0); - int16x4_t v1120 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1111, 15), (int32x2_t){0, 0})); - int16x4_t v1126 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1110, 15), (int32x2_t){0, 0})); - float32x2_t v1131 = vadd_f32(v1130, v1065); - float32x2_t v1132 = vsub_f32(v1130, v1065); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1135), 0); - float32x2_t v1173 = vadd_f32(v1172, v1066); - float32x2_t v1174 = vsub_f32(v1172, v1066); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1177), 0); - int16x4_t v1204 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1195, 15), (int32x2_t){0, 0})); - int16x4_t v1210 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1194, 15), (int32x2_t){0, 0})); - float32x2_t v1215 = vadd_f32(v1214, v1063); - float32x2_t v1216 = vsub_f32(v1214, v1063); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1219), 0); - int16x4_t v1099 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1090, 15), (int32x2_t){0, 0})); - int16x4_t v1105 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1089, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1120), 0); - v6[ostride * 2] = 
vget_lane_s32(vreinterpret_s32_s16(v1126), 0); - int16x4_t v1141 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1132, 15), (int32x2_t){0, 0})); - int16x4_t v1147 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1131, 15), (int32x2_t){0, 0})); - int16x4_t v1183 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1174, 15), (int32x2_t){0, 0})); - int16x4_t v1189 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1173, 15), (int32x2_t){0, 0})); - v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1204), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1210), 0); - int16x4_t v1225 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1216, 15), (int32x2_t){0, 0})); - int16x4_t v1231 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1215, 15), (int32x2_t){0, 0})); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1099), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1105), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1141), 0); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v1147), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1183), 0); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v1189), 0); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1225), 0); - v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1231), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu24(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v278 = -1.0000000000000000e+00F; - float v285 = -7.0710678118654746e-01F; - float v292 = 7.0710678118654757e-01F; - float v345 = 
-1.4999999999999998e+00F; - float v350 = 1.4999999999999998e+00F; - float v357 = 1.0606601717798210e+00F; - float v364 = -1.0606601717798212e+00F; - float v428 = -8.6602540378443871e-01F; - float v438 = -6.1237243569579458e-01F; - const int32_t *v767 = &v5[v0]; - int32_t *v954 = &v6[v2]; - int64_t v19 = v0 * 8; - int64_t v27 = v0 * 16; - int64_t v46 = v0 * 11; - int64_t v54 = v0 * 19; - int64_t v64 = v0 * 3; - int64_t v73 = v0 * 14; - int64_t v81 = v0 * 22; - int64_t v91 = v0 * 6; - int64_t v100 = v0 * 17; - int64_t v118 = v0 * 9; - int64_t v127 = v0 * 20; - int64_t v135 = v0 * 4; - int64_t v145 = v0 * 12; - int64_t v154 = v0 * 23; - int64_t v162 = v0 * 7; - int64_t v172 = v0 * 15; - int64_t v181 = v0 * 2; - int64_t v189 = v0 * 10; - int64_t v199 = v0 * 18; - int64_t v208 = v0 * 5; - int64_t v216 = v0 * 13; - int64_t v226 = v0 * 21; - float v281 = v4 * v278; - float v288 = v4 * v285; - float v353 = v4 * v350; - float v360 = v4 * v357; - float v424 = v4 * v428; - float v441 = v4 * v438; - int64_t v466 = v2 * 16; - int64_t v474 = v2 * 8; - int64_t v485 = v2 * 9; - int64_t v501 = v2 * 17; - int64_t v512 = v2 * 18; - int64_t v520 = v2 * 10; - int64_t v528 = v2 * 2; - int64_t v539 = v2 * 3; - int64_t v547 = v2 * 19; - int64_t v555 = v2 * 11; - int64_t v566 = v2 * 12; - int64_t v574 = v2 * 4; - int64_t v582 = v2 * 20; - int64_t v593 = v2 * 21; - int64_t v601 = v2 * 13; - int64_t v609 = v2 * 5; - int64_t v620 = v2 * 6; - int64_t v628 = v2 * 22; - int64_t v636 = v2 * 14; - int64_t v647 = v2 * 15; - int64_t v655 = v2 * 7; - int64_t v663 = v2 * 23; - const int32_t *v695 = &v5[0]; - svfloat32_t v894 = svdup_n_f32(v292); - svfloat32_t v899 = svdup_n_f32(v345); - svfloat32_t v902 = svdup_n_f32(v364); - svfloat32_t v908 = svdup_n_f32(v428); - svfloat32_t v909 = svdup_n_f32(v438); - int32_t *v918 = &v6[0]; - svfloat32_t v114 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v767[0])), - 1.F / (1ULL << 15ULL)); - const int32_t 
*v676 = &v5[v19]; - const int32_t *v685 = &v5[v27]; - const int32_t *v704 = &v5[v46]; - const int32_t *v713 = &v5[v54]; - const int32_t *v722 = &v5[v64]; - const int32_t *v731 = &v5[v73]; - const int32_t *v740 = &v5[v81]; - const int32_t *v749 = &v5[v91]; - const int32_t *v758 = &v5[v100]; - const int32_t *v776 = &v5[v118]; - const int32_t *v785 = &v5[v127]; - const int32_t *v794 = &v5[v135]; - const int32_t *v803 = &v5[v145]; - const int32_t *v812 = &v5[v154]; - const int32_t *v821 = &v5[v162]; - const int32_t *v830 = &v5[v172]; - const int32_t *v839 = &v5[v181]; - const int32_t *v848 = &v5[v189]; - const int32_t *v857 = &v5[v199]; - const int32_t *v866 = &v5[v208]; - const int32_t *v875 = &v5[v216]; - const int32_t *v884 = &v5[v226]; - svfloat32_t v892 = svdup_n_f32(v281); - svfloat32_t v893 = svdup_n_f32(v288); - svfloat32_t v900 = svdup_n_f32(v353); - svfloat32_t v901 = svdup_n_f32(v360); - svfloat32_t v907 = svdup_n_f32(v424); - svfloat32_t v910 = svdup_n_f32(v441); - int32_t *v927 = &v6[v466]; - int32_t *v936 = &v6[v474]; - int32_t *v945 = &v6[v485]; - int32_t *v963 = &v6[v501]; - int32_t *v972 = &v6[v512]; - int32_t *v981 = &v6[v520]; - int32_t *v990 = &v6[v528]; - int32_t *v999 = &v6[v539]; - int32_t *v1008 = &v6[v547]; - int32_t *v1017 = &v6[v555]; - int32_t *v1026 = &v6[v566]; - int32_t *v1035 = &v6[v574]; - int32_t *v1044 = &v6[v582]; - int32_t *v1053 = &v6[v593]; - int32_t *v1062 = &v6[v601]; - int32_t *v1071 = &v6[v609]; - int32_t *v1080 = &v6[v620]; - int32_t *v1089 = &v6[v628]; - int32_t *v1098 = &v6[v636]; - int32_t *v1107 = &v6[v647]; - int32_t *v1116 = &v6[v655]; - int32_t *v1125 = &v6[v663]; - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v695[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v676[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, 
- svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v685[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v52 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v704[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v60 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v713[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v70 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v722[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v79 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v731[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v87 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v740[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v97 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v749[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v106 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v758[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v124 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v776[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v133 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v785[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v141 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v794[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v151 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v803[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v160 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v812[0])), - 1.F / (1ULL << 15ULL)); 
- svfloat32_t v168 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v821[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v178 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v830[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v187 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v839[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v195 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v848[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v205 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v857[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v214 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v866[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v222 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v875[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v232 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v884[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v52), "w"(v60)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v52), "w"(v60)); - svfloat32_t v88; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v88) : "w"(v79), "w"(v87)); - svfloat32_t v89; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v89) : "w"(v79), "w"(v87)); - svfloat32_t v115; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v115) : "w"(v106), "w"(v114)); - svfloat32_t v116; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v116) : "w"(v106), "w"(v114)); - svfloat32_t v142; - asm("fadd %0.s, %1.s, %2.s" : 
"=w"(v142) : "w"(v133), "w"(v141)); - svfloat32_t v143; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v143) : "w"(v133), "w"(v141)); - svfloat32_t v169; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v169) : "w"(v160), "w"(v168)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v160), "w"(v168)); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v187), "w"(v195)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v187), "w"(v195)); - svfloat32_t v223; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v223) : "w"(v214), "w"(v222)); - svfloat32_t v224; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v224) : "w"(v214), "w"(v222)); - svfloat32_t v44; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v44) : "w"(v34), "w"(v43)); - svfloat32_t v71; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v71) : "w"(v61), "w"(v70)); - svfloat32_t v98; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v98) : "w"(v88), "w"(v97)); - svfloat32_t v125; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v115), "w"(v124)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v142), "w"(v151)); - svfloat32_t v179; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v179) : "w"(v169), "w"(v178)); - svfloat32_t v206; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v206) : "w"(v196), "w"(v205)); - svfloat32_t v233; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v233) : "w"(v223), "w"(v232)); - svfloat32_t v306; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v306) : "w"(v34), "w"(v142)); - svfloat32_t v307; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v307) : "w"(v34), "w"(v142)); - svfloat32_t v308; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v308) : "w"(v88), "w"(v196)); - svfloat32_t v309; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v309) : "w"(v88), "w"(v196)); - svfloat32_t v310; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v310) : "w"(v61), "w"(v169)); - svfloat32_t v311; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v311) : "w"(v61), "w"(v169)); - svfloat32_t v312; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v312) : "w"(v115), "w"(v223)); - svfloat32_t v313; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v313) : "w"(v115), "w"(v223)); - svfloat32_t v378; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v378) : "w"(v35), "w"(v143)); - svfloat32_t v379; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v379) : "w"(v35), "w"(v143)); - svfloat32_t v380; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v380) : "w"(v89), "w"(v197)); - svfloat32_t v381; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v381) : "w"(v89), "w"(v197)); - svfloat32_t v382; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v382) : "w"(v62), "w"(v170)); - svfloat32_t v383; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v383) : "w"(v62), "w"(v170)); - svfloat32_t v384; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v384) : "w"(v116), "w"(v224)); - svfloat32_t v385; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v385) : "w"(v116), "w"(v224)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v44), "w"(v152)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v44), "w"(v152)); - svfloat32_t v236; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v236) : "w"(v98), "w"(v206)); - svfloat32_t v237; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v237) : "w"(v98), "w"(v206)); - svfloat32_t v238; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v238) : "w"(v71), "w"(v179)); - svfloat32_t v239; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v239) : "w"(v71), "w"(v179)); - svfloat32_t v240; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v240) : "w"(v125), "w"(v233)); - svfloat32_t v241; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v241) : "w"(v125), "w"(v233)); - svfloat32_t v314; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v314) : "w"(v306), "w"(v308)); - svfloat32_t v315; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v306), "w"(v308)); - svfloat32_t v316; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v316) : "w"(v310), "w"(v312)); - svfloat32_t v317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v317) : "w"(v310), "w"(v312)); - svfloat32_t v320; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v320) : "w"(v311), "w"(v313)); - svfloat32_t v321; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v321) : "w"(v311), "w"(v313)); - svfloat32_t zero355; - asm 
volatile("mov %0.s, #0" : "=w"(zero355)); - svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v900, v309, 90); - svfloat32_t v386; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v386) : "w"(v378), "w"(v380)); - svfloat32_t v387; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v387) : "w"(v378), "w"(v380)); - svfloat32_t v388; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v388) : "w"(v382), "w"(v384)); - svfloat32_t v389; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v389) : "w"(v382), "w"(v384)); - svfloat32_t v392; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v392) : "w"(v383), "w"(v385)); - svfloat32_t v393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v383), "w"(v385)); - svfloat32_t zero426; - asm volatile("mov %0.s, #0" : "=w"(zero426)); - svfloat32_t v426 = svcmla_f32_x(pred_full, zero426, v907, v379, 90); - svfloat32_t v242; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v242) : "w"(v234), "w"(v236)); - svfloat32_t v243; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v234), "w"(v236)); - svfloat32_t v244; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v238), "w"(v240)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v238), "w"(v240)); - svfloat32_t v248; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v239), "w"(v241)); - svfloat32_t v249; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v249) : "w"(v239), "w"(v241)); - svfloat32_t zero283; - asm volatile("mov %0.s, #0" : "=w"(zero283)); - svfloat32_t v283 = svcmla_f32_x(pred_full, zero283, v892, v237, 90); - svfloat32_t v318; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v318) : "w"(v314), "w"(v316)); - svfloat32_t v319; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v319) : "w"(v314), "w"(v316)); - svfloat32_t zero343; - asm volatile("mov %0.s, #0" : "=w"(zero343)); - svfloat32_t v343 = svcmla_f32_x(pred_full, zero343, v900, v317, 90); - svfloat32_t zero362; - asm volatile("mov %0.s, #0" : "=w"(zero362)); - svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v901, v320, 90); - svfloat32_t v367; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v367) : "w"(v321), 
"w"(v902)); - svfloat32_t v390; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v390) : "w"(v386), "w"(v388)); - svfloat32_t v391; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v391) : "w"(v386), "w"(v388)); - svfloat32_t zero414; - asm volatile("mov %0.s, #0" : "=w"(zero414)); - svfloat32_t v414 = svcmla_f32_x(pred_full, zero414, v907, v387, 90); - svfloat32_t v436; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v436) : "w"(v392), "w"(v909)); - svfloat32_t zero443; - asm volatile("mov %0.s, #0" : "=w"(zero443)); - svfloat32_t v443 = svcmla_f32_x(pred_full, zero443, v910, v393, 90); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v242), "w"(v244)); - svfloat32_t v247; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v242), "w"(v244)); - svfloat32_t zero271; - asm volatile("mov %0.s, #0" : "=w"(zero271)); - svfloat32_t v271 = svcmla_f32_x(pred_full, zero271, v892, v245, 90); - svfloat32_t zero290; - asm volatile("mov %0.s, #0" : "=w"(zero290)); - svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v893, v248, 90); - svfloat32_t v368 = svmla_f32_x(pred_full, v343, v315, v899); - svfloat32_t v369 = svnmls_f32_x(pred_full, v343, v315, v899); - svfloat32_t v370 = svmla_f32_x(pred_full, v367, v307, v899); - svfloat32_t v371 = svnmls_f32_x(pred_full, v367, v307, v899); - svfloat32_t v372; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v372) : "w"(v355), "w"(v362)); - svfloat32_t v373; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v355), "w"(v362)); - svfloat32_t zero400; - asm volatile("mov %0.s, #0" : "=w"(zero400)); - svfloat32_t v400 = svcmla_f32_x(pred_full, zero400, v907, v390, 90); - svfloat32_t zero407; - asm volatile("mov %0.s, #0" : "=w"(zero407)); - svfloat32_t v407 = svcmla_f32_x(pred_full, zero407, v907, v391, 90); - svfloat32_t v444 = svmla_f32_x(pred_full, v414, v389, v908); - svfloat32_t v445 = svmls_f32_x(pred_full, v414, v389, v908); - svfloat32_t v446; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v446) : "w"(v426), "w"(v443)); - svfloat32_t v447; - asm("fsub %0.s, %1.s, %2.s" : 
"=w"(v447) : "w"(v426), "w"(v443)); - svfloat32_t v448 = svmla_f32_x(pred_full, v436, v381, v908); - svfloat32_t v449 = svnmls_f32_x(pred_full, v436, v381, v908); - svfloat32_t v296; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v243), "w"(v271)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v243), "w"(v271)); - svfloat32_t v298 = svmla_f32_x(pred_full, v235, v249, v894); - svfloat32_t v299 = svmls_f32_x(pred_full, v235, v249, v894); - svfloat32_t v300; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v300) : "w"(v283), "w"(v290)); - svfloat32_t v301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v301) : "w"(v283), "w"(v290)); - svfloat32_t v374; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v370), "w"(v372)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v370), "w"(v372)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v371), "w"(v373)); - svfloat32_t v377; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v377) : "w"(v371), "w"(v373)); - svfloat32_t v450; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v450) : "w"(v446), "w"(v448)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v446), "w"(v448)); - svfloat32_t v452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v452) : "w"(v447), "w"(v449)); - svfloat32_t v453; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v453) : "w"(v447), "w"(v449)); - svfloat32_t v454 = svmla_f32_x(pred_full, v246, v318, v899); - svint16_t v459 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v246, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v562 = svmla_f32_x(pred_full, v247, v319, v899); - svint16_t v567 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v247, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v302; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v302) : 
"w"(v298), "w"(v300)); - svfloat32_t v303; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v298), "w"(v300)); - svfloat32_t v304; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v304) : "w"(v299), "w"(v301)); - svfloat32_t v305; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v305) : "w"(v299), "w"(v301)); - svfloat32_t v455; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v455) : "w"(v454), "w"(v400)); - svfloat32_t v456; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v456) : "w"(v454), "w"(v400)); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v297), "w"(v369)); - svint16_t v513 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v297, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v563; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v563) : "w"(v562), "w"(v407)); - svfloat32_t v564; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v564) : "w"(v562), "w"(v407)); - svfloat32_t v616; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v616) : "w"(v296), "w"(v368)); - svint16_t v621 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v296, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v918), svreinterpret_u64_s16(v459)); - svst1w_u64(pred_full, (unsigned *)(v1026), svreinterpret_u64_s16(v567)); - svint16_t v467 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v456, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v475 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v455, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v481; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v481) : "w"(v303), "w"(v375)); - svint16_t v486 = 
svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v303, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v509; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v508), "w"(v445)); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v508), "w"(v445)); - svfloat32_t v535; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v535) : "w"(v304), "w"(v376)); - svint16_t v540 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v304, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v575 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v564, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v583 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v563, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v589; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v589) : "w"(v305), "w"(v377)); - svint16_t v594 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v305, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v617; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v617) : "w"(v616), "w"(v444)); - svfloat32_t v618; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v616), "w"(v444)); - svfloat32_t v643; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v643) : "w"(v302), "w"(v374)); - svint16_t v648 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v302, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - 
svst1w_u64(pred_full, (unsigned *)(v972), svreinterpret_u64_s16(v513)); - svst1w_u64(pred_full, (unsigned *)(v1080), svreinterpret_u64_s16(v621)); - svfloat32_t v482; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v482) : "w"(v481), "w"(v451)); - svfloat32_t v483; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v483) : "w"(v481), "w"(v451)); - svint16_t v521 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v510, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v529 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v509, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v536; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v536) : "w"(v535), "w"(v452)); - svfloat32_t v537; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v537) : "w"(v535), "w"(v452)); - svfloat32_t v590; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v590) : "w"(v589), "w"(v453)); - svfloat32_t v591; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v591) : "w"(v589), "w"(v453)); - svint16_t v629 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v618, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v637 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v617, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v644; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v644) : "w"(v643), "w"(v450)); - svfloat32_t v645; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v645) : "w"(v643), "w"(v450)); - svst1w_u64(pred_full, (unsigned *)(v927), svreinterpret_u64_s16(v467)); - svst1w_u64(pred_full, (unsigned *)(v936), svreinterpret_u64_s16(v475)); - svst1w_u64(pred_full, (unsigned *)(v945), svreinterpret_u64_s16(v486)); - 
svst1w_u64(pred_full, (unsigned *)(v999), svreinterpret_u64_s16(v540)); - svst1w_u64(pred_full, (unsigned *)(v1035), svreinterpret_u64_s16(v575)); - svst1w_u64(pred_full, (unsigned *)(v1044), svreinterpret_u64_s16(v583)); - svst1w_u64(pred_full, (unsigned *)(v1053), svreinterpret_u64_s16(v594)); - svst1w_u64(pred_full, (unsigned *)(v1107), svreinterpret_u64_s16(v648)); - svint16_t v494 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v483, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v502 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v482, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v548 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v537, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v556 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v536, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v602 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v591, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v610 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v590, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v656 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v645, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v664 
= svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v644, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v981), svreinterpret_u64_s16(v521)); - svst1w_u64(pred_full, (unsigned *)(v990), svreinterpret_u64_s16(v529)); - svst1w_u64(pred_full, (unsigned *)(v1089), svreinterpret_u64_s16(v629)); - svst1w_u64(pred_full, (unsigned *)(v1098), svreinterpret_u64_s16(v637)); - svst1w_u64(pred_full, (unsigned *)(v954), svreinterpret_u64_s16(v494)); - svst1w_u64(pred_full, (unsigned *)(v963), svreinterpret_u64_s16(v502)); - svst1w_u64(pred_full, (unsigned *)(v1008), svreinterpret_u64_s16(v548)); - svst1w_u64(pred_full, (unsigned *)(v1017), svreinterpret_u64_s16(v556)); - svst1w_u64(pred_full, (unsigned *)(v1062), svreinterpret_u64_s16(v602)); - svst1w_u64(pred_full, (unsigned *)(v1071), svreinterpret_u64_s16(v610)); - svst1w_u64(pred_full, (unsigned *)(v1116), svreinterpret_u64_s16(v656)); - svst1w_u64(pred_full, (unsigned *)(v1125), svreinterpret_u64_s16(v664)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v1751 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v941 = 0.0000000000000000e+00F; - float v1060 = 9.6858316112863108e-01F; - float v1064 = -2.4868988716485479e-01F; - float v1065 = 2.4868988716485479e-01F; - float v1233 = 8.7630668004386358e-01F; - float v1237 = -4.8175367410171532e-01F; - float v1238 = 4.8175367410171532e-01F; - float v1406 = 7.2896862742141155e-01F; - float v1410 = -6.8454710592868862e-01F; - float v1411 = 6.8454710592868862e-01F; - float v1420 = 6.2790519529313527e-02F; - float 
v1424 = -9.9802672842827156e-01F; - float v1425 = 9.9802672842827156e-01F; - float v1579 = 5.3582679497899655e-01F; - float v1583 = -8.4432792550201508e-01F; - float v1584 = 8.4432792550201508e-01F; - float v1593 = -4.2577929156507272e-01F; - float v1597 = -9.0482705246601947e-01F; - float v1598 = 9.0482705246601947e-01F; - float v1607 = -6.3742398974868952e-01F; - float v1611 = 7.7051324277578936e-01F; - float v1612 = -7.7051324277578936e-01F; - float v1628 = -9.9211470131447776e-01F; - float v1632 = -1.2533323356430454e-01F; - float v1633 = 1.2533323356430454e-01F; - float v1651 = 2.5000000000000000e-01F; - float v1663 = 5.5901699437494745e-01F; - float v1675 = 6.1803398874989490e-01F; - float v1706 = 9.5105651629515353e-01F; - float v1707 = -9.5105651629515353e-01F; - float32x2_t v1709 = (float32x2_t){v4, v4}; - float v1738 = 2.0000000000000000e+00F; - const int32_t *v3255 = &v5[istride]; - int32_t *v3481 = &v6[ostride]; - float v944 = dir * v941; - float32x2_t v1061 = (float32x2_t){v1060, v1060}; - float32x2_t v1066 = (float32x2_t){v1064, v1065}; - float32x2_t v1234 = (float32x2_t){v1233, v1233}; - float32x2_t v1239 = (float32x2_t){v1237, v1238}; - float32x2_t v1407 = (float32x2_t){v1406, v1406}; - float32x2_t v1412 = (float32x2_t){v1410, v1411}; - float32x2_t v1421 = (float32x2_t){v1420, v1420}; - float32x2_t v1426 = (float32x2_t){v1424, v1425}; - float32x2_t v1461 = (float32x2_t){v1612, v1611}; - float32x2_t v1580 = (float32x2_t){v1579, v1579}; - float32x2_t v1585 = (float32x2_t){v1583, v1584}; - float32x2_t v1594 = (float32x2_t){v1593, v1593}; - float32x2_t v1599 = (float32x2_t){v1597, v1598}; - float32x2_t v1608 = (float32x2_t){v1607, v1607}; - float32x2_t v1613 = (float32x2_t){v1611, v1612}; - float32x2_t v1629 = (float32x2_t){v1628, v1628}; - float32x2_t v1634 = (float32x2_t){v1632, v1633}; - float32x2_t v1652 = (float32x2_t){v1651, v1651}; - float32x2_t v1664 = (float32x2_t){v1663, v1663}; - float32x2_t v1676 = (float32x2_t){v1675, v1675}; - float32x2_t 
v1708 = (float32x2_t){v1706, v1707}; - float32x2_t v1739 = (float32x2_t){v1738, v1738}; - const int32_t *v3210 = &v5[0]; - int32_t *v3436 = &v6[0]; - int16x4_t v3666 = vld1_s16((const int16_t *)v3255); - float32x4_t v201 = vcvtq_n_f32_s32(vmovl_s16(v3666), 15); - float32x2_t v942 = (float32x2_t){v941, v944}; - float32x4_t v1062 = vcombine_f32(v1061, v1061); - float32x2_t v1068 = vmul_f32(v1709, v1066); - float32x4_t v1235 = vcombine_f32(v1234, v1234); - float32x2_t v1241 = vmul_f32(v1709, v1239); - float32x4_t v1408 = vcombine_f32(v1407, v1407); - float32x2_t v1414 = vmul_f32(v1709, v1412); - float32x4_t v1422 = vcombine_f32(v1421, v1421); - float32x2_t v1428 = vmul_f32(v1709, v1426); - float32x2_t v1463 = vmul_f32(v1709, v1461); - float32x4_t v1581 = vcombine_f32(v1580, v1580); - float32x2_t v1587 = vmul_f32(v1709, v1585); - float32x4_t v1595 = vcombine_f32(v1594, v1594); - float32x2_t v1601 = vmul_f32(v1709, v1599); - float32x4_t v1609 = vcombine_f32(v1608, v1608); - float32x2_t v1615 = vmul_f32(v1709, v1613); - float32x4_t v1630 = vcombine_f32(v1629, v1629); - float32x2_t v1636 = vmul_f32(v1709, v1634); - float32x4_t v1653 = vcombine_f32(v1652, v1652); - float32x4_t v1665 = vcombine_f32(v1664, v1664); - float32x4_t v1677 = vcombine_f32(v1676, v1676); - float32x2_t v1710 = vmul_f32(v1709, v1708); - float32x4_t v1740 = vcombine_f32(v1739, v1739); - const int32_t *v3219 = &v5[istride * 5]; - const int32_t *v3228 = &v5[istride * 10]; - const int32_t *v3237 = &v5[istride * 15]; - const int32_t *v3246 = &v5[istride * 20]; - const int32_t *v3264 = &v5[istride * 6]; - const int32_t *v3273 = &v5[istride * 11]; - const int32_t *v3282 = &v5[istride * 16]; - const int32_t *v3291 = &v5[istride * 21]; - const int32_t *v3300 = &v5[istride * 2]; - const int32_t *v3309 = &v5[istride * 7]; - const int32_t *v3318 = &v5[istride * 12]; - const int32_t *v3327 = &v5[istride * 17]; - const int32_t *v3336 = &v5[istride * 22]; - const int32_t *v3345 = &v5[istride * 3]; - const int32_t 
*v3354 = &v5[istride * 8]; - const int32_t *v3363 = &v5[istride * 13]; - const int32_t *v3372 = &v5[istride * 18]; - const int32_t *v3381 = &v5[istride * 23]; - const int32_t *v3390 = &v5[istride * 4]; - const int32_t *v3399 = &v5[istride * 9]; - const int32_t *v3408 = &v5[istride * 14]; - const int32_t *v3417 = &v5[istride * 19]; - const int32_t *v3426 = &v5[istride * 24]; - int32_t *v3445 = &v6[ostride * 5]; - int32_t *v3454 = &v6[ostride * 10]; - int32_t *v3463 = &v6[ostride * 15]; - int32_t *v3472 = &v6[ostride * 20]; - int32_t *v3490 = &v6[ostride * 6]; - int32_t *v3499 = &v6[ostride * 11]; - int32_t *v3508 = &v6[ostride * 16]; - int32_t *v3517 = &v6[ostride * 21]; - int32_t *v3526 = &v6[ostride * 2]; - int32_t *v3535 = &v6[ostride * 7]; - int32_t *v3544 = &v6[ostride * 12]; - int32_t *v3553 = &v6[ostride * 17]; - int32_t *v3562 = &v6[ostride * 22]; - int32_t *v3571 = &v6[ostride * 3]; - int32_t *v3580 = &v6[ostride * 8]; - int32_t *v3589 = &v6[ostride * 13]; - int32_t *v3598 = &v6[ostride * 18]; - int32_t *v3607 = &v6[ostride * 23]; - int32_t *v3616 = &v6[ostride * 4]; - int32_t *v3625 = &v6[ostride * 9]; - int32_t *v3634 = &v6[ostride * 14]; - int32_t *v3643 = &v6[ostride * 19]; - int32_t *v3652 = &v6[ostride * 24]; - int16x4_t v3656 = vld1_s16((const int16_t *)v3210); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v3656), 15); - float32x4_t v946 = vcombine_f32(v942, v942); - float32x4_t v1070 = vcombine_f32(v1068, v1068); - float32x4_t v1243 = vcombine_f32(v1241, v1241); - float32x4_t v1416 = vcombine_f32(v1414, v1414); - float32x4_t v1430 = vcombine_f32(v1428, v1428); - float32x4_t v1465 = vcombine_f32(v1463, v1463); - float32x4_t v1589 = vcombine_f32(v1587, v1587); - float32x4_t v1603 = vcombine_f32(v1601, v1601); - float32x4_t v1617 = vcombine_f32(v1615, v1615); - float32x4_t v1638 = vcombine_f32(v1636, v1636); - float32x4_t v1712 = vcombine_f32(v1710, v1710); - int16x4_t v3658 = vld1_s16((const int16_t *)v3219); - int16x4_t v3660 = vld1_s16((const 
int16_t *)v3228); - int16x4_t v3662 = vld1_s16((const int16_t *)v3237); - int16x4_t v3664 = vld1_s16((const int16_t *)v3246); - int16x4_t v3668 = vld1_s16((const int16_t *)v3264); - int16x4_t v3670 = vld1_s16((const int16_t *)v3273); - int16x4_t v3672 = vld1_s16((const int16_t *)v3282); - int16x4_t v3674 = vld1_s16((const int16_t *)v3291); - int16x4_t v3676 = vld1_s16((const int16_t *)v3300); - int16x4_t v3678 = vld1_s16((const int16_t *)v3309); - int16x4_t v3680 = vld1_s16((const int16_t *)v3318); - int16x4_t v3682 = vld1_s16((const int16_t *)v3327); - int16x4_t v3684 = vld1_s16((const int16_t *)v3336); - int16x4_t v3686 = vld1_s16((const int16_t *)v3345); - int16x4_t v3688 = vld1_s16((const int16_t *)v3354); - int16x4_t v3690 = vld1_s16((const int16_t *)v3363); - int16x4_t v3692 = vld1_s16((const int16_t *)v3372); - int16x4_t v3694 = vld1_s16((const int16_t *)v3381); - int16x4_t v3696 = vld1_s16((const int16_t *)v3390); - int16x4_t v3698 = vld1_s16((const int16_t *)v3399); - int16x4_t v3700 = vld1_s16((const int16_t *)v3408); - int16x4_t v3702 = vld1_s16((const int16_t *)v3417); - int16x4_t v3704 = vld1_s16((const int16_t *)v3426); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v3658), 15); - float32x4_t v44 = vcvtq_n_f32_s32(vmovl_s16(v3660), 15); - float32x4_t v52 = vcvtq_n_f32_s32(vmovl_s16(v3662), 15); - float32x4_t v60 = vcvtq_n_f32_s32(vmovl_s16(v3664), 15); - float32x4_t v209 = vcvtq_n_f32_s32(vmovl_s16(v3668), 15); - float32x4_t v217 = vcvtq_n_f32_s32(vmovl_s16(v3670), 15); - float32x4_t v225 = vcvtq_n_f32_s32(vmovl_s16(v3672), 15); - float32x4_t v233 = vcvtq_n_f32_s32(vmovl_s16(v3674), 15); - float32x4_t v374 = vcvtq_n_f32_s32(vmovl_s16(v3676), 15); - float32x4_t v382 = vcvtq_n_f32_s32(vmovl_s16(v3678), 15); - float32x4_t v390 = vcvtq_n_f32_s32(vmovl_s16(v3680), 15); - float32x4_t v398 = vcvtq_n_f32_s32(vmovl_s16(v3682), 15); - float32x4_t v406 = vcvtq_n_f32_s32(vmovl_s16(v3684), 15); - float32x4_t v547 = vcvtq_n_f32_s32(vmovl_s16(v3686), 15); - 
float32x4_t v555 = vcvtq_n_f32_s32(vmovl_s16(v3688), 15); - float32x4_t v563 = vcvtq_n_f32_s32(vmovl_s16(v3690), 15); - float32x4_t v571 = vcvtq_n_f32_s32(vmovl_s16(v3692), 15); - float32x4_t v579 = vcvtq_n_f32_s32(vmovl_s16(v3694), 15); - float32x4_t v720 = vcvtq_n_f32_s32(vmovl_s16(v3696), 15); - float32x4_t v728 = vcvtq_n_f32_s32(vmovl_s16(v3698), 15); - float32x4_t v736 = vcvtq_n_f32_s32(vmovl_s16(v3700), 15); - float32x4_t v744 = vcvtq_n_f32_s32(vmovl_s16(v3702), 15); - float32x4_t v752 = vcvtq_n_f32_s32(vmovl_s16(v3704), 15); - float32x4_t v71 = vrev64q_f32(v36); - float32x4_t v85 = vrev64q_f32(v44); - float32x4_t v99 = vrev64q_f32(v60); - float32x4_t v120 = vrev64q_f32(v52); - float32x4_t v244 = vrev64q_f32(v209); - float32x4_t v258 = vrev64q_f32(v217); - float32x4_t v272 = vrev64q_f32(v233); - float32x4_t v293 = vrev64q_f32(v225); - float32x4_t v417 = vrev64q_f32(v382); - float32x4_t v431 = vrev64q_f32(v390); - float32x4_t v445 = vrev64q_f32(v406); - float32x4_t v466 = vrev64q_f32(v398); - float32x4_t v590 = vrev64q_f32(v555); - float32x4_t v604 = vrev64q_f32(v563); - float32x4_t v618 = vrev64q_f32(v579); - float32x4_t v639 = vrev64q_f32(v571); - float32x4_t v763 = vrev64q_f32(v728); - float32x4_t v777 = vrev64q_f32(v736); - float32x4_t v791 = vrev64q_f32(v752); - float32x4_t v812 = vrev64q_f32(v744); - float32x4_t v73 = vmulq_f32(v71, v946); - float32x4_t v87 = vmulq_f32(v85, v946); - float32x4_t v101 = vmulq_f32(v99, v946); - float32x4_t v122 = vmulq_f32(v120, v946); - float32x4_t v246 = vmulq_f32(v244, v946); - float32x4_t v260 = vmulq_f32(v258, v946); - float32x4_t v274 = vmulq_f32(v272, v946); - float32x4_t v295 = vmulq_f32(v293, v946); - float32x4_t v419 = vmulq_f32(v417, v946); - float32x4_t v433 = vmulq_f32(v431, v946); - float32x4_t v447 = vmulq_f32(v445, v946); - float32x4_t v468 = vmulq_f32(v466, v946); - float32x4_t v592 = vmulq_f32(v590, v946); - float32x4_t v606 = vmulq_f32(v604, v946); - float32x4_t v620 = vmulq_f32(v618, v946); - float32x4_t 
v641 = vmulq_f32(v639, v946); - float32x4_t v765 = vmulq_f32(v763, v946); - float32x4_t v779 = vmulq_f32(v777, v946); - float32x4_t v793 = vmulq_f32(v791, v946); - float32x4_t v814 = vmulq_f32(v812, v946); - float32x4_t v74 = vaddq_f32(v73, v36); - float32x4_t v88 = vaddq_f32(v87, v44); - float32x4_t v102 = vaddq_f32(v101, v60); - float32x4_t v123 = vaddq_f32(v122, v52); - float32x4_t v247 = vaddq_f32(v246, v209); - float32x4_t v261 = vaddq_f32(v260, v217); - float32x4_t v275 = vaddq_f32(v274, v233); - float32x4_t v296 = vaddq_f32(v295, v225); - float32x4_t v420 = vaddq_f32(v419, v382); - float32x4_t v434 = vaddq_f32(v433, v390); - float32x4_t v448 = vaddq_f32(v447, v406); - float32x4_t v469 = vaddq_f32(v468, v398); - float32x4_t v593 = vaddq_f32(v592, v555); - float32x4_t v607 = vaddq_f32(v606, v563); - float32x4_t v621 = vaddq_f32(v620, v579); - float32x4_t v642 = vaddq_f32(v641, v571); - float32x4_t v766 = vaddq_f32(v765, v728); - float32x4_t v780 = vaddq_f32(v779, v736); - float32x4_t v794 = vaddq_f32(v793, v752); - float32x4_t v815 = vaddq_f32(v814, v744); - float32x4_t v103 = vsubq_f32(v74, v102); - float32x4_t v108 = vmulq_f32(v74, v1740); - float32x4_t v124 = vsubq_f32(v88, v123); - float32x4_t v129 = vmulq_f32(v88, v1740); - float32x4_t v276 = vsubq_f32(v247, v275); - float32x4_t v281 = vmulq_f32(v247, v1740); - float32x4_t v297 = vsubq_f32(v261, v296); - float32x4_t v302 = vmulq_f32(v261, v1740); - float32x4_t v449 = vsubq_f32(v420, v448); - float32x4_t v454 = vmulq_f32(v420, v1740); - float32x4_t v470 = vsubq_f32(v434, v469); - float32x4_t v475 = vmulq_f32(v434, v1740); - float32x4_t v622 = vsubq_f32(v593, v621); - float32x4_t v627 = vmulq_f32(v593, v1740); - float32x4_t v643 = vsubq_f32(v607, v642); - float32x4_t v648 = vmulq_f32(v607, v1740); - float32x4_t v795 = vsubq_f32(v766, v794); - float32x4_t v800 = vmulq_f32(v766, v1740); - float32x4_t v816 = vsubq_f32(v780, v815); - float32x4_t v821 = vmulq_f32(v780, v1740); - float32x4_t v109 = 
vsubq_f32(v108, v103); - float32x4_t v130 = vsubq_f32(v129, v124); - float32x4_t v143 = vmulq_f32(v124, v1677); - float32x4_t v161 = vmulq_f32(v103, v1677); - float32x4_t v282 = vsubq_f32(v281, v276); - float32x4_t v303 = vsubq_f32(v302, v297); - float32x4_t v316 = vmulq_f32(v297, v1677); - float32x4_t v334 = vmulq_f32(v276, v1677); - float32x4_t v455 = vsubq_f32(v454, v449); - float32x4_t v476 = vsubq_f32(v475, v470); - float32x4_t v489 = vmulq_f32(v470, v1677); - float32x4_t v507 = vmulq_f32(v449, v1677); - float32x4_t v628 = vsubq_f32(v627, v622); - float32x4_t v649 = vsubq_f32(v648, v643); - float32x4_t v662 = vmulq_f32(v643, v1677); - float32x4_t v680 = vmulq_f32(v622, v1677); - float32x4_t v801 = vsubq_f32(v800, v795); - float32x4_t v822 = vsubq_f32(v821, v816); - float32x4_t v835 = vmulq_f32(v816, v1677); - float32x4_t v853 = vmulq_f32(v795, v1677); - float32x4_t v131 = vaddq_f32(v109, v130); - float32x4_t v132 = vsubq_f32(v109, v130); - float32x4_t v144 = vaddq_f32(v103, v143); - float32x4_t v162 = vsubq_f32(v161, v124); - float32x4_t v304 = vaddq_f32(v282, v303); - float32x4_t v305 = vsubq_f32(v282, v303); - float32x4_t v317 = vaddq_f32(v276, v316); - float32x4_t v335 = vsubq_f32(v334, v297); - float32x4_t v477 = vaddq_f32(v455, v476); - float32x4_t v478 = vsubq_f32(v455, v476); - float32x4_t v490 = vaddq_f32(v449, v489); - float32x4_t v508 = vsubq_f32(v507, v470); - float32x4_t v650 = vaddq_f32(v628, v649); - float32x4_t v651 = vsubq_f32(v628, v649); - float32x4_t v663 = vaddq_f32(v622, v662); - float32x4_t v681 = vsubq_f32(v680, v643); - float32x4_t v823 = vaddq_f32(v801, v822); - float32x4_t v824 = vsubq_f32(v801, v822); - float32x4_t v836 = vaddq_f32(v795, v835); - float32x4_t v854 = vsubq_f32(v853, v816); - float32x4_t v137 = vmulq_f32(v131, v1653); - float32x4_t v149 = vmulq_f32(v132, v1665); - float32x4_t v163 = vaddq_f32(v28, v131); - float32x4_t v169 = vrev64q_f32(v144); - float32x4_t v178 = vrev64q_f32(v162); - float32x4_t v310 = vmulq_f32(v304, 
v1653); - float32x4_t v322 = vmulq_f32(v305, v1665); - float32x4_t v336 = vaddq_f32(v201, v304); - float32x4_t v342 = vrev64q_f32(v317); - float32x4_t v351 = vrev64q_f32(v335); - float32x4_t v483 = vmulq_f32(v477, v1653); - float32x4_t v495 = vmulq_f32(v478, v1665); - float32x4_t v509 = vaddq_f32(v374, v477); - float32x4_t v515 = vrev64q_f32(v490); - float32x4_t v524 = vrev64q_f32(v508); - float32x4_t v656 = vmulq_f32(v650, v1653); - float32x4_t v668 = vmulq_f32(v651, v1665); - float32x4_t v682 = vaddq_f32(v547, v650); - float32x4_t v688 = vrev64q_f32(v663); - float32x4_t v697 = vrev64q_f32(v681); - float32x4_t v829 = vmulq_f32(v823, v1653); - float32x4_t v841 = vmulq_f32(v824, v1665); - float32x4_t v855 = vaddq_f32(v720, v823); - float32x4_t v861 = vrev64q_f32(v836); - float32x4_t v870 = vrev64q_f32(v854); - float32x4_t v138 = vsubq_f32(v28, v137); - float32x4_t v171 = vmulq_f32(v169, v1712); - float32x4_t v180 = vmulq_f32(v178, v1712); - float32x4_t v311 = vsubq_f32(v201, v310); - float32x4_t v344 = vmulq_f32(v342, v1712); - float32x4_t v353 = vmulq_f32(v351, v1712); - float32x4_t v484 = vsubq_f32(v374, v483); - float32x4_t v517 = vmulq_f32(v515, v1712); - float32x4_t v526 = vmulq_f32(v524, v1712); - float32x4_t v657 = vsubq_f32(v547, v656); - float32x4_t v690 = vmulq_f32(v688, v1712); - float32x4_t v699 = vmulq_f32(v697, v1712); - float32x4_t v830 = vsubq_f32(v720, v829); - float32x4_t v863 = vmulq_f32(v861, v1712); - float32x4_t v872 = vmulq_f32(v870, v1712); - float32x4_t v896 = vrev64q_f32(v336); - float32x4_t v910 = vrev64q_f32(v509); - float32x4_t v924 = vrev64q_f32(v855); - float32x4_t v945 = vrev64q_f32(v682); - float32x4_t v150 = vsubq_f32(v138, v149); - float32x4_t v155 = vmulq_f32(v138, v1740); - float32x4_t v323 = vsubq_f32(v311, v322); - float32x4_t v328 = vmulq_f32(v311, v1740); - float32x4_t v496 = vsubq_f32(v484, v495); - float32x4_t v501 = vmulq_f32(v484, v1740); - float32x4_t v669 = vsubq_f32(v657, v668); - float32x4_t v674 = vmulq_f32(v657, 
v1740); - float32x4_t v842 = vsubq_f32(v830, v841); - float32x4_t v847 = vmulq_f32(v830, v1740); - float32x4_t v898 = vmulq_f32(v896, v946); - float32x4_t v912 = vmulq_f32(v910, v946); - float32x4_t v926 = vmulq_f32(v924, v946); - float32x4_t v947 = vmulq_f32(v945, v946); - float32x4_t v156 = vsubq_f32(v155, v150); - float32x4_t v181 = vsubq_f32(v150, v180); - float32x4_t v186 = vmulq_f32(v150, v1740); - float32x4_t v329 = vsubq_f32(v328, v323); - float32x4_t v354 = vsubq_f32(v323, v353); - float32x4_t v359 = vmulq_f32(v323, v1740); - float32x4_t v502 = vsubq_f32(v501, v496); - float32x4_t v527 = vsubq_f32(v496, v526); - float32x4_t v532 = vmulq_f32(v496, v1740); - float32x4_t v675 = vsubq_f32(v674, v669); - float32x4_t v700 = vsubq_f32(v669, v699); - float32x4_t v705 = vmulq_f32(v669, v1740); - float32x4_t v848 = vsubq_f32(v847, v842); - float32x4_t v873 = vsubq_f32(v842, v872); - float32x4_t v878 = vmulq_f32(v842, v1740); - float32x4_t v899 = vaddq_f32(v898, v336); - float32x4_t v913 = vaddq_f32(v912, v509); - float32x4_t v927 = vaddq_f32(v926, v855); - float32x4_t v948 = vaddq_f32(v947, v682); - float32x4_t v172 = vsubq_f32(v156, v171); - float32x4_t v187 = vsubq_f32(v186, v181); - float32x4_t v192 = vmulq_f32(v156, v1740); - float32x4_t v345 = vsubq_f32(v329, v344); - float32x4_t v360 = vsubq_f32(v359, v354); - float32x4_t v365 = vmulq_f32(v329, v1740); - float32x4_t v518 = vsubq_f32(v502, v517); - float32x4_t v533 = vsubq_f32(v532, v527); - float32x4_t v538 = vmulq_f32(v502, v1740); - float32x4_t v691 = vsubq_f32(v675, v690); - float32x4_t v706 = vsubq_f32(v705, v700); - float32x4_t v711 = vmulq_f32(v675, v1740); - float32x4_t v864 = vsubq_f32(v848, v863); - float32x4_t v879 = vsubq_f32(v878, v873); - float32x4_t v884 = vmulq_f32(v848, v1740); - float32x4_t v928 = vsubq_f32(v899, v927); - float32x4_t v933 = vmulq_f32(v899, v1740); - float32x4_t v949 = vsubq_f32(v913, v948); - float32x4_t v954 = vmulq_f32(v913, v1740); - float32x4_t v1242 = vrev64q_f32(v354); - 
float32x4_t v1256 = vrev64q_f32(v527); - float32x4_t v1270 = vrev64q_f32(v873); - float32x4_t v1291 = vrev64q_f32(v700); - float32x4_t v193 = vsubq_f32(v192, v172); - float32x4_t v366 = vsubq_f32(v365, v345); - float32x4_t v539 = vsubq_f32(v538, v518); - float32x4_t v712 = vsubq_f32(v711, v691); - float32x4_t v885 = vsubq_f32(v884, v864); - float32x4_t v934 = vsubq_f32(v933, v928); - float32x4_t v955 = vsubq_f32(v954, v949); - float32x4_t v968 = vmulq_f32(v949, v1677); - float32x4_t v986 = vmulq_f32(v928, v1677); - float32x4_t v1069 = vrev64q_f32(v345); - float32x4_t v1083 = vrev64q_f32(v518); - float32x4_t v1097 = vrev64q_f32(v864); - float32x4_t v1118 = vrev64q_f32(v691); - float32x4_t v1244 = vmulq_f32(v1242, v1243); - float32x4_t v1258 = vmulq_f32(v1256, v1589); - float32x4_t v1272 = vmulq_f32(v1270, v1603); - float32x4_t v1293 = vmulq_f32(v1291, v1430); - float32x4_t v1415 = vrev64q_f32(v360); - float32x4_t v1429 = vrev64q_f32(v533); - float32x4_t v1443 = vrev64q_f32(v879); - float32x4_t v1464 = vrev64q_f32(v706); - float32x4_t v956 = vaddq_f32(v934, v955); - float32x4_t v957 = vsubq_f32(v934, v955); - float32x4_t v969 = vaddq_f32(v928, v968); - float32x4_t v987 = vsubq_f32(v986, v949); - float32x4_t v1071 = vmulq_f32(v1069, v1070); - float32x4_t v1085 = vmulq_f32(v1083, v1243); - float32x4_t v1099 = vmulq_f32(v1097, v1589); - float32x4_t v1120 = vmulq_f32(v1118, v1416); - float32x4_t v1245 = vfmaq_f32(v1244, v354, v1235); - float32x4_t v1259 = vfmaq_f32(v1258, v527, v1581); - float32x4_t v1273 = vfmaq_f32(v1272, v873, v1595); - float32x4_t v1294 = vfmaq_f32(v1293, v700, v1422); - float32x4_t v1417 = vmulq_f32(v1415, v1416); - float32x4_t v1431 = vmulq_f32(v1429, v1430); - float32x4_t v1445 = vmulq_f32(v1443, v1638); - float32x4_t v1466 = vmulq_f32(v1464, v1465); - float32x4_t v1588 = vrev64q_f32(v366); - float32x4_t v1602 = vrev64q_f32(v539); - float32x4_t v1616 = vrev64q_f32(v885); - float32x4_t v1637 = vrev64q_f32(v712); - float32x4_t v962 = vmulq_f32(v956, 
v1653); - float32x4_t v974 = vmulq_f32(v957, v1665); - float32x4_t v988 = vaddq_f32(v163, v956); - float32x4_t v1002 = vrev64q_f32(v969); - float32x4_t v1019 = vrev64q_f32(v987); - float32x4_t v1072 = vfmaq_f32(v1071, v345, v1062); - float32x4_t v1086 = vfmaq_f32(v1085, v518, v1235); - float32x4_t v1100 = vfmaq_f32(v1099, v864, v1581); - float32x4_t v1121 = vfmaq_f32(v1120, v691, v1408); - float32x4_t v1274 = vsubq_f32(v1245, v1273); - float32x4_t v1279 = vmulq_f32(v1245, v1740); - float32x4_t v1295 = vsubq_f32(v1259, v1294); - float32x4_t v1300 = vmulq_f32(v1259, v1740); - float32x4_t v1418 = vfmaq_f32(v1417, v360, v1408); - float32x4_t v1432 = vfmaq_f32(v1431, v533, v1422); - float32x4_t v1446 = vfmaq_f32(v1445, v879, v1630); - float32x4_t v1467 = vfmaq_f32(v1466, v706, v1609); - float32x4_t v1590 = vmulq_f32(v1588, v1589); - float32x4_t v1604 = vmulq_f32(v1602, v1603); - float32x4_t v1618 = vmulq_f32(v1616, v1617); - float32x4_t v1639 = vmulq_f32(v1637, v1638); - float32x4_t v963 = vsubq_f32(v163, v962); - int16x4_t v991 = vqmovn_s32(vcvtq_n_s32_f32(v988, 15)); - float32x4_t v1004 = vmulq_f32(v1002, v1712); - float32x4_t v1021 = vmulq_f32(v1019, v1712); - float32x4_t v1101 = vsubq_f32(v1072, v1100); - float32x4_t v1106 = vmulq_f32(v1072, v1740); - float32x4_t v1122 = vsubq_f32(v1086, v1121); - float32x4_t v1127 = vmulq_f32(v1086, v1740); - float32x4_t v1280 = vsubq_f32(v1279, v1274); - float32x4_t v1301 = vsubq_f32(v1300, v1295); - float32x4_t v1314 = vmulq_f32(v1295, v1677); - float32x4_t v1332 = vmulq_f32(v1274, v1677); - float32x4_t v1447 = vsubq_f32(v1418, v1446); - float32x4_t v1452 = vmulq_f32(v1418, v1740); - float32x4_t v1468 = vsubq_f32(v1432, v1467); - float32x4_t v1473 = vmulq_f32(v1432, v1740); - float32x4_t v1591 = vfmaq_f32(v1590, v366, v1581); - float32x4_t v1605 = vfmaq_f32(v1604, v539, v1595); - float32x4_t v1619 = vfmaq_f32(v1618, v885, v1609); - float32x4_t v1640 = vfmaq_f32(v1639, v712, v1630); - float32x4_t v975 = vsubq_f32(v963, v974); - 
float32x4_t v980 = vmulq_f32(v963, v1740); - float32x4_t v1107 = vsubq_f32(v1106, v1101); - float32x4_t v1128 = vsubq_f32(v1127, v1122); - float32x4_t v1141 = vmulq_f32(v1122, v1677); - float32x4_t v1159 = vmulq_f32(v1101, v1677); - float32x4_t v1302 = vaddq_f32(v1280, v1301); - float32x4_t v1303 = vsubq_f32(v1280, v1301); - float32x4_t v1315 = vaddq_f32(v1274, v1314); - float32x4_t v1333 = vsubq_f32(v1332, v1295); - float32x4_t v1453 = vsubq_f32(v1452, v1447); - float32x4_t v1474 = vsubq_f32(v1473, v1468); - float32x4_t v1487 = vmulq_f32(v1468, v1677); - float32x4_t v1505 = vmulq_f32(v1447, v1677); - float32x4_t v1620 = vsubq_f32(v1591, v1619); - float32x4_t v1625 = vmulq_f32(v1591, v1740); - float32x4_t v1641 = vsubq_f32(v1605, v1640); - float32x4_t v1646 = vmulq_f32(v1605, v1740); - vst1_s16((int16_t *)v3436, v991); - float32x4_t v981 = vsubq_f32(v980, v975); - float32x4_t v1022 = vsubq_f32(v975, v1021); - float32x4_t v1035 = vmulq_f32(v975, v1740); - float32x4_t v1129 = vaddq_f32(v1107, v1128); - float32x4_t v1130 = vsubq_f32(v1107, v1128); - float32x4_t v1142 = vaddq_f32(v1101, v1141); - float32x4_t v1160 = vsubq_f32(v1159, v1122); - float32x4_t v1308 = vmulq_f32(v1302, v1653); - float32x4_t v1320 = vmulq_f32(v1303, v1665); - float32x4_t v1334 = vaddq_f32(v181, v1302); - float32x4_t v1348 = vrev64q_f32(v1315); - float32x4_t v1365 = vrev64q_f32(v1333); - float32x4_t v1475 = vaddq_f32(v1453, v1474); - float32x4_t v1476 = vsubq_f32(v1453, v1474); - float32x4_t v1488 = vaddq_f32(v1447, v1487); - float32x4_t v1506 = vsubq_f32(v1505, v1468); - float32x4_t v1626 = vsubq_f32(v1625, v1620); - float32x4_t v1647 = vsubq_f32(v1646, v1641); - float32x4_t v1660 = vmulq_f32(v1641, v1677); - float32x4_t v1678 = vmulq_f32(v1620, v1677); - float32x4_t v1005 = vsubq_f32(v981, v1004); - int16x4_t v1025 = vqmovn_s32(vcvtq_n_s32_f32(v1022, 15)); - float32x4_t v1036 = vsubq_f32(v1035, v1022); - float32x4_t v1049 = vmulq_f32(v981, v1740); - float32x4_t v1135 = vmulq_f32(v1129, 
v1653); - float32x4_t v1147 = vmulq_f32(v1130, v1665); - float32x4_t v1161 = vaddq_f32(v172, v1129); - float32x4_t v1175 = vrev64q_f32(v1142); - float32x4_t v1192 = vrev64q_f32(v1160); - float32x4_t v1309 = vsubq_f32(v181, v1308); - int16x4_t v1337 = vqmovn_s32(vcvtq_n_s32_f32(v1334, 15)); - float32x4_t v1350 = vmulq_f32(v1348, v1712); - float32x4_t v1367 = vmulq_f32(v1365, v1712); - float32x4_t v1481 = vmulq_f32(v1475, v1653); - float32x4_t v1493 = vmulq_f32(v1476, v1665); - float32x4_t v1507 = vaddq_f32(v187, v1475); - float32x4_t v1521 = vrev64q_f32(v1488); - float32x4_t v1538 = vrev64q_f32(v1506); - float32x4_t v1648 = vaddq_f32(v1626, v1647); - float32x4_t v1649 = vsubq_f32(v1626, v1647); - float32x4_t v1661 = vaddq_f32(v1620, v1660); - float32x4_t v1679 = vsubq_f32(v1678, v1641); - int16x4_t v1008 = vqmovn_s32(vcvtq_n_s32_f32(v1005, 15)); - int16x4_t v1039 = vqmovn_s32(vcvtq_n_s32_f32(v1036, 15)); - float32x4_t v1050 = vsubq_f32(v1049, v1005); - float32x4_t v1136 = vsubq_f32(v172, v1135); - int16x4_t v1164 = vqmovn_s32(vcvtq_n_s32_f32(v1161, 15)); - float32x4_t v1177 = vmulq_f32(v1175, v1712); - float32x4_t v1194 = vmulq_f32(v1192, v1712); - float32x4_t v1321 = vsubq_f32(v1309, v1320); - float32x4_t v1326 = vmulq_f32(v1309, v1740); - float32x4_t v1482 = vsubq_f32(v187, v1481); - int16x4_t v1510 = vqmovn_s32(vcvtq_n_s32_f32(v1507, 15)); - float32x4_t v1523 = vmulq_f32(v1521, v1712); - float32x4_t v1540 = vmulq_f32(v1538, v1712); - float32x4_t v1654 = vmulq_f32(v1648, v1653); - float32x4_t v1666 = vmulq_f32(v1649, v1665); - float32x4_t v1680 = vaddq_f32(v193, v1648); - float32x4_t v1694 = vrev64q_f32(v1661); - float32x4_t v1711 = vrev64q_f32(v1679); - vst1_s16((int16_t *)v3454, v1025); - vst1_s16((int16_t *)v3526, v1337); - int16x4_t v1053 = vqmovn_s32(vcvtq_n_s32_f32(v1050, 15)); - float32x4_t v1148 = vsubq_f32(v1136, v1147); - float32x4_t v1153 = vmulq_f32(v1136, v1740); - float32x4_t v1327 = vsubq_f32(v1326, v1321); - float32x4_t v1368 = vsubq_f32(v1321, 
v1367); - float32x4_t v1381 = vmulq_f32(v1321, v1740); - float32x4_t v1494 = vsubq_f32(v1482, v1493); - float32x4_t v1499 = vmulq_f32(v1482, v1740); - float32x4_t v1655 = vsubq_f32(v193, v1654); - int16x4_t v1683 = vqmovn_s32(vcvtq_n_s32_f32(v1680, 15)); - float32x4_t v1696 = vmulq_f32(v1694, v1712); - float32x4_t v1713 = vmulq_f32(v1711, v1712); - vst1_s16((int16_t *)v3445, v1008); - vst1_s16((int16_t *)v3463, v1039); - vst1_s16((int16_t *)v3481, v1164); - vst1_s16((int16_t *)v3571, v1510); - float32x4_t v1154 = vsubq_f32(v1153, v1148); - float32x4_t v1195 = vsubq_f32(v1148, v1194); - float32x4_t v1208 = vmulq_f32(v1148, v1740); - float32x4_t v1351 = vsubq_f32(v1327, v1350); - int16x4_t v1371 = vqmovn_s32(vcvtq_n_s32_f32(v1368, 15)); - float32x4_t v1382 = vsubq_f32(v1381, v1368); - float32x4_t v1395 = vmulq_f32(v1327, v1740); - float32x4_t v1500 = vsubq_f32(v1499, v1494); - float32x4_t v1541 = vsubq_f32(v1494, v1540); - float32x4_t v1554 = vmulq_f32(v1494, v1740); - float32x4_t v1667 = vsubq_f32(v1655, v1666); - float32x4_t v1672 = vmulq_f32(v1655, v1740); - vst1_s16((int16_t *)v3472, v1053); - vst1_s16((int16_t *)v3616, v1683); - float32x4_t v1178 = vsubq_f32(v1154, v1177); - int16x4_t v1198 = vqmovn_s32(vcvtq_n_s32_f32(v1195, 15)); - float32x4_t v1209 = vsubq_f32(v1208, v1195); - float32x4_t v1222 = vmulq_f32(v1154, v1740); - int16x4_t v1354 = vqmovn_s32(vcvtq_n_s32_f32(v1351, 15)); - int16x4_t v1385 = vqmovn_s32(vcvtq_n_s32_f32(v1382, 15)); - float32x4_t v1396 = vsubq_f32(v1395, v1351); - float32x4_t v1524 = vsubq_f32(v1500, v1523); - int16x4_t v1544 = vqmovn_s32(vcvtq_n_s32_f32(v1541, 15)); - float32x4_t v1555 = vsubq_f32(v1554, v1541); - float32x4_t v1568 = vmulq_f32(v1500, v1740); - float32x4_t v1673 = vsubq_f32(v1672, v1667); - float32x4_t v1714 = vsubq_f32(v1667, v1713); - float32x4_t v1727 = vmulq_f32(v1667, v1740); - vst1_s16((int16_t *)v3544, v1371); - int16x4_t v1181 = vqmovn_s32(vcvtq_n_s32_f32(v1178, 15)); - int16x4_t v1212 = 
vqmovn_s32(vcvtq_n_s32_f32(v1209, 15)); - float32x4_t v1223 = vsubq_f32(v1222, v1178); - int16x4_t v1399 = vqmovn_s32(vcvtq_n_s32_f32(v1396, 15)); - int16x4_t v1527 = vqmovn_s32(vcvtq_n_s32_f32(v1524, 15)); - int16x4_t v1558 = vqmovn_s32(vcvtq_n_s32_f32(v1555, 15)); - float32x4_t v1569 = vsubq_f32(v1568, v1524); - float32x4_t v1697 = vsubq_f32(v1673, v1696); - int16x4_t v1717 = vqmovn_s32(vcvtq_n_s32_f32(v1714, 15)); - float32x4_t v1728 = vsubq_f32(v1727, v1714); - float32x4_t v1741 = vmulq_f32(v1673, v1740); - vst1_s16((int16_t *)v3499, v1198); - vst1_s16((int16_t *)v3535, v1354); - vst1_s16((int16_t *)v3553, v1385); - vst1_s16((int16_t *)v3589, v1544); - int16x4_t v1226 = vqmovn_s32(vcvtq_n_s32_f32(v1223, 15)); - int16x4_t v1572 = vqmovn_s32(vcvtq_n_s32_f32(v1569, 15)); - int16x4_t v1700 = vqmovn_s32(vcvtq_n_s32_f32(v1697, 15)); - int16x4_t v1731 = vqmovn_s32(vcvtq_n_s32_f32(v1728, 15)); - float32x4_t v1742 = vsubq_f32(v1741, v1697); - vst1_s16((int16_t *)v3490, v1181); - vst1_s16((int16_t *)v3508, v1212); - vst1_s16((int16_t *)v3562, v1399); - vst1_s16((int16_t *)v3580, v1527); - vst1_s16((int16_t *)v3598, v1558); - vst1_s16((int16_t *)v3634, v1717); - int16x4_t v1745 = vqmovn_s32(vcvtq_n_s32_f32(v1742, 15)); - vst1_s16((int16_t *)v3517, v1226); - vst1_s16((int16_t *)v3607, v1572); - vst1_s16((int16_t *)v3625, v1700); - vst1_s16((int16_t *)v3643, v1731); - vst1_s16((int16_t *)v3652, v1745); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v1751 * 2; j < howmany; j += 1) { - int16x4_t v1907 = vld1s_s16(&v5[istride]); - float v2526 = 0.0000000000000000e+00F; - float v2624 = 9.6858316112863108e-01F; - float v2627 = -2.4868988716485479e-01F; - float v2628 = 2.4868988716485479e-01F; - float v2768 = 8.7630668004386358e-01F; - float v2771 = -4.8175367410171532e-01F; - float v2772 = 4.8175367410171532e-01F; - float v2912 = 7.2896862742141155e-01F; - float v2915 = -6.8454710592868862e-01F; - float v2916 = 6.8454710592868862e-01F; - float v2924 = 6.2790519529313527e-02F; 
- float v2927 = -9.9802672842827156e-01F; - float v2928 = 9.9802672842827156e-01F; - float v3056 = 5.3582679497899655e-01F; - float v3059 = -8.4432792550201508e-01F; - float v3060 = 8.4432792550201508e-01F; - float v3068 = -4.2577929156507272e-01F; - float v3071 = -9.0482705246601947e-01F; - float v3072 = 9.0482705246601947e-01F; - float v3080 = -6.3742398974868952e-01F; - float v3083 = 7.7051324277578936e-01F; - float v3084 = -7.7051324277578936e-01F; - float v3098 = -9.9211470131447776e-01F; - float v3101 = -1.2533323356430454e-01F; - float v3102 = 1.2533323356430454e-01F; - float v3118 = 2.5000000000000000e-01F; - float v3128 = 5.5901699437494745e-01F; - float v3138 = 6.1803398874989490e-01F; - float v3163 = 9.5105651629515353e-01F; - float v3164 = -9.5105651629515353e-01F; - float32x2_t v3166 = (float32x2_t){v4, v4}; - float v3189 = 2.0000000000000000e+00F; - int16x4_t v1763 = vld1s_s16(&v5[0]); - float32x2_t v1908 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1907)), 15); - float v2529 = dir * v2526; - float32x2_t v2625 = (float32x2_t){v2624, v2624}; - float32x2_t v2629 = (float32x2_t){v2627, v2628}; - float32x2_t v2769 = (float32x2_t){v2768, v2768}; - float32x2_t v2773 = (float32x2_t){v2771, v2772}; - float32x2_t v2913 = (float32x2_t){v2912, v2912}; - float32x2_t v2917 = (float32x2_t){v2915, v2916}; - float32x2_t v2925 = (float32x2_t){v2924, v2924}; - float32x2_t v2929 = (float32x2_t){v2927, v2928}; - float32x2_t v2959 = (float32x2_t){v3084, v3083}; - float32x2_t v3057 = (float32x2_t){v3056, v3056}; - float32x2_t v3061 = (float32x2_t){v3059, v3060}; - float32x2_t v3069 = (float32x2_t){v3068, v3068}; - float32x2_t v3073 = (float32x2_t){v3071, v3072}; - float32x2_t v3081 = (float32x2_t){v3080, v3080}; - float32x2_t v3085 = (float32x2_t){v3083, v3084}; - float32x2_t v3099 = (float32x2_t){v3098, v3098}; - float32x2_t v3103 = (float32x2_t){v3101, v3102}; - float32x2_t v3119 = (float32x2_t){v3118, v3118}; - float32x2_t v3129 = (float32x2_t){v3128, v3128}; - float32x2_t 
v3139 = (float32x2_t){v3138, v3138}; - float32x2_t v3165 = (float32x2_t){v3163, v3164}; - float32x2_t v3190 = (float32x2_t){v3189, v3189}; - float32x2_t v1764 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1763)), 15); - int16x4_t v1769 = vld1s_s16(&v5[istride * 5]); - int16x4_t v1775 = vld1s_s16(&v5[istride * 10]); - int16x4_t v1781 = vld1s_s16(&v5[istride * 15]); - int16x4_t v1787 = vld1s_s16(&v5[istride * 20]); - int16x4_t v1913 = vld1s_s16(&v5[istride * 6]); - int16x4_t v1919 = vld1s_s16(&v5[istride * 11]); - int16x4_t v1925 = vld1s_s16(&v5[istride * 16]); - int16x4_t v1931 = vld1s_s16(&v5[istride * 21]); - int16x4_t v2051 = vld1s_s16(&v5[istride * 2]); - int16x4_t v2057 = vld1s_s16(&v5[istride * 7]); - int16x4_t v2063 = vld1s_s16(&v5[istride * 12]); - int16x4_t v2069 = vld1s_s16(&v5[istride * 17]); - int16x4_t v2075 = vld1s_s16(&v5[istride * 22]); - int16x4_t v2195 = vld1s_s16(&v5[istride * 3]); - int16x4_t v2201 = vld1s_s16(&v5[istride * 8]); - int16x4_t v2207 = vld1s_s16(&v5[istride * 13]); - int16x4_t v2213 = vld1s_s16(&v5[istride * 18]); - int16x4_t v2219 = vld1s_s16(&v5[istride * 23]); - int16x4_t v2339 = vld1s_s16(&v5[istride * 4]); - int16x4_t v2345 = vld1s_s16(&v5[istride * 9]); - int16x4_t v2351 = vld1s_s16(&v5[istride * 14]); - int16x4_t v2357 = vld1s_s16(&v5[istride * 19]); - int16x4_t v2363 = vld1s_s16(&v5[istride * 24]); - float32x2_t v2527 = (float32x2_t){v2526, v2529}; - float32x2_t v2631 = vmul_f32(v3166, v2629); - float32x2_t v2775 = vmul_f32(v3166, v2773); - float32x2_t v2919 = vmul_f32(v3166, v2917); - float32x2_t v2931 = vmul_f32(v3166, v2929); - float32x2_t v2961 = vmul_f32(v3166, v2959); - float32x2_t v3063 = vmul_f32(v3166, v3061); - float32x2_t v3075 = vmul_f32(v3166, v3073); - float32x2_t v3087 = vmul_f32(v3166, v3085); - float32x2_t v3105 = vmul_f32(v3166, v3103); - float32x2_t v3167 = vmul_f32(v3166, v3165); - float32x2_t v1770 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1769)), 15); - float32x2_t v1776 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1775)), 15); - float32x2_t v1782 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1781)), 15); - float32x2_t v1788 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1787)), 15); - float32x2_t v1914 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1913)), 15); - float32x2_t v1920 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1919)), 15); - float32x2_t v1926 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1925)), 15); - float32x2_t v1932 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1931)), 15); - float32x2_t v2052 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2051)), 15); - float32x2_t v2058 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2057)), 15); - float32x2_t v2064 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2063)), 15); - float32x2_t v2070 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2069)), 15); - float32x2_t v2076 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2075)), 15); - float32x2_t v2196 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2195)), 15); - float32x2_t v2202 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2201)), 15); - float32x2_t v2208 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2207)), 15); - float32x2_t v2214 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2213)), 15); - float32x2_t v2220 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2219)), 15); - float32x2_t v2340 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2339)), 15); - float32x2_t v2346 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2345)), 15); - float32x2_t v2352 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2351)), 15); - float32x2_t v2358 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2357)), 15); - float32x2_t v2364 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v2363)), 15); - float32x2_t v1798 = vrev64_f32(v1770); - float32x2_t v1810 = vrev64_f32(v1776); - float32x2_t v1822 = vrev64_f32(v1788); - float32x2_t v1840 = vrev64_f32(v1782); - float32x2_t v1942 = vrev64_f32(v1914); - float32x2_t v1954 = vrev64_f32(v1920); - float32x2_t v1966 = vrev64_f32(v1932); - float32x2_t v1984 = vrev64_f32(v1926); - float32x2_t v2086 = vrev64_f32(v2058); - float32x2_t v2098 = 
vrev64_f32(v2064); - float32x2_t v2110 = vrev64_f32(v2076); - float32x2_t v2128 = vrev64_f32(v2070); - float32x2_t v2230 = vrev64_f32(v2202); - float32x2_t v2242 = vrev64_f32(v2208); - float32x2_t v2254 = vrev64_f32(v2220); - float32x2_t v2272 = vrev64_f32(v2214); - float32x2_t v2374 = vrev64_f32(v2346); - float32x2_t v2386 = vrev64_f32(v2352); - float32x2_t v2398 = vrev64_f32(v2364); - float32x2_t v2416 = vrev64_f32(v2358); - float32x2_t v1799 = vmul_f32(v1798, v2527); - float32x2_t v1811 = vmul_f32(v1810, v2527); - float32x2_t v1823 = vmul_f32(v1822, v2527); - float32x2_t v1841 = vmul_f32(v1840, v2527); - float32x2_t v1943 = vmul_f32(v1942, v2527); - float32x2_t v1955 = vmul_f32(v1954, v2527); - float32x2_t v1967 = vmul_f32(v1966, v2527); - float32x2_t v1985 = vmul_f32(v1984, v2527); - float32x2_t v2087 = vmul_f32(v2086, v2527); - float32x2_t v2099 = vmul_f32(v2098, v2527); - float32x2_t v2111 = vmul_f32(v2110, v2527); - float32x2_t v2129 = vmul_f32(v2128, v2527); - float32x2_t v2231 = vmul_f32(v2230, v2527); - float32x2_t v2243 = vmul_f32(v2242, v2527); - float32x2_t v2255 = vmul_f32(v2254, v2527); - float32x2_t v2273 = vmul_f32(v2272, v2527); - float32x2_t v2375 = vmul_f32(v2374, v2527); - float32x2_t v2387 = vmul_f32(v2386, v2527); - float32x2_t v2399 = vmul_f32(v2398, v2527); - float32x2_t v2417 = vmul_f32(v2416, v2527); - float32x2_t v1800 = vadd_f32(v1799, v1770); - float32x2_t v1812 = vadd_f32(v1811, v1776); - float32x2_t v1824 = vadd_f32(v1823, v1788); - float32x2_t v1842 = vadd_f32(v1841, v1782); - float32x2_t v1944 = vadd_f32(v1943, v1914); - float32x2_t v1956 = vadd_f32(v1955, v1920); - float32x2_t v1968 = vadd_f32(v1967, v1932); - float32x2_t v1986 = vadd_f32(v1985, v1926); - float32x2_t v2088 = vadd_f32(v2087, v2058); - float32x2_t v2100 = vadd_f32(v2099, v2064); - float32x2_t v2112 = vadd_f32(v2111, v2076); - float32x2_t v2130 = vadd_f32(v2129, v2070); - float32x2_t v2232 = vadd_f32(v2231, v2202); - float32x2_t v2244 = vadd_f32(v2243, v2208); - 
float32x2_t v2256 = vadd_f32(v2255, v2220); - float32x2_t v2274 = vadd_f32(v2273, v2214); - float32x2_t v2376 = vadd_f32(v2375, v2346); - float32x2_t v2388 = vadd_f32(v2387, v2352); - float32x2_t v2400 = vadd_f32(v2399, v2364); - float32x2_t v2418 = vadd_f32(v2417, v2358); - float32x2_t v1825 = vsub_f32(v1800, v1824); - float32x2_t v1829 = vmul_f32(v1800, v3190); - float32x2_t v1843 = vsub_f32(v1812, v1842); - float32x2_t v1847 = vmul_f32(v1812, v3190); - float32x2_t v1969 = vsub_f32(v1944, v1968); - float32x2_t v1973 = vmul_f32(v1944, v3190); - float32x2_t v1987 = vsub_f32(v1956, v1986); - float32x2_t v1991 = vmul_f32(v1956, v3190); - float32x2_t v2113 = vsub_f32(v2088, v2112); - float32x2_t v2117 = vmul_f32(v2088, v3190); - float32x2_t v2131 = vsub_f32(v2100, v2130); - float32x2_t v2135 = vmul_f32(v2100, v3190); - float32x2_t v2257 = vsub_f32(v2232, v2256); - float32x2_t v2261 = vmul_f32(v2232, v3190); - float32x2_t v2275 = vsub_f32(v2244, v2274); - float32x2_t v2279 = vmul_f32(v2244, v3190); - float32x2_t v2401 = vsub_f32(v2376, v2400); - float32x2_t v2405 = vmul_f32(v2376, v3190); - float32x2_t v2419 = vsub_f32(v2388, v2418); - float32x2_t v2423 = vmul_f32(v2388, v3190); - float32x2_t v1830 = vsub_f32(v1829, v1825); - float32x2_t v1848 = vsub_f32(v1847, v1843); - float32x2_t v1859 = vmul_f32(v1843, v3139); - float32x2_t v1874 = vmul_f32(v1825, v3139); - float32x2_t v1974 = vsub_f32(v1973, v1969); - float32x2_t v1992 = vsub_f32(v1991, v1987); - float32x2_t v2003 = vmul_f32(v1987, v3139); - float32x2_t v2018 = vmul_f32(v1969, v3139); - float32x2_t v2118 = vsub_f32(v2117, v2113); - float32x2_t v2136 = vsub_f32(v2135, v2131); - float32x2_t v2147 = vmul_f32(v2131, v3139); - float32x2_t v2162 = vmul_f32(v2113, v3139); - float32x2_t v2262 = vsub_f32(v2261, v2257); - float32x2_t v2280 = vsub_f32(v2279, v2275); - float32x2_t v2291 = vmul_f32(v2275, v3139); - float32x2_t v2306 = vmul_f32(v2257, v3139); - float32x2_t v2406 = vsub_f32(v2405, v2401); - float32x2_t v2424 = 
vsub_f32(v2423, v2419); - float32x2_t v2435 = vmul_f32(v2419, v3139); - float32x2_t v2450 = vmul_f32(v2401, v3139); - float32x2_t v1849 = vadd_f32(v1830, v1848); - float32x2_t v1850 = vsub_f32(v1830, v1848); - float32x2_t v1860 = vadd_f32(v1825, v1859); - float32x2_t v1875 = vsub_f32(v1874, v1843); - float32x2_t v1993 = vadd_f32(v1974, v1992); - float32x2_t v1994 = vsub_f32(v1974, v1992); - float32x2_t v2004 = vadd_f32(v1969, v2003); - float32x2_t v2019 = vsub_f32(v2018, v1987); - float32x2_t v2137 = vadd_f32(v2118, v2136); - float32x2_t v2138 = vsub_f32(v2118, v2136); - float32x2_t v2148 = vadd_f32(v2113, v2147); - float32x2_t v2163 = vsub_f32(v2162, v2131); - float32x2_t v2281 = vadd_f32(v2262, v2280); - float32x2_t v2282 = vsub_f32(v2262, v2280); - float32x2_t v2292 = vadd_f32(v2257, v2291); - float32x2_t v2307 = vsub_f32(v2306, v2275); - float32x2_t v2425 = vadd_f32(v2406, v2424); - float32x2_t v2426 = vsub_f32(v2406, v2424); - float32x2_t v2436 = vadd_f32(v2401, v2435); - float32x2_t v2451 = vsub_f32(v2450, v2419); - float32x2_t v1854 = vmul_f32(v1849, v3119); - float32x2_t v1864 = vmul_f32(v1850, v3129); - float32x2_t v1876 = vadd_f32(v1764, v1849); - float32x2_t v1882 = vrev64_f32(v1860); - float32x2_t v1890 = vrev64_f32(v1875); - float32x2_t v1998 = vmul_f32(v1993, v3119); - float32x2_t v2008 = vmul_f32(v1994, v3129); - float32x2_t v2020 = vadd_f32(v1908, v1993); - float32x2_t v2026 = vrev64_f32(v2004); - float32x2_t v2034 = vrev64_f32(v2019); - float32x2_t v2142 = vmul_f32(v2137, v3119); - float32x2_t v2152 = vmul_f32(v2138, v3129); - float32x2_t v2164 = vadd_f32(v2052, v2137); - float32x2_t v2170 = vrev64_f32(v2148); - float32x2_t v2178 = vrev64_f32(v2163); - float32x2_t v2286 = vmul_f32(v2281, v3119); - float32x2_t v2296 = vmul_f32(v2282, v3129); - float32x2_t v2308 = vadd_f32(v2196, v2281); - float32x2_t v2314 = vrev64_f32(v2292); - float32x2_t v2322 = vrev64_f32(v2307); - float32x2_t v2430 = vmul_f32(v2425, v3119); - float32x2_t v2440 = vmul_f32(v2426, 
v3129); - float32x2_t v2452 = vadd_f32(v2340, v2425); - float32x2_t v2458 = vrev64_f32(v2436); - float32x2_t v2466 = vrev64_f32(v2451); - float32x2_t v1855 = vsub_f32(v1764, v1854); - float32x2_t v1883 = vmul_f32(v1882, v3167); - float32x2_t v1891 = vmul_f32(v1890, v3167); - float32x2_t v1999 = vsub_f32(v1908, v1998); - float32x2_t v2027 = vmul_f32(v2026, v3167); - float32x2_t v2035 = vmul_f32(v2034, v3167); - float32x2_t v2143 = vsub_f32(v2052, v2142); - float32x2_t v2171 = vmul_f32(v2170, v3167); - float32x2_t v2179 = vmul_f32(v2178, v3167); - float32x2_t v2287 = vsub_f32(v2196, v2286); - float32x2_t v2315 = vmul_f32(v2314, v3167); - float32x2_t v2323 = vmul_f32(v2322, v3167); - float32x2_t v2431 = vsub_f32(v2340, v2430); - float32x2_t v2459 = vmul_f32(v2458, v3167); - float32x2_t v2467 = vmul_f32(v2466, v3167); - float32x2_t v2488 = vrev64_f32(v2020); - float32x2_t v2500 = vrev64_f32(v2164); - float32x2_t v2512 = vrev64_f32(v2452); - float32x2_t v2530 = vrev64_f32(v2308); - float32x2_t v1865 = vsub_f32(v1855, v1864); - float32x2_t v1869 = vmul_f32(v1855, v3190); - float32x2_t v2009 = vsub_f32(v1999, v2008); - float32x2_t v2013 = vmul_f32(v1999, v3190); - float32x2_t v2153 = vsub_f32(v2143, v2152); - float32x2_t v2157 = vmul_f32(v2143, v3190); - float32x2_t v2297 = vsub_f32(v2287, v2296); - float32x2_t v2301 = vmul_f32(v2287, v3190); - float32x2_t v2441 = vsub_f32(v2431, v2440); - float32x2_t v2445 = vmul_f32(v2431, v3190); - float32x2_t v2489 = vmul_f32(v2488, v2527); - float32x2_t v2501 = vmul_f32(v2500, v2527); - float32x2_t v2513 = vmul_f32(v2512, v2527); - float32x2_t v2531 = vmul_f32(v2530, v2527); - float32x2_t v1870 = vsub_f32(v1869, v1865); - float32x2_t v1892 = vsub_f32(v1865, v1891); - float32x2_t v1896 = vmul_f32(v1865, v3190); - float32x2_t v2014 = vsub_f32(v2013, v2009); - float32x2_t v2036 = vsub_f32(v2009, v2035); - float32x2_t v2040 = vmul_f32(v2009, v3190); - float32x2_t v2158 = vsub_f32(v2157, v2153); - float32x2_t v2180 = vsub_f32(v2153, 
v2179); - float32x2_t v2184 = vmul_f32(v2153, v3190); - float32x2_t v2302 = vsub_f32(v2301, v2297); - float32x2_t v2324 = vsub_f32(v2297, v2323); - float32x2_t v2328 = vmul_f32(v2297, v3190); - float32x2_t v2446 = vsub_f32(v2445, v2441); - float32x2_t v2468 = vsub_f32(v2441, v2467); - float32x2_t v2472 = vmul_f32(v2441, v3190); - float32x2_t v2490 = vadd_f32(v2489, v2020); - float32x2_t v2502 = vadd_f32(v2501, v2164); - float32x2_t v2514 = vadd_f32(v2513, v2452); - float32x2_t v2532 = vadd_f32(v2531, v2308); - float32x2_t v1884 = vsub_f32(v1870, v1883); - float32x2_t v1897 = vsub_f32(v1896, v1892); - float32x2_t v1901 = vmul_f32(v1870, v3190); - float32x2_t v2028 = vsub_f32(v2014, v2027); - float32x2_t v2041 = vsub_f32(v2040, v2036); - float32x2_t v2045 = vmul_f32(v2014, v3190); - float32x2_t v2172 = vsub_f32(v2158, v2171); - float32x2_t v2185 = vsub_f32(v2184, v2180); - float32x2_t v2189 = vmul_f32(v2158, v3190); - float32x2_t v2316 = vsub_f32(v2302, v2315); - float32x2_t v2329 = vsub_f32(v2328, v2324); - float32x2_t v2333 = vmul_f32(v2302, v3190); - float32x2_t v2460 = vsub_f32(v2446, v2459); - float32x2_t v2473 = vsub_f32(v2472, v2468); - float32x2_t v2477 = vmul_f32(v2446, v3190); - float32x2_t v2515 = vsub_f32(v2490, v2514); - float32x2_t v2519 = vmul_f32(v2490, v3190); - float32x2_t v2533 = vsub_f32(v2502, v2532); - float32x2_t v2537 = vmul_f32(v2502, v3190); - float32x2_t v2776 = vrev64_f32(v2036); - float32x2_t v2788 = vrev64_f32(v2180); - float32x2_t v2800 = vrev64_f32(v2468); - float32x2_t v2818 = vrev64_f32(v2324); - float32x2_t v1902 = vsub_f32(v1901, v1884); - float32x2_t v2046 = vsub_f32(v2045, v2028); - float32x2_t v2190 = vsub_f32(v2189, v2172); - float32x2_t v2334 = vsub_f32(v2333, v2316); - float32x2_t v2478 = vsub_f32(v2477, v2460); - float32x2_t v2520 = vsub_f32(v2519, v2515); - float32x2_t v2538 = vsub_f32(v2537, v2533); - float32x2_t v2549 = vmul_f32(v2533, v3139); - float32x2_t v2564 = vmul_f32(v2515, v3139); - float32x2_t v2632 = 
vrev64_f32(v2028); - float32x2_t v2644 = vrev64_f32(v2172); - float32x2_t v2656 = vrev64_f32(v2460); - float32x2_t v2674 = vrev64_f32(v2316); - float32x2_t v2777 = vmul_f32(v2776, v2775); - float32x2_t v2789 = vmul_f32(v2788, v3063); - float32x2_t v2801 = vmul_f32(v2800, v3075); - float32x2_t v2819 = vmul_f32(v2818, v2931); - float32x2_t v2920 = vrev64_f32(v2041); - float32x2_t v2932 = vrev64_f32(v2185); - float32x2_t v2944 = vrev64_f32(v2473); - float32x2_t v2962 = vrev64_f32(v2329); - float32x2_t v2539 = vadd_f32(v2520, v2538); - float32x2_t v2540 = vsub_f32(v2520, v2538); - float32x2_t v2550 = vadd_f32(v2515, v2549); - float32x2_t v2565 = vsub_f32(v2564, v2533); - float32x2_t v2633 = vmul_f32(v2632, v2631); - float32x2_t v2645 = vmul_f32(v2644, v2775); - float32x2_t v2657 = vmul_f32(v2656, v3063); - float32x2_t v2675 = vmul_f32(v2674, v2919); - float32x2_t v2778 = vfma_f32(v2777, v2036, v2769); - float32x2_t v2790 = vfma_f32(v2789, v2180, v3057); - float32x2_t v2802 = vfma_f32(v2801, v2468, v3069); - float32x2_t v2820 = vfma_f32(v2819, v2324, v2925); - float32x2_t v2921 = vmul_f32(v2920, v2919); - float32x2_t v2933 = vmul_f32(v2932, v2931); - float32x2_t v2945 = vmul_f32(v2944, v3105); - float32x2_t v2963 = vmul_f32(v2962, v2961); - float32x2_t v3064 = vrev64_f32(v2046); - float32x2_t v3076 = vrev64_f32(v2190); - float32x2_t v3088 = vrev64_f32(v2478); - float32x2_t v3106 = vrev64_f32(v2334); - float32x2_t v2544 = vmul_f32(v2539, v3119); - float32x2_t v2554 = vmul_f32(v2540, v3129); - float32x2_t v2566 = vadd_f32(v1876, v2539); - float32x2_t v2578 = vrev64_f32(v2550); - float32x2_t v2592 = vrev64_f32(v2565); - float32x2_t v2634 = vfma_f32(v2633, v2028, v2625); - float32x2_t v2646 = vfma_f32(v2645, v2172, v2769); - float32x2_t v2658 = vfma_f32(v2657, v2460, v3057); - float32x2_t v2676 = vfma_f32(v2675, v2316, v2913); - float32x2_t v2803 = vsub_f32(v2778, v2802); - float32x2_t v2807 = vmul_f32(v2778, v3190); - float32x2_t v2821 = vsub_f32(v2790, v2820); - 
float32x2_t v2825 = vmul_f32(v2790, v3190); - float32x2_t v2922 = vfma_f32(v2921, v2041, v2913); - float32x2_t v2934 = vfma_f32(v2933, v2185, v2925); - float32x2_t v2946 = vfma_f32(v2945, v2473, v3099); - float32x2_t v2964 = vfma_f32(v2963, v2329, v3081); - float32x2_t v3065 = vmul_f32(v3064, v3063); - float32x2_t v3077 = vmul_f32(v3076, v3075); - float32x2_t v3089 = vmul_f32(v3088, v3087); - float32x2_t v3107 = vmul_f32(v3106, v3105); - float32x2_t v2545 = vsub_f32(v1876, v2544); - int16x4_t v2569 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2566, 15), (int32x2_t){0, 0})); - float32x2_t v2579 = vmul_f32(v2578, v3167); - float32x2_t v2593 = vmul_f32(v2592, v3167); - float32x2_t v2659 = vsub_f32(v2634, v2658); - float32x2_t v2663 = vmul_f32(v2634, v3190); - float32x2_t v2677 = vsub_f32(v2646, v2676); - float32x2_t v2681 = vmul_f32(v2646, v3190); - float32x2_t v2808 = vsub_f32(v2807, v2803); - float32x2_t v2826 = vsub_f32(v2825, v2821); - float32x2_t v2837 = vmul_f32(v2821, v3139); - float32x2_t v2852 = vmul_f32(v2803, v3139); - float32x2_t v2947 = vsub_f32(v2922, v2946); - float32x2_t v2951 = vmul_f32(v2922, v3190); - float32x2_t v2965 = vsub_f32(v2934, v2964); - float32x2_t v2969 = vmul_f32(v2934, v3190); - float32x2_t v3066 = vfma_f32(v3065, v2046, v3057); - float32x2_t v3078 = vfma_f32(v3077, v2190, v3069); - float32x2_t v3090 = vfma_f32(v3089, v2478, v3081); - float32x2_t v3108 = vfma_f32(v3107, v2334, v3099); - float32x2_t v2555 = vsub_f32(v2545, v2554); - float32x2_t v2559 = vmul_f32(v2545, v3190); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v2569), 0); - float32x2_t v2664 = vsub_f32(v2663, v2659); - float32x2_t v2682 = vsub_f32(v2681, v2677); - float32x2_t v2693 = vmul_f32(v2677, v3139); - float32x2_t v2708 = vmul_f32(v2659, v3139); - float32x2_t v2827 = vadd_f32(v2808, v2826); - float32x2_t v2828 = vsub_f32(v2808, v2826); - float32x2_t v2838 = vadd_f32(v2803, v2837); - float32x2_t v2853 = vsub_f32(v2852, v2821); - float32x2_t v2952 = vsub_f32(v2951, v2947); 
- float32x2_t v2970 = vsub_f32(v2969, v2965); - float32x2_t v2981 = vmul_f32(v2965, v3139); - float32x2_t v2996 = vmul_f32(v2947, v3139); - float32x2_t v3091 = vsub_f32(v3066, v3090); - float32x2_t v3095 = vmul_f32(v3066, v3190); - float32x2_t v3109 = vsub_f32(v3078, v3108); - float32x2_t v3113 = vmul_f32(v3078, v3190); - float32x2_t v2560 = vsub_f32(v2559, v2555); - float32x2_t v2594 = vsub_f32(v2555, v2593); - float32x2_t v2604 = vmul_f32(v2555, v3190); - float32x2_t v2683 = vadd_f32(v2664, v2682); - float32x2_t v2684 = vsub_f32(v2664, v2682); - float32x2_t v2694 = vadd_f32(v2659, v2693); - float32x2_t v2709 = vsub_f32(v2708, v2677); - float32x2_t v2832 = vmul_f32(v2827, v3119); - float32x2_t v2842 = vmul_f32(v2828, v3129); - float32x2_t v2854 = vadd_f32(v1892, v2827); - float32x2_t v2866 = vrev64_f32(v2838); - float32x2_t v2880 = vrev64_f32(v2853); - float32x2_t v2971 = vadd_f32(v2952, v2970); - float32x2_t v2972 = vsub_f32(v2952, v2970); - float32x2_t v2982 = vadd_f32(v2947, v2981); - float32x2_t v2997 = vsub_f32(v2996, v2965); - float32x2_t v3096 = vsub_f32(v3095, v3091); - float32x2_t v3114 = vsub_f32(v3113, v3109); - float32x2_t v3125 = vmul_f32(v3109, v3139); - float32x2_t v3140 = vmul_f32(v3091, v3139); - float32x2_t v2580 = vsub_f32(v2560, v2579); - int16x4_t v2597 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2594, 15), (int32x2_t){0, 0})); - float32x2_t v2605 = vsub_f32(v2604, v2594); - float32x2_t v2615 = vmul_f32(v2560, v3190); - float32x2_t v2688 = vmul_f32(v2683, v3119); - float32x2_t v2698 = vmul_f32(v2684, v3129); - float32x2_t v2710 = vadd_f32(v1884, v2683); - float32x2_t v2722 = vrev64_f32(v2694); - float32x2_t v2736 = vrev64_f32(v2709); - float32x2_t v2833 = vsub_f32(v1892, v2832); - int16x4_t v2857 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2854, 15), (int32x2_t){0, 0})); - float32x2_t v2867 = vmul_f32(v2866, v3167); - float32x2_t v2881 = vmul_f32(v2880, v3167); - float32x2_t v2976 = vmul_f32(v2971, v3119); - float32x2_t v2986 = vmul_f32(v2972, 
v3129); - float32x2_t v2998 = vadd_f32(v1897, v2971); - float32x2_t v3010 = vrev64_f32(v2982); - float32x2_t v3024 = vrev64_f32(v2997); - float32x2_t v3115 = vadd_f32(v3096, v3114); - float32x2_t v3116 = vsub_f32(v3096, v3114); - float32x2_t v3126 = vadd_f32(v3091, v3125); - float32x2_t v3141 = vsub_f32(v3140, v3109); - int16x4_t v2583 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2580, 15), (int32x2_t){0, 0})); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v2597), 0); - int16x4_t v2608 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2605, 15), (int32x2_t){0, 0})); - float32x2_t v2616 = vsub_f32(v2615, v2580); - float32x2_t v2689 = vsub_f32(v1884, v2688); - int16x4_t v2713 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2710, 15), (int32x2_t){0, 0})); - float32x2_t v2723 = vmul_f32(v2722, v3167); - float32x2_t v2737 = vmul_f32(v2736, v3167); - float32x2_t v2843 = vsub_f32(v2833, v2842); - float32x2_t v2847 = vmul_f32(v2833, v3190); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v2857), 0); - float32x2_t v2977 = vsub_f32(v1897, v2976); - int16x4_t v3001 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2998, 15), (int32x2_t){0, 0})); - float32x2_t v3011 = vmul_f32(v3010, v3167); - float32x2_t v3025 = vmul_f32(v3024, v3167); - float32x2_t v3120 = vmul_f32(v3115, v3119); - float32x2_t v3130 = vmul_f32(v3116, v3129); - float32x2_t v3142 = vadd_f32(v1902, v3115); - float32x2_t v3154 = vrev64_f32(v3126); - float32x2_t v3168 = vrev64_f32(v3141); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v2583), 0); - v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v2608), 0); - int16x4_t v2619 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2616, 15), (int32x2_t){0, 0})); - float32x2_t v2699 = vsub_f32(v2689, v2698); - float32x2_t v2703 = vmul_f32(v2689, v3190); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v2713), 0); - float32x2_t v2848 = vsub_f32(v2847, v2843); - float32x2_t v2882 = vsub_f32(v2843, v2881); - float32x2_t v2892 = vmul_f32(v2843, v3190); - 
float32x2_t v2987 = vsub_f32(v2977, v2986); - float32x2_t v2991 = vmul_f32(v2977, v3190); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v3001), 0); - float32x2_t v3121 = vsub_f32(v1902, v3120); - int16x4_t v3145 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3142, 15), (int32x2_t){0, 0})); - float32x2_t v3155 = vmul_f32(v3154, v3167); - float32x2_t v3169 = vmul_f32(v3168, v3167); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v2619), 0); - float32x2_t v2704 = vsub_f32(v2703, v2699); - float32x2_t v2738 = vsub_f32(v2699, v2737); - float32x2_t v2748 = vmul_f32(v2699, v3190); - float32x2_t v2868 = vsub_f32(v2848, v2867); - int16x4_t v2885 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2882, 15), (int32x2_t){0, 0})); - float32x2_t v2893 = vsub_f32(v2892, v2882); - float32x2_t v2903 = vmul_f32(v2848, v3190); - float32x2_t v2992 = vsub_f32(v2991, v2987); - float32x2_t v3026 = vsub_f32(v2987, v3025); - float32x2_t v3036 = vmul_f32(v2987, v3190); - float32x2_t v3131 = vsub_f32(v3121, v3130); - float32x2_t v3135 = vmul_f32(v3121, v3190); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v3145), 0); - float32x2_t v2724 = vsub_f32(v2704, v2723); - int16x4_t v2741 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2738, 15), (int32x2_t){0, 0})); - float32x2_t v2749 = vsub_f32(v2748, v2738); - float32x2_t v2759 = vmul_f32(v2704, v3190); - int16x4_t v2871 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2868, 15), (int32x2_t){0, 0})); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v2885), 0); - int16x4_t v2896 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2893, 15), (int32x2_t){0, 0})); - float32x2_t v2904 = vsub_f32(v2903, v2868); - float32x2_t v3012 = vsub_f32(v2992, v3011); - int16x4_t v3029 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3026, 15), (int32x2_t){0, 0})); - float32x2_t v3037 = vsub_f32(v3036, v3026); - float32x2_t v3047 = vmul_f32(v2992, v3190); - float32x2_t v3136 = vsub_f32(v3135, v3131); - float32x2_t v3170 = vsub_f32(v3131, v3169); - float32x2_t 
v3180 = vmul_f32(v3131, v3190); - int16x4_t v2727 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2724, 15), (int32x2_t){0, 0})); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v2741), 0); - int16x4_t v2752 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2749, 15), (int32x2_t){0, 0})); - float32x2_t v2760 = vsub_f32(v2759, v2724); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v2871), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v2896), 0); - int16x4_t v2907 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2904, 15), (int32x2_t){0, 0})); - int16x4_t v3015 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3012, 15), (int32x2_t){0, 0})); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v3029), 0); - int16x4_t v3040 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3037, 15), (int32x2_t){0, 0})); - float32x2_t v3048 = vsub_f32(v3047, v3012); - float32x2_t v3156 = vsub_f32(v3136, v3155); - int16x4_t v3173 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3170, 15), (int32x2_t){0, 0})); - float32x2_t v3181 = vsub_f32(v3180, v3170); - float32x2_t v3191 = vmul_f32(v3136, v3190); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v2727), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v2752), 0); - int16x4_t v2763 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2760, 15), (int32x2_t){0, 0})); - v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v2907), 0); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v3015), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v3040), 0); - int16x4_t v3051 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3048, 15), (int32x2_t){0, 0})); - int16x4_t v3159 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3156, 15), (int32x2_t){0, 0})); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v3173), 0); - int16x4_t v3184 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3181, 15), (int32x2_t){0, 0})); - float32x2_t v3192 = vsub_f32(v3191, v3156); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v2763), 0); - 
v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v3051), 0); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v3159), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v3184), 0); - int16x4_t v3195 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v3192, 15), (int32x2_t){0, 0})); - v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v3195), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu25(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v1021 = 9.6858316112863108e-01F; - float v1026 = 2.4868988716485479e-01F; - float v1188 = 8.7630668004386358e-01F; - float v1193 = 4.8175367410171532e-01F; - float v1355 = 7.2896862742141155e-01F; - float v1360 = 6.8454710592868862e-01F; - float v1368 = 6.2790519529313527e-02F; - float v1373 = 9.9802672842827156e-01F; - float v1406 = 7.7051324277578925e-01F; - float v1522 = 5.3582679497899655e-01F; - float v1527 = 8.4432792550201508e-01F; - float v1535 = -4.2577929156507272e-01F; - float v1540 = 9.0482705246601947e-01F; - float v1548 = -6.3742398974868952e-01F; - float v1553 = -7.7051324277578936e-01F; - float v1568 = -9.9211470131447776e-01F; - float v1573 = 1.2533323356430454e-01F; - float v1590 = 2.5000000000000000e-01F; - float v1602 = 5.5901699437494745e-01F; - float v1614 = 6.1803398874989490e-01F; - float v1644 = 0.0000000000000000e+00F; - float v1645 = -9.5105651629515353e-01F; - float v1675 = 2.0000000000000000e+00F; - const int32_t *v1759 = &v5[v0]; - int32_t *v2095 = &v6[v2]; - int64_t v27 = v0 * 5; - int64_t v35 = v0 * 
10; - int64_t v43 = v0 * 15; - int64_t v51 = v0 * 20; - int64_t v194 = v0 * 6; - int64_t v202 = v0 * 11; - int64_t v210 = v0 * 16; - int64_t v218 = v0 * 21; - int64_t v353 = v0 * 2; - int64_t v361 = v0 * 7; - int64_t v369 = v0 * 12; - int64_t v377 = v0 * 17; - int64_t v385 = v0 * 22; - int64_t v520 = v0 * 3; - int64_t v528 = v0 * 8; - int64_t v536 = v0 * 13; - int64_t v544 = v0 * 18; - int64_t v552 = v0 * 23; - int64_t v687 = v0 * 4; - int64_t v695 = v0 * 9; - int64_t v703 = v0 * 14; - int64_t v711 = v0 * 19; - int64_t v719 = v0 * 24; - float v908 = v4 * v1644; - int64_t v969 = v2 * 5; - int64_t v985 = v2 * 10; - int64_t v999 = v2 * 15; - int64_t v1013 = v2 * 20; - float v1029 = v4 * v1026; - int64_t v1136 = v2 * 6; - int64_t v1152 = v2 * 11; - int64_t v1166 = v2 * 16; - int64_t v1180 = v2 * 21; - float v1196 = v4 * v1193; - int64_t v1287 = v2 * 2; - int64_t v1303 = v2 * 7; - int64_t v1319 = v2 * 12; - int64_t v1333 = v2 * 17; - int64_t v1347 = v2 * 22; - float v1363 = v4 * v1360; - float v1376 = v4 * v1373; - float v1409 = v4 * v1406; - int64_t v1454 = v2 * 3; - int64_t v1470 = v2 * 8; - int64_t v1486 = v2 * 13; - int64_t v1500 = v2 * 18; - int64_t v1514 = v2 * 23; - float v1530 = v4 * v1527; - float v1543 = v4 * v1540; - float v1556 = v4 * v1553; - float v1576 = v4 * v1573; - int64_t v1621 = v2 * 4; - int64_t v1637 = v2 * 9; - float v1648 = v4 * v1645; - int64_t v1653 = v2 * 14; - int64_t v1667 = v2 * 19; - int64_t v1681 = v2 * 24; - const int32_t *v1695 = &v5[0]; - int32_t *v2031 = &v6[0]; - svfloat32_t v2074 = svdup_n_f32(v1021); - svfloat32_t v2138 = svdup_n_f32(v1188); - svfloat32_t v2202 = svdup_n_f32(v1355); - svfloat32_t v2204 = svdup_n_f32(v1368); - svfloat32_t v2266 = svdup_n_f32(v1522); - svfloat32_t v2268 = svdup_n_f32(v1535); - svfloat32_t v2270 = svdup_n_f32(v1548); - svfloat32_t v2273 = svdup_n_f32(v1568); - svfloat32_t v2276 = svdup_n_f32(v1590); - svfloat32_t v2278 = svdup_n_f32(v1602); - svfloat32_t v2280 = svdup_n_f32(v1614); - svfloat32_t v2320 
= svdup_n_f32(v1675); - svfloat32_t v192 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1759[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v1704 = &v5[v27]; - const int32_t *v1713 = &v5[v35]; - const int32_t *v1722 = &v5[v43]; - const int32_t *v1731 = &v5[v51]; - const int32_t *v1768 = &v5[v194]; - const int32_t *v1777 = &v5[v202]; - const int32_t *v1786 = &v5[v210]; - const int32_t *v1795 = &v5[v218]; - const int32_t *v1823 = &v5[v353]; - const int32_t *v1832 = &v5[v361]; - const int32_t *v1841 = &v5[v369]; - const int32_t *v1850 = &v5[v377]; - const int32_t *v1859 = &v5[v385]; - const int32_t *v1887 = &v5[v520]; - const int32_t *v1896 = &v5[v528]; - const int32_t *v1905 = &v5[v536]; - const int32_t *v1914 = &v5[v544]; - const int32_t *v1923 = &v5[v552]; - const int32_t *v1951 = &v5[v687]; - const int32_t *v1960 = &v5[v695]; - const int32_t *v1969 = &v5[v703]; - const int32_t *v1978 = &v5[v711]; - const int32_t *v1987 = &v5[v719]; - svfloat32_t v2017 = svdup_n_f32(v908); - int32_t *v2041 = &v6[v969]; - int32_t *v2051 = &v6[v985]; - int32_t *v2061 = &v6[v999]; - int32_t *v2071 = &v6[v1013]; - svfloat32_t v2075 = svdup_n_f32(v1029); - int32_t *v2105 = &v6[v1136]; - int32_t *v2115 = &v6[v1152]; - int32_t *v2125 = &v6[v1166]; - int32_t *v2135 = &v6[v1180]; - svfloat32_t v2139 = svdup_n_f32(v1196); - int32_t *v2159 = &v6[v1287]; - int32_t *v2169 = &v6[v1303]; - int32_t *v2179 = &v6[v1319]; - int32_t *v2189 = &v6[v1333]; - int32_t *v2199 = &v6[v1347]; - svfloat32_t v2203 = svdup_n_f32(v1363); - svfloat32_t v2205 = svdup_n_f32(v1376); - svfloat32_t v2210 = svdup_n_f32(v1409); - int32_t *v2223 = &v6[v1454]; - int32_t *v2233 = &v6[v1470]; - int32_t *v2243 = &v6[v1486]; - int32_t *v2253 = &v6[v1500]; - int32_t *v2263 = &v6[v1514]; - svfloat32_t v2267 = svdup_n_f32(v1530); - svfloat32_t v2269 = svdup_n_f32(v1543); - svfloat32_t v2271 = svdup_n_f32(v1556); - svfloat32_t v2274 = svdup_n_f32(v1576); - int32_t *v2287 = 
&v6[v1621]; - int32_t *v2297 = &v6[v1637]; - svfloat32_t v2300 = svdup_n_f32(v1648); - int32_t *v2307 = &v6[v1653]; - int32_t *v2317 = &v6[v1667]; - int32_t *v2327 = &v6[v1681]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1695[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1704[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v41 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1713[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v49 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1722[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v57 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1731[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v200 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1768[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v208 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1777[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v216 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1786[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v224 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1795[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v359 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1823[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v367 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1832[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v375 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, 
- svld1sh_s32(pred_full, (const int16_t *)&v1841[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v383 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1850[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v391 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1859[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v526 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1887[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v534 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1896[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v542 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1905[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v550 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1914[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v558 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1923[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v693 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1951[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v701 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1960[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v709 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1969[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v717 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1978[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v725 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1987[0])), - 1.F / (1ULL << 15ULL)); - 
svfloat32_t v70 = svcmla_f32_x(pred_full, v33, v2017, v33, 90); - svfloat32_t v83 = svcmla_f32_x(pred_full, v41, v2017, v41, 90); - svfloat32_t v96 = svcmla_f32_x(pred_full, v57, v2017, v57, 90); - svfloat32_t v116 = svcmla_f32_x(pred_full, v49, v2017, v49, 90); - svfloat32_t v237 = svcmla_f32_x(pred_full, v200, v2017, v200, 90); - svfloat32_t v250 = svcmla_f32_x(pred_full, v208, v2017, v208, 90); - svfloat32_t v263 = svcmla_f32_x(pred_full, v224, v2017, v224, 90); - svfloat32_t v283 = svcmla_f32_x(pred_full, v216, v2017, v216, 90); - svfloat32_t v404 = svcmla_f32_x(pred_full, v367, v2017, v367, 90); - svfloat32_t v417 = svcmla_f32_x(pred_full, v375, v2017, v375, 90); - svfloat32_t v430 = svcmla_f32_x(pred_full, v391, v2017, v391, 90); - svfloat32_t v450 = svcmla_f32_x(pred_full, v383, v2017, v383, 90); - svfloat32_t v571 = svcmla_f32_x(pred_full, v534, v2017, v534, 90); - svfloat32_t v584 = svcmla_f32_x(pred_full, v542, v2017, v542, 90); - svfloat32_t v597 = svcmla_f32_x(pred_full, v558, v2017, v558, 90); - svfloat32_t v617 = svcmla_f32_x(pred_full, v550, v2017, v550, 90); - svfloat32_t v738 = svcmla_f32_x(pred_full, v701, v2017, v701, 90); - svfloat32_t v751 = svcmla_f32_x(pred_full, v709, v2017, v709, 90); - svfloat32_t v764 = svcmla_f32_x(pred_full, v725, v2017, v725, 90); - svfloat32_t v784 = svcmla_f32_x(pred_full, v717, v2017, v717, 90); - svfloat32_t v97; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v97) : "w"(v70), "w"(v96)); - svfloat32_t v117; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v117) : "w"(v83), "w"(v116)); - svfloat32_t v264; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v237), "w"(v263)); - svfloat32_t v284; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v284) : "w"(v250), "w"(v283)); - svfloat32_t v431; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v431) : "w"(v404), "w"(v430)); - svfloat32_t v451; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v451) : "w"(v417), "w"(v450)); - svfloat32_t v598; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v598) : "w"(v571), "w"(v597)); - svfloat32_t v618; - 
asm("fsub %0.s, %1.s, %2.s" : "=w"(v618) : "w"(v584), "w"(v617)); - svfloat32_t v765; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v765) : "w"(v738), "w"(v764)); - svfloat32_t v785; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v785) : "w"(v751), "w"(v784)); - svfloat32_t v103 = svnmls_f32_x(pred_full, v97, v70, v2320); - svfloat32_t v123 = svnmls_f32_x(pred_full, v117, v83, v2320); - svfloat32_t v270 = svnmls_f32_x(pred_full, v264, v237, v2320); - svfloat32_t v290 = svnmls_f32_x(pred_full, v284, v250, v2320); - svfloat32_t v437 = svnmls_f32_x(pred_full, v431, v404, v2320); - svfloat32_t v457 = svnmls_f32_x(pred_full, v451, v417, v2320); - svfloat32_t v604 = svnmls_f32_x(pred_full, v598, v571, v2320); - svfloat32_t v624 = svnmls_f32_x(pred_full, v618, v584, v2320); - svfloat32_t v771 = svnmls_f32_x(pred_full, v765, v738, v2320); - svfloat32_t v791 = svnmls_f32_x(pred_full, v785, v751, v2320); - svfloat32_t v124; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v124) : "w"(v103), "w"(v123)); - svfloat32_t v125; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v125) : "w"(v103), "w"(v123)); - svfloat32_t v137 = svmla_f32_x(pred_full, v97, v117, v2280); - svfloat32_t v155 = svnmls_f32_x(pred_full, v117, v97, v2280); - svfloat32_t v291; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v291) : "w"(v270), "w"(v290)); - svfloat32_t v292; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v292) : "w"(v270), "w"(v290)); - svfloat32_t v304 = svmla_f32_x(pred_full, v264, v284, v2280); - svfloat32_t v322 = svnmls_f32_x(pred_full, v284, v264, v2280); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v437), "w"(v457)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v437), "w"(v457)); - svfloat32_t v471 = svmla_f32_x(pred_full, v431, v451, v2280); - svfloat32_t v489 = svnmls_f32_x(pred_full, v451, v431, v2280); - svfloat32_t v625; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v625) : "w"(v604), "w"(v624)); - svfloat32_t v626; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v626) : "w"(v604), "w"(v624)); - svfloat32_t v638 = 
svmla_f32_x(pred_full, v598, v618, v2280); - svfloat32_t v656 = svnmls_f32_x(pred_full, v618, v598, v2280); - svfloat32_t v792; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v792) : "w"(v771), "w"(v791)); - svfloat32_t v793; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v793) : "w"(v771), "w"(v791)); - svfloat32_t v805 = svmla_f32_x(pred_full, v765, v785, v2280); - svfloat32_t v823 = svnmls_f32_x(pred_full, v785, v765, v2280); - svfloat32_t v156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v156) : "w"(v25), "w"(v124)); - svfloat32_t zero163; - asm volatile("mov %0.s, #0" : "=w"(zero163)); - svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v2300, v137, 90); - svfloat32_t zero171; - asm volatile("mov %0.s, #0" : "=w"(zero171)); - svfloat32_t v171 = svcmla_f32_x(pred_full, zero171, v2300, v155, 90); - svfloat32_t v323; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v323) : "w"(v192), "w"(v291)); - svfloat32_t zero330; - asm volatile("mov %0.s, #0" : "=w"(zero330)); - svfloat32_t v330 = svcmla_f32_x(pred_full, zero330, v2300, v304, 90); - svfloat32_t zero338; - asm volatile("mov %0.s, #0" : "=w"(zero338)); - svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v2300, v322, 90); - svfloat32_t v490; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v490) : "w"(v359), "w"(v458)); - svfloat32_t zero497; - asm volatile("mov %0.s, #0" : "=w"(zero497)); - svfloat32_t v497 = svcmla_f32_x(pred_full, zero497, v2300, v471, 90); - svfloat32_t zero505; - asm volatile("mov %0.s, #0" : "=w"(zero505)); - svfloat32_t v505 = svcmla_f32_x(pred_full, zero505, v2300, v489, 90); - svfloat32_t v657; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v657) : "w"(v526), "w"(v625)); - svfloat32_t zero664; - asm volatile("mov %0.s, #0" : "=w"(zero664)); - svfloat32_t v664 = svcmla_f32_x(pred_full, zero664, v2300, v638, 90); - svfloat32_t zero672; - asm volatile("mov %0.s, #0" : "=w"(zero672)); - svfloat32_t v672 = svcmla_f32_x(pred_full, zero672, v2300, v656, 90); - svfloat32_t v824; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v824) : "w"(v693), "w"(v792)); - 
svfloat32_t zero831; - asm volatile("mov %0.s, #0" : "=w"(zero831)); - svfloat32_t v831 = svcmla_f32_x(pred_full, zero831, v2300, v805, 90); - svfloat32_t zero839; - asm volatile("mov %0.s, #0" : "=w"(zero839)); - svfloat32_t v839 = svcmla_f32_x(pred_full, zero839, v2300, v823, 90); - svfloat32_t v131 = svmls_f32_x(pred_full, v25, v124, v2276); - svfloat32_t v298 = svmls_f32_x(pred_full, v192, v291, v2276); - svfloat32_t v465 = svmls_f32_x(pred_full, v359, v458, v2276); - svfloat32_t v632 = svmls_f32_x(pred_full, v526, v625, v2276); - svfloat32_t v799 = svmls_f32_x(pred_full, v693, v792, v2276); - svfloat32_t v143 = svmls_f32_x(pred_full, v131, v125, v2278); - svfloat32_t v310 = svmls_f32_x(pred_full, v298, v292, v2278); - svfloat32_t v477 = svmls_f32_x(pred_full, v465, v459, v2278); - svfloat32_t v644 = svmls_f32_x(pred_full, v632, v626, v2278); - svfloat32_t v811 = svmls_f32_x(pred_full, v799, v793, v2278); - svfloat32_t v865 = svcmla_f32_x(pred_full, v323, v2017, v323, 90); - svfloat32_t v878 = svcmla_f32_x(pred_full, v490, v2017, v490, 90); - svfloat32_t v891 = svcmla_f32_x(pred_full, v824, v2017, v824, 90); - svfloat32_t v911 = svcmla_f32_x(pred_full, v657, v2017, v657, 90); - svfloat32_t v149 = svnmls_f32_x(pred_full, v143, v131, v2320); - svfloat32_t v172; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v172) : "w"(v143), "w"(v171)); - svfloat32_t v316 = svnmls_f32_x(pred_full, v310, v298, v2320); - svfloat32_t v339; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v339) : "w"(v310), "w"(v338)); - svfloat32_t v483 = svnmls_f32_x(pred_full, v477, v465, v2320); - svfloat32_t v506; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v506) : "w"(v477), "w"(v505)); - svfloat32_t v650 = svnmls_f32_x(pred_full, v644, v632, v2320); - svfloat32_t v673; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v673) : "w"(v644), "w"(v672)); - svfloat32_t v817 = svnmls_f32_x(pred_full, v811, v799, v2320); - svfloat32_t v840; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v840) : "w"(v811), "w"(v839)); - svfloat32_t v892; - asm("fsub 
%0.s, %1.s, %2.s" : "=w"(v892) : "w"(v865), "w"(v891)); - svfloat32_t v912; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v912) : "w"(v878), "w"(v911)); - svfloat32_t v164; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v164) : "w"(v149), "w"(v163)); - svfloat32_t v178 = svnmls_f32_x(pred_full, v172, v143, v2320); - svfloat32_t v331; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v331) : "w"(v316), "w"(v330)); - svfloat32_t v345 = svnmls_f32_x(pred_full, v339, v310, v2320); - svfloat32_t v498; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v498) : "w"(v483), "w"(v497)); - svfloat32_t v512 = svnmls_f32_x(pred_full, v506, v477, v2320); - svfloat32_t v665; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v665) : "w"(v650), "w"(v664)); - svfloat32_t v679 = svnmls_f32_x(pred_full, v673, v644, v2320); - svfloat32_t v832; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v832) : "w"(v817), "w"(v831)); - svfloat32_t v846 = svnmls_f32_x(pred_full, v840, v811, v2320); - svfloat32_t v898 = svnmls_f32_x(pred_full, v892, v865, v2320); - svfloat32_t v918 = svnmls_f32_x(pred_full, v912, v878, v2320); - svfloat32_t v1191; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1191) : "w"(v339), "w"(v2138)); - svfloat32_t v1204; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1204) : "w"(v506), "w"(v2266)); - svfloat32_t v1217; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1217) : "w"(v840), "w"(v2268)); - svfloat32_t v1237; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1237) : "w"(v673), "w"(v2204)); - svfloat32_t v184 = svnmls_f32_x(pred_full, v164, v149, v2320); - svfloat32_t v351 = svnmls_f32_x(pred_full, v331, v316, v2320); - svfloat32_t v518 = svnmls_f32_x(pred_full, v498, v483, v2320); - svfloat32_t v685 = svnmls_f32_x(pred_full, v665, v650, v2320); - svfloat32_t v852 = svnmls_f32_x(pred_full, v832, v817, v2320); - svfloat32_t v919; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v919) : "w"(v898), "w"(v918)); - svfloat32_t v920; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v920) : "w"(v898), "w"(v918)); - svfloat32_t v932 = svmla_f32_x(pred_full, v892, v912, v2280); - svfloat32_t v950 = 
svnmls_f32_x(pred_full, v912, v892, v2280); - svfloat32_t v1024; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1024) : "w"(v331), "w"(v2074)); - svfloat32_t v1037; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1037) : "w"(v498), "w"(v2138)); - svfloat32_t v1050; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1050) : "w"(v832), "w"(v2266)); - svfloat32_t v1070; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1070) : "w"(v665), "w"(v2202)); - svfloat32_t v1199 = svcmla_f32_x(pred_full, v1191, v2139, v339, 90); - svfloat32_t v1212 = svcmla_f32_x(pred_full, v1204, v2267, v506, 90); - svfloat32_t v1225 = svcmla_f32_x(pred_full, v1217, v2269, v840, 90); - svfloat32_t v1245 = svcmla_f32_x(pred_full, v1237, v2205, v673, 90); - svfloat32_t v1358; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1358) : "w"(v345), "w"(v2202)); - svfloat32_t v1371; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1371) : "w"(v512), "w"(v2204)); - svfloat32_t v1384; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1384) : "w"(v846), "w"(v2273)); - svfloat32_t v1404; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1404) : "w"(v679), "w"(v2270)); - svfloat32_t v951; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v951) : "w"(v156), "w"(v919)); - svfloat32_t zero966; - asm volatile("mov %0.s, #0" : "=w"(zero966)); - svfloat32_t v966 = svcmla_f32_x(pred_full, zero966, v2300, v932, 90); - svfloat32_t zero982; - asm volatile("mov %0.s, #0" : "=w"(zero982)); - svfloat32_t v982 = svcmla_f32_x(pred_full, zero982, v2300, v950, 90); - svfloat32_t v1032 = svcmla_f32_x(pred_full, v1024, v2075, v331, 90); - svfloat32_t v1045 = svcmla_f32_x(pred_full, v1037, v2139, v498, 90); - svfloat32_t v1058 = svcmla_f32_x(pred_full, v1050, v2267, v832, 90); - svfloat32_t v1078 = svcmla_f32_x(pred_full, v1070, v2203, v665, 90); - svfloat32_t v1226; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1226) : "w"(v1199), "w"(v1225)); - svfloat32_t v1246; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1246) : "w"(v1212), "w"(v1245)); - svfloat32_t v1366 = svcmla_f32_x(pred_full, v1358, v2203, v345, 90); - svfloat32_t v1379 = 
svcmla_f32_x(pred_full, v1371, v2205, v512, 90); - svfloat32_t v1392 = svcmla_f32_x(pred_full, v1384, v2274, v846, 90); - svfloat32_t v1412 = svcmla_f32_x(pred_full, v1404, v2210, v679, 90); - svfloat32_t v1525; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1525) : "w"(v351), "w"(v2266)); - svfloat32_t v1538; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1538) : "w"(v518), "w"(v2268)); - svfloat32_t v1551; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1551) : "w"(v852), "w"(v2270)); - svfloat32_t v1571; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1571) : "w"(v685), "w"(v2273)); - svfloat32_t v926 = svmls_f32_x(pred_full, v156, v919, v2276); - svint16_t v954 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v951, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1059; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1059) : "w"(v1032), "w"(v1058)); - svfloat32_t v1079; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1079) : "w"(v1045), "w"(v1078)); - svfloat32_t v1232 = svnmls_f32_x(pred_full, v1226, v1199, v2320); - svfloat32_t v1252 = svnmls_f32_x(pred_full, v1246, v1212, v2320); - svfloat32_t v1393; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1393) : "w"(v1366), "w"(v1392)); - svfloat32_t v1413; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1413) : "w"(v1379), "w"(v1412)); - svfloat32_t v1533 = svcmla_f32_x(pred_full, v1525, v2267, v351, 90); - svfloat32_t v1546 = svcmla_f32_x(pred_full, v1538, v2269, v518, 90); - svfloat32_t v1559 = svcmla_f32_x(pred_full, v1551, v2271, v852, 90); - svfloat32_t v1579 = svcmla_f32_x(pred_full, v1571, v2274, v685, 90); - svfloat32_t v938 = svmls_f32_x(pred_full, v926, v920, v2278); - svfloat32_t v1065 = svnmls_f32_x(pred_full, v1059, v1032, v2320); - svfloat32_t v1085 = svnmls_f32_x(pred_full, v1079, v1045, v2320); - svfloat32_t v1253; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1253) : "w"(v1232), "w"(v1252)); - svfloat32_t v1254; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1254) : 
"w"(v1232), "w"(v1252)); - svfloat32_t v1266 = svmla_f32_x(pred_full, v1226, v1246, v2280); - svfloat32_t v1284 = svnmls_f32_x(pred_full, v1246, v1226, v2280); - svfloat32_t v1399 = svnmls_f32_x(pred_full, v1393, v1366, v2320); - svfloat32_t v1419 = svnmls_f32_x(pred_full, v1413, v1379, v2320); - svfloat32_t v1560; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1560) : "w"(v1533), "w"(v1559)); - svfloat32_t v1580; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1580) : "w"(v1546), "w"(v1579)); - svst1w_u64(pred_full, (unsigned *)(v2031), svreinterpret_u64_s16(v954)); - svfloat32_t v944 = svnmls_f32_x(pred_full, v938, v926, v2320); - svfloat32_t v983; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v983) : "w"(v938), "w"(v982)); - svfloat32_t v1086; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1086) : "w"(v1065), "w"(v1085)); - svfloat32_t v1087; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1087) : "w"(v1065), "w"(v1085)); - svfloat32_t v1099 = svmla_f32_x(pred_full, v1059, v1079, v2280); - svfloat32_t v1117 = svnmls_f32_x(pred_full, v1079, v1059, v2280); - svfloat32_t v1285; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1285) : "w"(v172), "w"(v1253)); - svfloat32_t zero1300; - asm volatile("mov %0.s, #0" : "=w"(zero1300)); - svfloat32_t v1300 = svcmla_f32_x(pred_full, zero1300, v2300, v1266, 90); - svfloat32_t zero1316; - asm volatile("mov %0.s, #0" : "=w"(zero1316)); - svfloat32_t v1316 = svcmla_f32_x(pred_full, zero1316, v2300, v1284, 90); - svfloat32_t v1420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1420) : "w"(v1399), "w"(v1419)); - svfloat32_t v1421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1421) : "w"(v1399), "w"(v1419)); - svfloat32_t v1433 = svmla_f32_x(pred_full, v1393, v1413, v2280); - svfloat32_t v1451 = svnmls_f32_x(pred_full, v1413, v1393, v2280); - svfloat32_t v1566 = svnmls_f32_x(pred_full, v1560, v1533, v2320); - svfloat32_t v1586 = svnmls_f32_x(pred_full, v1580, v1546, v2320); - svfloat32_t v967; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v967) : "w"(v944), "w"(v966)); - svint16_t v986 = svtbl_s16( - 
svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v983, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v997 = svnmls_f32_x(pred_full, v983, v938, v2320); - svfloat32_t v1118; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1118) : "w"(v164), "w"(v1086)); - svfloat32_t zero1133; - asm volatile("mov %0.s, #0" : "=w"(zero1133)); - svfloat32_t v1133 = svcmla_f32_x(pred_full, zero1133, v2300, v1099, 90); - svfloat32_t zero1149; - asm volatile("mov %0.s, #0" : "=w"(zero1149)); - svfloat32_t v1149 = svcmla_f32_x(pred_full, zero1149, v2300, v1117, 90); - svfloat32_t v1260 = svmls_f32_x(pred_full, v172, v1253, v2276); - svint16_t v1288 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1285, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1452; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1452) : "w"(v178), "w"(v1420)); - svfloat32_t zero1467; - asm volatile("mov %0.s, #0" : "=w"(zero1467)); - svfloat32_t v1467 = svcmla_f32_x(pred_full, zero1467, v2300, v1433, 90); - svfloat32_t zero1483; - asm volatile("mov %0.s, #0" : "=w"(zero1483)); - svfloat32_t v1483 = svcmla_f32_x(pred_full, zero1483, v2300, v1451, 90); - svfloat32_t v1587; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1587) : "w"(v1566), "w"(v1586)); - svfloat32_t v1588; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1588) : "w"(v1566), "w"(v1586)); - svfloat32_t v1600 = svmla_f32_x(pred_full, v1560, v1580, v2280); - svfloat32_t v1618 = svnmls_f32_x(pred_full, v1580, v1560, v2280); - svint16_t v970 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v967, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v1000 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v997, 
(float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1011 = svnmls_f32_x(pred_full, v967, v944, v2320); - svfloat32_t v1093 = svmls_f32_x(pred_full, v164, v1086, v2276); - svint16_t v1121 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1118, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1272 = svmls_f32_x(pred_full, v1260, v1254, v2278); - svfloat32_t v1427 = svmls_f32_x(pred_full, v178, v1420, v2276); - svint16_t v1455 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1452, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1619; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1619) : "w"(v184), "w"(v1587)); - svfloat32_t zero1634; - asm volatile("mov %0.s, #0" : "=w"(zero1634)); - svfloat32_t v1634 = svcmla_f32_x(pred_full, zero1634, v2300, v1600, 90); - svfloat32_t zero1650; - asm volatile("mov %0.s, #0" : "=w"(zero1650)); - svfloat32_t v1650 = svcmla_f32_x(pred_full, zero1650, v2300, v1618, 90); - svst1w_u64(pred_full, (unsigned *)(v2051), svreinterpret_u64_s16(v986)); - svst1w_u64(pred_full, (unsigned *)(v2159), svreinterpret_u64_s16(v1288)); - svint16_t v1014 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1011, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1105 = svmls_f32_x(pred_full, v1093, v1087, v2278); - svfloat32_t v1278 = svnmls_f32_x(pred_full, v1272, v1260, v2320); - svfloat32_t v1317; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1317) : "w"(v1272), "w"(v1316)); - svfloat32_t v1439 = svmls_f32_x(pred_full, v1427, v1421, v2278); - svfloat32_t v1594 = svmls_f32_x(pred_full, v184, v1587, v2276); - svint16_t v1622 = - 
svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1619, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v2041), svreinterpret_u64_s16(v970)); - svst1w_u64(pred_full, (unsigned *)(v2061), svreinterpret_u64_s16(v1000)); - svst1w_u64(pred_full, (unsigned *)(v2095), svreinterpret_u64_s16(v1121)); - svst1w_u64(pred_full, (unsigned *)(v2223), svreinterpret_u64_s16(v1455)); - svfloat32_t v1111 = svnmls_f32_x(pred_full, v1105, v1093, v2320); - svfloat32_t v1150; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1150) : "w"(v1105), "w"(v1149)); - svfloat32_t v1301; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1301) : "w"(v1278), "w"(v1300)); - svint16_t v1320 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1317, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1331 = svnmls_f32_x(pred_full, v1317, v1272, v2320); - svfloat32_t v1445 = svnmls_f32_x(pred_full, v1439, v1427, v2320); - svfloat32_t v1484; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1484) : "w"(v1439), "w"(v1483)); - svfloat32_t v1606 = svmls_f32_x(pred_full, v1594, v1588, v2278); - svst1w_u64(pred_full, (unsigned *)(v2071), svreinterpret_u64_s16(v1014)); - svst1w_u64(pred_full, (unsigned *)(v2287), svreinterpret_u64_s16(v1622)); - svfloat32_t v1134; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1134) : "w"(v1111), "w"(v1133)); - svint16_t v1153 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1150, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1164 = svnmls_f32_x(pred_full, v1150, v1105, v2320); - svint16_t v1304 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1301, (float)(1ULL << 31ULL)))), - 
svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1334 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1331, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1345 = svnmls_f32_x(pred_full, v1301, v1278, v2320); - svfloat32_t v1468; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1468) : "w"(v1445), "w"(v1467)); - svint16_t v1487 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1484, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1498 = svnmls_f32_x(pred_full, v1484, v1439, v2320); - svfloat32_t v1612 = svnmls_f32_x(pred_full, v1606, v1594, v2320); - svfloat32_t v1651; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1651) : "w"(v1606), "w"(v1650)); - svst1w_u64(pred_full, (unsigned *)(v2179), svreinterpret_u64_s16(v1320)); - svint16_t v1137 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1134, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1167 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1164, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1178 = svnmls_f32_x(pred_full, v1134, v1111, v2320); - svint16_t v1348 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1345, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1471 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1468, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 
0x0000000000040004ULL))); - svint16_t v1501 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1498, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1512 = svnmls_f32_x(pred_full, v1468, v1445, v2320); - svfloat32_t v1635; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1635) : "w"(v1612), "w"(v1634)); - svint16_t v1654 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1651, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1665 = svnmls_f32_x(pred_full, v1651, v1606, v2320); - svst1w_u64(pred_full, (unsigned *)(v2115), svreinterpret_u64_s16(v1153)); - svst1w_u64(pred_full, (unsigned *)(v2169), svreinterpret_u64_s16(v1304)); - svst1w_u64(pred_full, (unsigned *)(v2189), svreinterpret_u64_s16(v1334)); - svst1w_u64(pred_full, (unsigned *)(v2243), svreinterpret_u64_s16(v1487)); - svint16_t v1181 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1178, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1515 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1512, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1638 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1635, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1668 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1665, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1679 = svnmls_f32_x(pred_full, 
v1635, v1612, v2320); - svst1w_u64(pred_full, (unsigned *)(v2105), svreinterpret_u64_s16(v1137)); - svst1w_u64(pred_full, (unsigned *)(v2125), svreinterpret_u64_s16(v1167)); - svst1w_u64(pred_full, (unsigned *)(v2199), svreinterpret_u64_s16(v1348)); - svst1w_u64(pred_full, (unsigned *)(v2233), svreinterpret_u64_s16(v1471)); - svst1w_u64(pred_full, (unsigned *)(v2253), svreinterpret_u64_s16(v1501)); - svst1w_u64(pred_full, (unsigned *)(v2307), svreinterpret_u64_s16(v1654)); - svint16_t v1682 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1679, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v2135), svreinterpret_u64_s16(v1181)); - svst1w_u64(pred_full, (unsigned *)(v2263), svreinterpret_u64_s16(v1515)); - svst1w_u64(pred_full, (unsigned *)(v2297), svreinterpret_u64_s16(v1638)); - svst1w_u64(pred_full, (unsigned *)(v2317), svreinterpret_u64_s16(v1668)); - svst1w_u64(pred_full, (unsigned *)(v2327), svreinterpret_u64_s16(v1682)); - v5 += v11; - v6 += v12; - } -} -#endif - -#ifndef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v12 = howmany - 1; - int64_t v1241 = howmany / 2; - for (int j = 0; j < v12; j += 2) { - float v946 = 7.0710678118654757e-01F; - float v959 = -7.0710678118654746e-01F; - float v1020 = 5.5557023301960229e-01F; - float v1037 = -1.9509032201612861e-01F; - float v1099 = 9.2387953251128674e-01F; - float v1107 = -9.2387953251128685e-01F; - float v1111 = 3.8268343236508967e-01F; - float v1112 = -3.8268343236508967e-01F; - float v1168 = 1.9509032201612833e-01F; - float v1172 = -9.8078528040323043e-01F; - float v1173 = 9.8078528040323043e-01F; - float v1181 = 
-5.5557023301960218e-01F; - float v1185 = 8.3146961230254524e-01F; - float v1186 = -8.3146961230254524e-01F; - float v1197 = -1.0000000000000000e+00F; - float v1198 = 1.0000000000000000e+00F; - float32x2_t v1200 = (float32x2_t){v4, v4}; - const int32_t *v2421 = &v5[istride]; - int32_t *v2602 = &v6[ostride]; - float32x2_t v725 = (float32x2_t){v1173, v1173}; - float32x2_t v799 = (float32x2_t){v1099, v1099}; - float32x2_t v804 = (float32x2_t){v1112, v1111}; - float32x2_t v873 = (float32x2_t){v1185, v1185}; - float32x2_t v878 = (float32x2_t){v1181, v1020}; - float32x2_t v886 = (float32x2_t){v1037, v1037}; - float32x2_t v947 = (float32x2_t){v946, v946}; - float32x2_t v960 = (float32x2_t){v959, v959}; - float32x2_t v965 = (float32x2_t){v1198, v1197}; - float32x2_t v1021 = (float32x2_t){v1020, v1020}; - float32x2_t v1026 = (float32x2_t){v1186, v1185}; - float32x2_t v1034 = (float32x2_t){v1172, v1172}; - float32x2_t v1039 = (float32x2_t){v1037, v1168}; - float32x2_t v1095 = (float32x2_t){v1111, v1111}; - float32x2_t v1100 = (float32x2_t){v1107, v1099}; - float32x2_t v1108 = (float32x2_t){v1107, v1107}; - float32x2_t v1113 = (float32x2_t){v1111, v1112}; - float32x2_t v1169 = (float32x2_t){v1168, v1168}; - float32x2_t v1174 = (float32x2_t){v1172, v1173}; - float32x2_t v1182 = (float32x2_t){v1181, v1181}; - float32x2_t v1187 = (float32x2_t){v1185, v1186}; - float32x2_t v1199 = (float32x2_t){v1197, v1198}; - const int32_t *v2277 = &v5[0]; - int32_t *v2566 = &v6[0]; - int16x4_t v2881 = vld1_s16((const int16_t *)v2421); - float32x4_t v404 = vcvtq_n_f32_s32(vmovl_s16(v2881), 15); - float32x4_t v726 = vcombine_f32(v725, v725); - float32x4_t v800 = vcombine_f32(v799, v799); - float32x2_t v806 = vmul_f32(v1200, v804); - float32x4_t v874 = vcombine_f32(v873, v873); - float32x2_t v880 = vmul_f32(v1200, v878); - float32x4_t v887 = vcombine_f32(v886, v886); - float32x4_t v948 = vcombine_f32(v947, v947); - float32x4_t v961 = vcombine_f32(v960, v960); - float32x2_t v967 = vmul_f32(v1200, 
v965); - float32x4_t v1022 = vcombine_f32(v1021, v1021); - float32x2_t v1028 = vmul_f32(v1200, v1026); - float32x4_t v1035 = vcombine_f32(v1034, v1034); - float32x2_t v1041 = vmul_f32(v1200, v1039); - float32x4_t v1096 = vcombine_f32(v1095, v1095); - float32x2_t v1102 = vmul_f32(v1200, v1100); - float32x4_t v1109 = vcombine_f32(v1108, v1108); - float32x2_t v1115 = vmul_f32(v1200, v1113); - float32x4_t v1170 = vcombine_f32(v1169, v1169); - float32x2_t v1176 = vmul_f32(v1200, v1174); - float32x4_t v1183 = vcombine_f32(v1182, v1182); - float32x2_t v1189 = vmul_f32(v1200, v1187); - float32x2_t v1201 = vmul_f32(v1200, v1199); - const int32_t *v2286 = &v5[istride * 16]; - const int32_t *v2295 = &v5[istride * 8]; - const int32_t *v2304 = &v5[istride * 24]; - const int32_t *v2313 = &v5[istride * 4]; - const int32_t *v2322 = &v5[istride * 20]; - const int32_t *v2331 = &v5[istride * 12]; - const int32_t *v2340 = &v5[istride * 28]; - const int32_t *v2349 = &v5[istride * 2]; - const int32_t *v2358 = &v5[istride * 18]; - const int32_t *v2367 = &v5[istride * 10]; - const int32_t *v2376 = &v5[istride * 26]; - const int32_t *v2385 = &v5[istride * 6]; - const int32_t *v2394 = &v5[istride * 22]; - const int32_t *v2403 = &v5[istride * 14]; - const int32_t *v2412 = &v5[istride * 30]; - const int32_t *v2430 = &v5[istride * 17]; - const int32_t *v2439 = &v5[istride * 9]; - const int32_t *v2448 = &v5[istride * 25]; - const int32_t *v2457 = &v5[istride * 5]; - const int32_t *v2466 = &v5[istride * 21]; - const int32_t *v2475 = &v5[istride * 13]; - const int32_t *v2484 = &v5[istride * 29]; - const int32_t *v2493 = &v5[istride * 3]; - const int32_t *v2502 = &v5[istride * 19]; - const int32_t *v2511 = &v5[istride * 11]; - const int32_t *v2520 = &v5[istride * 27]; - const int32_t *v2529 = &v5[istride * 7]; - const int32_t *v2538 = &v5[istride * 23]; - const int32_t *v2547 = &v5[istride * 15]; - const int32_t *v2556 = &v5[istride * 31]; - int32_t *v2575 = &v6[ostride * 8]; - int32_t *v2584 = 
&v6[ostride * 16]; - int32_t *v2593 = &v6[ostride * 24]; - int32_t *v2611 = &v6[ostride * 9]; - int32_t *v2620 = &v6[ostride * 17]; - int32_t *v2629 = &v6[ostride * 25]; - int32_t *v2638 = &v6[ostride * 2]; - int32_t *v2647 = &v6[ostride * 10]; - int32_t *v2656 = &v6[ostride * 18]; - int32_t *v2665 = &v6[ostride * 26]; - int32_t *v2674 = &v6[ostride * 3]; - int32_t *v2683 = &v6[ostride * 11]; - int32_t *v2692 = &v6[ostride * 19]; - int32_t *v2701 = &v6[ostride * 27]; - int32_t *v2710 = &v6[ostride * 4]; - int32_t *v2719 = &v6[ostride * 12]; - int32_t *v2728 = &v6[ostride * 20]; - int32_t *v2737 = &v6[ostride * 28]; - int32_t *v2746 = &v6[ostride * 5]; - int32_t *v2755 = &v6[ostride * 13]; - int32_t *v2764 = &v6[ostride * 21]; - int32_t *v2773 = &v6[ostride * 29]; - int32_t *v2782 = &v6[ostride * 6]; - int32_t *v2791 = &v6[ostride * 14]; - int32_t *v2800 = &v6[ostride * 22]; - int32_t *v2809 = &v6[ostride * 30]; - int32_t *v2818 = &v6[ostride * 7]; - int32_t *v2827 = &v6[ostride * 15]; - int32_t *v2836 = &v6[ostride * 23]; - int32_t *v2845 = &v6[ostride * 31]; - int16x4_t v2849 = vld1_s16((const int16_t *)v2277); - float32x4_t v28 = vcvtq_n_f32_s32(vmovl_s16(v2849), 15); - float32x4_t v808 = vcombine_f32(v806, v806); - float32x4_t v882 = vcombine_f32(v880, v880); - float32x4_t v969 = vcombine_f32(v967, v967); - float32x4_t v1030 = vcombine_f32(v1028, v1028); - float32x4_t v1043 = vcombine_f32(v1041, v1041); - float32x4_t v1104 = vcombine_f32(v1102, v1102); - float32x4_t v1117 = vcombine_f32(v1115, v1115); - float32x4_t v1178 = vcombine_f32(v1176, v1176); - float32x4_t v1191 = vcombine_f32(v1189, v1189); - float32x4_t v1203 = vcombine_f32(v1201, v1201); - int16x4_t v2851 = vld1_s16((const int16_t *)v2286); - int16x4_t v2853 = vld1_s16((const int16_t *)v2295); - int16x4_t v2855 = vld1_s16((const int16_t *)v2304); - int16x4_t v2857 = vld1_s16((const int16_t *)v2313); - int16x4_t v2859 = vld1_s16((const int16_t *)v2322); - int16x4_t v2861 = vld1_s16((const int16_t 
*)v2331); - int16x4_t v2863 = vld1_s16((const int16_t *)v2340); - int16x4_t v2865 = vld1_s16((const int16_t *)v2349); - int16x4_t v2867 = vld1_s16((const int16_t *)v2358); - int16x4_t v2869 = vld1_s16((const int16_t *)v2367); - int16x4_t v2871 = vld1_s16((const int16_t *)v2376); - int16x4_t v2873 = vld1_s16((const int16_t *)v2385); - int16x4_t v2875 = vld1_s16((const int16_t *)v2394); - int16x4_t v2877 = vld1_s16((const int16_t *)v2403); - int16x4_t v2879 = vld1_s16((const int16_t *)v2412); - int16x4_t v2883 = vld1_s16((const int16_t *)v2430); - int16x4_t v2885 = vld1_s16((const int16_t *)v2439); - int16x4_t v2887 = vld1_s16((const int16_t *)v2448); - int16x4_t v2889 = vld1_s16((const int16_t *)v2457); - int16x4_t v2891 = vld1_s16((const int16_t *)v2466); - int16x4_t v2893 = vld1_s16((const int16_t *)v2475); - int16x4_t v2895 = vld1_s16((const int16_t *)v2484); - int16x4_t v2897 = vld1_s16((const int16_t *)v2493); - int16x4_t v2899 = vld1_s16((const int16_t *)v2502); - int16x4_t v2901 = vld1_s16((const int16_t *)v2511); - int16x4_t v2903 = vld1_s16((const int16_t *)v2520); - int16x4_t v2905 = vld1_s16((const int16_t *)v2529); - int16x4_t v2907 = vld1_s16((const int16_t *)v2538); - int16x4_t v2909 = vld1_s16((const int16_t *)v2547); - int16x4_t v2911 = vld1_s16((const int16_t *)v2556); - float32x4_t v36 = vcvtq_n_f32_s32(vmovl_s16(v2851), 15); - float32x4_t v46 = vcvtq_n_f32_s32(vmovl_s16(v2853), 15); - float32x4_t v54 = vcvtq_n_f32_s32(vmovl_s16(v2855), 15); - float32x4_t v76 = vcvtq_n_f32_s32(vmovl_s16(v2857), 15); - float32x4_t v84 = vcvtq_n_f32_s32(vmovl_s16(v2859), 15); - float32x4_t v94 = vcvtq_n_f32_s32(vmovl_s16(v2861), 15); - float32x4_t v102 = vcvtq_n_f32_s32(vmovl_s16(v2863), 15); - float32x4_t v168 = vcvtq_n_f32_s32(vmovl_s16(v2865), 15); - float32x4_t v176 = vcvtq_n_f32_s32(vmovl_s16(v2867), 15); - float32x4_t v186 = vcvtq_n_f32_s32(vmovl_s16(v2869), 15); - float32x4_t v194 = vcvtq_n_f32_s32(vmovl_s16(v2871), 15); - float32x4_t v216 = 
vcvtq_n_f32_s32(vmovl_s16(v2873), 15); - float32x4_t v224 = vcvtq_n_f32_s32(vmovl_s16(v2875), 15); - float32x4_t v234 = vcvtq_n_f32_s32(vmovl_s16(v2877), 15); - float32x4_t v242 = vcvtq_n_f32_s32(vmovl_s16(v2879), 15); - float32x4_t v412 = vcvtq_n_f32_s32(vmovl_s16(v2883), 15); - float32x4_t v422 = vcvtq_n_f32_s32(vmovl_s16(v2885), 15); - float32x4_t v430 = vcvtq_n_f32_s32(vmovl_s16(v2887), 15); - float32x4_t v452 = vcvtq_n_f32_s32(vmovl_s16(v2889), 15); - float32x4_t v460 = vcvtq_n_f32_s32(vmovl_s16(v2891), 15); - float32x4_t v470 = vcvtq_n_f32_s32(vmovl_s16(v2893), 15); - float32x4_t v478 = vcvtq_n_f32_s32(vmovl_s16(v2895), 15); - float32x4_t v544 = vcvtq_n_f32_s32(vmovl_s16(v2897), 15); - float32x4_t v552 = vcvtq_n_f32_s32(vmovl_s16(v2899), 15); - float32x4_t v562 = vcvtq_n_f32_s32(vmovl_s16(v2901), 15); - float32x4_t v570 = vcvtq_n_f32_s32(vmovl_s16(v2903), 15); - float32x4_t v592 = vcvtq_n_f32_s32(vmovl_s16(v2905), 15); - float32x4_t v600 = vcvtq_n_f32_s32(vmovl_s16(v2907), 15); - float32x4_t v610 = vcvtq_n_f32_s32(vmovl_s16(v2909), 15); - float32x4_t v618 = vcvtq_n_f32_s32(vmovl_s16(v2911), 15); - float32x4_t v37 = vaddq_f32(v28, v36); - float32x4_t v38 = vsubq_f32(v28, v36); - float32x4_t v55 = vaddq_f32(v46, v54); - float32x4_t v56 = vsubq_f32(v46, v54); - float32x4_t v85 = vaddq_f32(v76, v84); - float32x4_t v86 = vsubq_f32(v76, v84); - float32x4_t v103 = vaddq_f32(v94, v102); - float32x4_t v104 = vsubq_f32(v94, v102); - float32x4_t v177 = vaddq_f32(v168, v176); - float32x4_t v178 = vsubq_f32(v168, v176); - float32x4_t v195 = vaddq_f32(v186, v194); - float32x4_t v196 = vsubq_f32(v186, v194); - float32x4_t v225 = vaddq_f32(v216, v224); - float32x4_t v226 = vsubq_f32(v216, v224); - float32x4_t v243 = vaddq_f32(v234, v242); - float32x4_t v244 = vsubq_f32(v234, v242); - float32x4_t v413 = vaddq_f32(v404, v412); - float32x4_t v414 = vsubq_f32(v404, v412); - float32x4_t v431 = vaddq_f32(v422, v430); - float32x4_t v432 = vsubq_f32(v422, v430); - float32x4_t v461 = 
vaddq_f32(v452, v460); - float32x4_t v462 = vsubq_f32(v452, v460); - float32x4_t v479 = vaddq_f32(v470, v478); - float32x4_t v480 = vsubq_f32(v470, v478); - float32x4_t v553 = vaddq_f32(v544, v552); - float32x4_t v554 = vsubq_f32(v544, v552); - float32x4_t v571 = vaddq_f32(v562, v570); - float32x4_t v572 = vsubq_f32(v562, v570); - float32x4_t v601 = vaddq_f32(v592, v600); - float32x4_t v602 = vsubq_f32(v592, v600); - float32x4_t v619 = vaddq_f32(v610, v618); - float32x4_t v620 = vsubq_f32(v610, v618); - float32x4_t v62 = vrev64q_f32(v56); - float32x4_t v65 = vaddq_f32(v37, v55); - float32x4_t v66 = vsubq_f32(v37, v55); - float32x4_t v105 = vaddq_f32(v85, v103); - float32x4_t v106 = vsubq_f32(v85, v103); - float32x4_t v123 = vmulq_f32(v86, v948); - float32x4_t v136 = vmulq_f32(v104, v961); - float32x4_t v202 = vrev64q_f32(v196); - float32x4_t v205 = vaddq_f32(v177, v195); - float32x4_t v206 = vsubq_f32(v177, v195); - float32x4_t v250 = vrev64q_f32(v244); - float32x4_t v253 = vaddq_f32(v225, v243); - float32x4_t v254 = vsubq_f32(v225, v243); - float32x4_t v438 = vrev64q_f32(v432); - float32x4_t v441 = vaddq_f32(v413, v431); - float32x4_t v442 = vsubq_f32(v413, v431); - float32x4_t v481 = vaddq_f32(v461, v479); - float32x4_t v482 = vsubq_f32(v461, v479); - float32x4_t v499 = vmulq_f32(v462, v948); - float32x4_t v512 = vmulq_f32(v480, v961); - float32x4_t v578 = vrev64q_f32(v572); - float32x4_t v581 = vaddq_f32(v553, v571); - float32x4_t v582 = vsubq_f32(v553, v571); - float32x4_t v621 = vaddq_f32(v601, v619); - float32x4_t v622 = vsubq_f32(v601, v619); - float32x4_t v639 = vmulq_f32(v602, v948); - float32x4_t v652 = vmulq_f32(v620, v961); - float32x4_t v64 = vmulq_f32(v62, v969); - float32x4_t v112 = vrev64q_f32(v106); - float32x4_t v115 = vaddq_f32(v65, v105); - float32x4_t v116 = vsubq_f32(v65, v105); - float32x4_t v129 = vrev64q_f32(v123); - float32x4_t v142 = vrev64q_f32(v136); - float32x4_t v204 = vmulq_f32(v202, v969); - float32x4_t v252 = vmulq_f32(v250, v969); 
- float32x4_t v257 = vaddq_f32(v205, v253); - float32x4_t v258 = vsubq_f32(v205, v253); - float32x4_t v317 = vmulq_f32(v206, v948); - float32x4_t v330 = vmulq_f32(v254, v961); - float32x4_t v440 = vmulq_f32(v438, v969); - float32x4_t v488 = vrev64q_f32(v482); - float32x4_t v491 = vaddq_f32(v441, v481); - float32x4_t v492 = vsubq_f32(v441, v481); - float32x4_t v505 = vrev64q_f32(v499); - float32x4_t v518 = vrev64q_f32(v512); - float32x4_t v580 = vmulq_f32(v578, v969); - float32x4_t v628 = vrev64q_f32(v622); - float32x4_t v631 = vaddq_f32(v581, v621); - float32x4_t v632 = vsubq_f32(v581, v621); - float32x4_t v645 = vrev64q_f32(v639); - float32x4_t v658 = vrev64q_f32(v652); - float32x4_t v67 = vsubq_f32(v38, v64); - float32x4_t v68 = vaddq_f32(v38, v64); - float32x4_t v114 = vmulq_f32(v112, v969); - float32x4_t v131 = vmulq_f32(v129, v1203); - float32x4_t v144 = vmulq_f32(v142, v969); - float32x4_t v207 = vsubq_f32(v178, v204); - float32x4_t v208 = vaddq_f32(v178, v204); - float32x4_t v255 = vsubq_f32(v226, v252); - float32x4_t v256 = vaddq_f32(v226, v252); - float32x4_t v264 = vrev64q_f32(v258); - float32x4_t v267 = vaddq_f32(v115, v257); - float32x4_t v268 = vsubq_f32(v115, v257); - float32x4_t v323 = vrev64q_f32(v317); - float32x4_t v336 = vrev64q_f32(v330); - float32x4_t v443 = vsubq_f32(v414, v440); - float32x4_t v444 = vaddq_f32(v414, v440); - float32x4_t v490 = vmulq_f32(v488, v969); - float32x4_t v507 = vmulq_f32(v505, v1203); - float32x4_t v520 = vmulq_f32(v518, v969); - float32x4_t v583 = vsubq_f32(v554, v580); - float32x4_t v584 = vaddq_f32(v554, v580); - float32x4_t v630 = vmulq_f32(v628, v969); - float32x4_t v647 = vmulq_f32(v645, v1203); - float32x4_t v660 = vmulq_f32(v658, v969); - float32x4_t v677 = vaddq_f32(v491, v631); - float32x4_t v678 = vsubq_f32(v491, v631); - float32x4_t v949 = vmulq_f32(v492, v948); - float32x4_t v962 = vmulq_f32(v632, v961); - float32x4_t v117 = vsubq_f32(v66, v114); - float32x4_t v118 = vaddq_f32(v66, v114); - float32x4_t 
v145 = vaddq_f32(v123, v131); - float32x4_t v146 = vaddq_f32(v136, v144); - float32x4_t v266 = vmulq_f32(v264, v969); - float32x4_t v275 = vmulq_f32(v207, v800); - float32x4_t v281 = vrev64q_f32(v207); - float32x4_t v288 = vmulq_f32(v255, v1096); - float32x4_t v294 = vrev64q_f32(v255); - float32x4_t v325 = vmulq_f32(v323, v1203); - float32x4_t v338 = vmulq_f32(v336, v969); - float32x4_t v359 = vmulq_f32(v208, v1096); - float32x4_t v365 = vrev64q_f32(v208); - float32x4_t v372 = vmulq_f32(v256, v1109); - float32x4_t v378 = vrev64q_f32(v256); - float32x4_t v493 = vsubq_f32(v442, v490); - float32x4_t v494 = vaddq_f32(v442, v490); - float32x4_t v521 = vaddq_f32(v499, v507); - float32x4_t v522 = vaddq_f32(v512, v520); - float32x4_t v633 = vsubq_f32(v582, v630); - float32x4_t v634 = vaddq_f32(v582, v630); - float32x4_t v661 = vaddq_f32(v639, v647); - float32x4_t v662 = vaddq_f32(v652, v660); - float32x4_t v684 = vrev64q_f32(v678); - float32x4_t v687 = vaddq_f32(v267, v677); - float32x4_t v688 = vsubq_f32(v267, v677); - float32x4_t v955 = vrev64q_f32(v949); - float32x4_t v968 = vrev64q_f32(v962); - float32x4_t v147 = vaddq_f32(v145, v146); - float32x4_t v148 = vsubq_f32(v146, v145); - float32x4_t v269 = vsubq_f32(v116, v266); - float32x4_t v270 = vaddq_f32(v116, v266); - float32x4_t v339 = vaddq_f32(v317, v325); - float32x4_t v340 = vaddq_f32(v330, v338); - float32x4_t v523 = vaddq_f32(v521, v522); - float32x4_t v524 = vsubq_f32(v522, v521); - float32x4_t v663 = vaddq_f32(v661, v662); - float32x4_t v664 = vsubq_f32(v662, v661); - float32x4_t v686 = vmulq_f32(v684, v969); - int16x4_t v693 = vqmovn_s32(vcvtq_n_s32_f32(v687, 15)); - int16x4_t v709 = vqmovn_s32(vcvtq_n_s32_f32(v688, 15)); - float32x4_t v801 = vmulq_f32(v493, v800); - float32x4_t v807 = vrev64q_f32(v493); - float32x4_t v814 = vmulq_f32(v633, v1096); - float32x4_t v820 = vrev64q_f32(v633); - float32x4_t v957 = vmulq_f32(v955, v1203); - float32x4_t v970 = vmulq_f32(v968, v969); - float32x4_t v1097 = 
vmulq_f32(v494, v1096); - float32x4_t v1103 = vrev64q_f32(v494); - float32x4_t v1110 = vmulq_f32(v634, v1109); - float32x4_t v1116 = vrev64q_f32(v634); - float32x4_t v154 = vrev64q_f32(v148); - float32x4_t v157 = vaddq_f32(v67, v147); - float32x4_t v158 = vsubq_f32(v67, v147); - float32x4_t v297 = vfmaq_f32(v275, v281, v808); - float32x4_t v298 = vfmaq_f32(v288, v294, v1104); - float32x4_t v341 = vaddq_f32(v339, v340); - float32x4_t v342 = vsubq_f32(v340, v339); - float32x4_t v381 = vfmaq_f32(v359, v365, v1104); - float32x4_t v382 = vfmaq_f32(v372, v378, v1117); - float32x4_t v530 = vrev64q_f32(v524); - float32x4_t v533 = vaddq_f32(v443, v523); - float32x4_t v534 = vsubq_f32(v443, v523); - float32x4_t v670 = vrev64q_f32(v664); - float32x4_t v673 = vaddq_f32(v583, v663); - float32x4_t v674 = vsubq_f32(v583, v663); - float32x4_t v689 = vsubq_f32(v268, v686); - float32x4_t v690 = vaddq_f32(v268, v686); - float32x4_t v971 = vaddq_f32(v949, v957); - float32x4_t v972 = vaddq_f32(v962, v970); - vst1_s16((int16_t *)v2566, v693); - vst1_s16((int16_t *)v2584, v709); - float32x4_t v156 = vmulq_f32(v154, v1203); - float32x4_t v299 = vaddq_f32(v297, v298); - float32x4_t v300 = vsubq_f32(v298, v297); - float32x4_t v348 = vrev64q_f32(v342); - float32x4_t v351 = vaddq_f32(v117, v341); - float32x4_t v352 = vsubq_f32(v117, v341); - float32x4_t v383 = vaddq_f32(v381, v382); - float32x4_t v384 = vsubq_f32(v382, v381); - float32x4_t v532 = vmulq_f32(v530, v1203); - float32x4_t v672 = vmulq_f32(v670, v1203); - int16x4_t v701 = vqmovn_s32(vcvtq_n_s32_f32(v689, 15)); - int16x4_t v717 = vqmovn_s32(vcvtq_n_s32_f32(v690, 15)); - float32x4_t v727 = vmulq_f32(v533, v726); - float32x4_t v733 = vrev64q_f32(v533); - float32x4_t v740 = vmulq_f32(v673, v874); - float32x4_t v746 = vrev64q_f32(v673); - float32x4_t v823 = vfmaq_f32(v801, v807, v808); - float32x4_t v824 = vfmaq_f32(v814, v820, v1104); - float32x4_t v973 = vaddq_f32(v971, v972); - float32x4_t v974 = vsubq_f32(v972, v971); - float32x4_t 
v1023 = vmulq_f32(v534, v1022); - float32x4_t v1029 = vrev64q_f32(v534); - float32x4_t v1036 = vmulq_f32(v674, v1035); - float32x4_t v1042 = vrev64q_f32(v674); - float32x4_t v1119 = vfmaq_f32(v1097, v1103, v1104); - float32x4_t v1120 = vfmaq_f32(v1110, v1116, v1117); - float32x4_t v159 = vsubq_f32(v68, v156); - float32x4_t v160 = vaddq_f32(v68, v156); - float32x4_t v306 = vrev64q_f32(v300); - float32x4_t v309 = vaddq_f32(v157, v299); - float32x4_t v310 = vsubq_f32(v157, v299); - float32x4_t v350 = vmulq_f32(v348, v1203); - float32x4_t v390 = vrev64q_f32(v384); - float32x4_t v535 = vsubq_f32(v444, v532); - float32x4_t v536 = vaddq_f32(v444, v532); - float32x4_t v675 = vsubq_f32(v584, v672); - float32x4_t v676 = vaddq_f32(v584, v672); - float32x4_t v825 = vaddq_f32(v823, v824); - float32x4_t v826 = vsubq_f32(v824, v823); - float32x4_t v980 = vrev64q_f32(v974); - float32x4_t v983 = vaddq_f32(v269, v973); - float32x4_t v984 = vsubq_f32(v269, v973); - float32x4_t v1121 = vaddq_f32(v1119, v1120); - float32x4_t v1122 = vsubq_f32(v1120, v1119); - vst1_s16((int16_t *)v2575, v701); - vst1_s16((int16_t *)v2593, v717); - float32x4_t v308 = vmulq_f32(v306, v1203); - float32x4_t v353 = vsubq_f32(v118, v350); - float32x4_t v354 = vaddq_f32(v118, v350); - float32x4_t v392 = vmulq_f32(v390, v1203); - float32x4_t v393 = vaddq_f32(v159, v383); - float32x4_t v394 = vsubq_f32(v159, v383); - float32x4_t v749 = vfmaq_f32(v727, v733, v1043); - float32x4_t v750 = vfmaq_f32(v740, v746, v882); - float32x4_t v832 = vrev64q_f32(v826); - float32x4_t v835 = vaddq_f32(v351, v825); - float32x4_t v836 = vsubq_f32(v351, v825); - float32x4_t v875 = vmulq_f32(v535, v874); - float32x4_t v881 = vrev64q_f32(v535); - float32x4_t v888 = vmulq_f32(v675, v887); - float32x4_t v894 = vrev64q_f32(v675); - float32x4_t v982 = vmulq_f32(v980, v1203); - int16x4_t v989 = vqmovn_s32(vcvtq_n_s32_f32(v983, 15)); - int16x4_t v1005 = vqmovn_s32(vcvtq_n_s32_f32(v984, 15)); - float32x4_t v1045 = vfmaq_f32(v1023, v1029, 
v1030); - float32x4_t v1046 = vfmaq_f32(v1036, v1042, v1043); - float32x4_t v1128 = vrev64q_f32(v1122); - float32x4_t v1171 = vmulq_f32(v536, v1170); - float32x4_t v1177 = vrev64q_f32(v536); - float32x4_t v1184 = vmulq_f32(v676, v1183); - float32x4_t v1190 = vrev64q_f32(v676); - float32x4_t v311 = vsubq_f32(v158, v308); - float32x4_t v312 = vaddq_f32(v158, v308); - float32x4_t v395 = vsubq_f32(v160, v392); - float32x4_t v396 = vaddq_f32(v160, v392); - float32x4_t v751 = vaddq_f32(v749, v750); - float32x4_t v752 = vsubq_f32(v750, v749); - float32x4_t v834 = vmulq_f32(v832, v1203); - int16x4_t v841 = vqmovn_s32(vcvtq_n_s32_f32(v835, 15)); - int16x4_t v857 = vqmovn_s32(vcvtq_n_s32_f32(v836, 15)); - float32x4_t v985 = vsubq_f32(v270, v982); - float32x4_t v986 = vaddq_f32(v270, v982); - float32x4_t v1047 = vaddq_f32(v1045, v1046); - float32x4_t v1048 = vsubq_f32(v1046, v1045); - float32x4_t v1130 = vmulq_f32(v1128, v1203); - float32x4_t v1131 = vaddq_f32(v353, v1121); - float32x4_t v1132 = vsubq_f32(v353, v1121); - vst1_s16((int16_t *)v2710, v989); - vst1_s16((int16_t *)v2728, v1005); - float32x4_t v758 = vrev64q_f32(v752); - float32x4_t v761 = vaddq_f32(v309, v751); - float32x4_t v762 = vsubq_f32(v309, v751); - float32x4_t v837 = vsubq_f32(v352, v834); - float32x4_t v838 = vaddq_f32(v352, v834); - float32x4_t v897 = vfmaq_f32(v875, v881, v882); - float32x4_t v898 = vfmaq_f32(v888, v894, v1178); - int16x4_t v997 = vqmovn_s32(vcvtq_n_s32_f32(v985, 15)); - int16x4_t v1013 = vqmovn_s32(vcvtq_n_s32_f32(v986, 15)); - float32x4_t v1054 = vrev64q_f32(v1048); - float32x4_t v1057 = vaddq_f32(v311, v1047); - float32x4_t v1058 = vsubq_f32(v311, v1047); - float32x4_t v1133 = vsubq_f32(v354, v1130); - float32x4_t v1134 = vaddq_f32(v354, v1130); - int16x4_t v1137 = vqmovn_s32(vcvtq_n_s32_f32(v1131, 15)); - int16x4_t v1153 = vqmovn_s32(vcvtq_n_s32_f32(v1132, 15)); - float32x4_t v1193 = vfmaq_f32(v1171, v1177, v1178); - float32x4_t v1194 = vfmaq_f32(v1184, v1190, v1191); - 
vst1_s16((int16_t *)v2638, v841); - vst1_s16((int16_t *)v2656, v857); - float32x4_t v760 = vmulq_f32(v758, v1203); - int16x4_t v767 = vqmovn_s32(vcvtq_n_s32_f32(v761, 15)); - int16x4_t v783 = vqmovn_s32(vcvtq_n_s32_f32(v762, 15)); - int16x4_t v849 = vqmovn_s32(vcvtq_n_s32_f32(v837, 15)); - int16x4_t v865 = vqmovn_s32(vcvtq_n_s32_f32(v838, 15)); - float32x4_t v899 = vaddq_f32(v897, v898); - float32x4_t v900 = vsubq_f32(v898, v897); - float32x4_t v1056 = vmulq_f32(v1054, v1203); - int16x4_t v1063 = vqmovn_s32(vcvtq_n_s32_f32(v1057, 15)); - int16x4_t v1079 = vqmovn_s32(vcvtq_n_s32_f32(v1058, 15)); - int16x4_t v1145 = vqmovn_s32(vcvtq_n_s32_f32(v1133, 15)); - int16x4_t v1161 = vqmovn_s32(vcvtq_n_s32_f32(v1134, 15)); - float32x4_t v1195 = vaddq_f32(v1193, v1194); - float32x4_t v1196 = vsubq_f32(v1194, v1193); - vst1_s16((int16_t *)v2719, v997); - vst1_s16((int16_t *)v2737, v1013); - vst1_s16((int16_t *)v2782, v1137); - vst1_s16((int16_t *)v2800, v1153); - float32x4_t v763 = vsubq_f32(v310, v760); - float32x4_t v764 = vaddq_f32(v310, v760); - float32x4_t v906 = vrev64q_f32(v900); - float32x4_t v909 = vaddq_f32(v393, v899); - float32x4_t v910 = vsubq_f32(v393, v899); - float32x4_t v1059 = vsubq_f32(v312, v1056); - float32x4_t v1060 = vaddq_f32(v312, v1056); - float32x4_t v1202 = vrev64q_f32(v1196); - float32x4_t v1205 = vaddq_f32(v395, v1195); - float32x4_t v1206 = vsubq_f32(v395, v1195); - vst1_s16((int16_t *)v2602, v767); - vst1_s16((int16_t *)v2620, v783); - vst1_s16((int16_t *)v2647, v849); - vst1_s16((int16_t *)v2665, v865); - vst1_s16((int16_t *)v2746, v1063); - vst1_s16((int16_t *)v2764, v1079); - vst1_s16((int16_t *)v2791, v1145); - vst1_s16((int16_t *)v2809, v1161); - int16x4_t v775 = vqmovn_s32(vcvtq_n_s32_f32(v763, 15)); - int16x4_t v791 = vqmovn_s32(vcvtq_n_s32_f32(v764, 15)); - float32x4_t v908 = vmulq_f32(v906, v1203); - int16x4_t v915 = vqmovn_s32(vcvtq_n_s32_f32(v909, 15)); - int16x4_t v931 = vqmovn_s32(vcvtq_n_s32_f32(v910, 15)); - int16x4_t v1071 = 
vqmovn_s32(vcvtq_n_s32_f32(v1059, 15)); - int16x4_t v1087 = vqmovn_s32(vcvtq_n_s32_f32(v1060, 15)); - float32x4_t v1204 = vmulq_f32(v1202, v1203); - int16x4_t v1211 = vqmovn_s32(vcvtq_n_s32_f32(v1205, 15)); - int16x4_t v1227 = vqmovn_s32(vcvtq_n_s32_f32(v1206, 15)); - float32x4_t v911 = vsubq_f32(v394, v908); - float32x4_t v912 = vaddq_f32(v394, v908); - float32x4_t v1207 = vsubq_f32(v396, v1204); - float32x4_t v1208 = vaddq_f32(v396, v1204); - vst1_s16((int16_t *)v2611, v775); - vst1_s16((int16_t *)v2629, v791); - vst1_s16((int16_t *)v2674, v915); - vst1_s16((int16_t *)v2692, v931); - vst1_s16((int16_t *)v2755, v1071); - vst1_s16((int16_t *)v2773, v1087); - vst1_s16((int16_t *)v2818, v1211); - vst1_s16((int16_t *)v2836, v1227); - int16x4_t v923 = vqmovn_s32(vcvtq_n_s32_f32(v911, 15)); - int16x4_t v939 = vqmovn_s32(vcvtq_n_s32_f32(v912, 15)); - int16x4_t v1219 = vqmovn_s32(vcvtq_n_s32_f32(v1207, 15)); - int16x4_t v1235 = vqmovn_s32(vcvtq_n_s32_f32(v1208, 15)); - vst1_s16((int16_t *)v2683, v923); - vst1_s16((int16_t *)v2701, v939); - vst1_s16((int16_t *)v2827, v1219); - vst1_s16((int16_t *)v2845, v1235); - v5 += 2 * 1; - v6 += 2 * 1; - } - for (int j = v1241 * 2; j < howmany; j += 1) { - int16x4_t v1572 = vld1s_s16(&v5[istride]); - float v2023 = 7.0710678118654757e-01F; - float v2034 = -7.0710678118654746e-01F; - float v2084 = 5.5557023301960229e-01F; - float v2098 = -1.9509032201612861e-01F; - float v2149 = 9.2387953251128674e-01F; - float v2156 = -9.2387953251128685e-01F; - float v2159 = 3.8268343236508967e-01F; - float v2160 = -3.8268343236508967e-01F; - float v2206 = 1.9509032201612833e-01F; - float v2209 = -9.8078528040323043e-01F; - float v2210 = 9.8078528040323043e-01F; - float v2217 = -5.5557023301960218e-01F; - float v2220 = 8.3146961230254524e-01F; - float v2221 = -8.3146961230254524e-01F; - float v2231 = -1.0000000000000000e+00F; - float v2232 = 1.0000000000000000e+00F; - float32x2_t v2234 = (float32x2_t){v4, v4}; - int16x4_t v1253 = vld1s_s16(&v5[0]); - 
float32x2_t v1573 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1572)), 15); - float32x2_t v1841 = (float32x2_t){v2210, v2210}; - float32x2_t v1902 = (float32x2_t){v2149, v2149}; - float32x2_t v1906 = (float32x2_t){v2160, v2159}; - float32x2_t v1963 = (float32x2_t){v2220, v2220}; - float32x2_t v1967 = (float32x2_t){v2217, v2084}; - float32x2_t v1974 = (float32x2_t){v2098, v2098}; - float32x2_t v2024 = (float32x2_t){v2023, v2023}; - float32x2_t v2035 = (float32x2_t){v2034, v2034}; - float32x2_t v2039 = (float32x2_t){v2232, v2231}; - float32x2_t v2085 = (float32x2_t){v2084, v2084}; - float32x2_t v2089 = (float32x2_t){v2221, v2220}; - float32x2_t v2096 = (float32x2_t){v2209, v2209}; - float32x2_t v2100 = (float32x2_t){v2098, v2206}; - float32x2_t v2146 = (float32x2_t){v2159, v2159}; - float32x2_t v2150 = (float32x2_t){v2156, v2149}; - float32x2_t v2157 = (float32x2_t){v2156, v2156}; - float32x2_t v2161 = (float32x2_t){v2159, v2160}; - float32x2_t v2207 = (float32x2_t){v2206, v2206}; - float32x2_t v2211 = (float32x2_t){v2209, v2210}; - float32x2_t v2218 = (float32x2_t){v2217, v2217}; - float32x2_t v2222 = (float32x2_t){v2220, v2221}; - float32x2_t v2233 = (float32x2_t){v2231, v2232}; - float32x2_t v1254 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1253)), 15); - int16x4_t v1259 = vld1s_s16(&v5[istride * 16]); - int16x4_t v1267 = vld1s_s16(&v5[istride * 8]); - int16x4_t v1273 = vld1s_s16(&v5[istride * 24]); - int16x4_t v1292 = vld1s_s16(&v5[istride * 4]); - int16x4_t v1298 = vld1s_s16(&v5[istride * 20]); - int16x4_t v1306 = vld1s_s16(&v5[istride * 12]); - int16x4_t v1312 = vld1s_s16(&v5[istride * 28]); - int16x4_t v1370 = vld1s_s16(&v5[istride * 2]); - int16x4_t v1376 = vld1s_s16(&v5[istride * 18]); - int16x4_t v1384 = vld1s_s16(&v5[istride * 10]); - int16x4_t v1390 = vld1s_s16(&v5[istride * 26]); - int16x4_t v1409 = vld1s_s16(&v5[istride * 6]); - int16x4_t v1415 = vld1s_s16(&v5[istride * 22]); - int16x4_t v1423 = vld1s_s16(&v5[istride * 14]); - int16x4_t v1429 = 
vld1s_s16(&v5[istride * 30]); - int16x4_t v1578 = vld1s_s16(&v5[istride * 17]); - int16x4_t v1586 = vld1s_s16(&v5[istride * 9]); - int16x4_t v1592 = vld1s_s16(&v5[istride * 25]); - int16x4_t v1611 = vld1s_s16(&v5[istride * 5]); - int16x4_t v1617 = vld1s_s16(&v5[istride * 21]); - int16x4_t v1625 = vld1s_s16(&v5[istride * 13]); - int16x4_t v1631 = vld1s_s16(&v5[istride * 29]); - int16x4_t v1689 = vld1s_s16(&v5[istride * 3]); - int16x4_t v1695 = vld1s_s16(&v5[istride * 19]); - int16x4_t v1703 = vld1s_s16(&v5[istride * 11]); - int16x4_t v1709 = vld1s_s16(&v5[istride * 27]); - int16x4_t v1728 = vld1s_s16(&v5[istride * 7]); - int16x4_t v1734 = vld1s_s16(&v5[istride * 23]); - int16x4_t v1742 = vld1s_s16(&v5[istride * 15]); - int16x4_t v1748 = vld1s_s16(&v5[istride * 31]); - float32x2_t v1908 = vmul_f32(v2234, v1906); - float32x2_t v1969 = vmul_f32(v2234, v1967); - float32x2_t v2041 = vmul_f32(v2234, v2039); - float32x2_t v2091 = vmul_f32(v2234, v2089); - float32x2_t v2102 = vmul_f32(v2234, v2100); - float32x2_t v2152 = vmul_f32(v2234, v2150); - float32x2_t v2163 = vmul_f32(v2234, v2161); - float32x2_t v2213 = vmul_f32(v2234, v2211); - float32x2_t v2224 = vmul_f32(v2234, v2222); - float32x2_t v2235 = vmul_f32(v2234, v2233); - float32x2_t v1260 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1259)), 15); - float32x2_t v1268 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1267)), 15); - float32x2_t v1274 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1273)), 15); - float32x2_t v1293 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1292)), 15); - float32x2_t v1299 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1298)), 15); - float32x2_t v1307 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1306)), 15); - float32x2_t v1313 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1312)), 15); - float32x2_t v1371 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1370)), 15); - float32x2_t v1377 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1376)), 15); - float32x2_t v1385 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1384)), 15); - float32x2_t v1391 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1390)), 15); - float32x2_t v1410 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1409)), 15); - float32x2_t v1416 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1415)), 15); - float32x2_t v1424 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1423)), 15); - float32x2_t v1430 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1429)), 15); - float32x2_t v1579 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1578)), 15); - float32x2_t v1587 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1586)), 15); - float32x2_t v1593 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1592)), 15); - float32x2_t v1612 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1611)), 15); - float32x2_t v1618 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1617)), 15); - float32x2_t v1626 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1625)), 15); - float32x2_t v1632 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1631)), 15); - float32x2_t v1690 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1689)), 15); - float32x2_t v1696 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1695)), 15); - float32x2_t v1704 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1703)), 15); - float32x2_t v1710 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1709)), 15); - float32x2_t v1729 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1728)), 15); - float32x2_t v1735 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1734)), 15); - float32x2_t v1743 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1742)), 15); - float32x2_t v1749 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v1748)), 15); - float32x2_t v1261 = vadd_f32(v1254, v1260); - float32x2_t v1262 = vsub_f32(v1254, v1260); - float32x2_t v1275 = vadd_f32(v1268, v1274); - float32x2_t v1276 = vsub_f32(v1268, v1274); - float32x2_t v1300 = vadd_f32(v1293, v1299); - float32x2_t v1301 = vsub_f32(v1293, v1299); - float32x2_t v1314 = vadd_f32(v1307, v1313); - float32x2_t v1315 = vsub_f32(v1307, v1313); - float32x2_t v1378 = vadd_f32(v1371, v1377); - float32x2_t v1379 = vsub_f32(v1371, v1377); - float32x2_t v1392 = vadd_f32(v1385, v1391); - float32x2_t v1393 = 
vsub_f32(v1385, v1391); - float32x2_t v1417 = vadd_f32(v1410, v1416); - float32x2_t v1418 = vsub_f32(v1410, v1416); - float32x2_t v1431 = vadd_f32(v1424, v1430); - float32x2_t v1432 = vsub_f32(v1424, v1430); - float32x2_t v1580 = vadd_f32(v1573, v1579); - float32x2_t v1581 = vsub_f32(v1573, v1579); - float32x2_t v1594 = vadd_f32(v1587, v1593); - float32x2_t v1595 = vsub_f32(v1587, v1593); - float32x2_t v1619 = vadd_f32(v1612, v1618); - float32x2_t v1620 = vsub_f32(v1612, v1618); - float32x2_t v1633 = vadd_f32(v1626, v1632); - float32x2_t v1634 = vsub_f32(v1626, v1632); - float32x2_t v1697 = vadd_f32(v1690, v1696); - float32x2_t v1698 = vsub_f32(v1690, v1696); - float32x2_t v1711 = vadd_f32(v1704, v1710); - float32x2_t v1712 = vsub_f32(v1704, v1710); - float32x2_t v1736 = vadd_f32(v1729, v1735); - float32x2_t v1737 = vsub_f32(v1729, v1735); - float32x2_t v1750 = vadd_f32(v1743, v1749); - float32x2_t v1751 = vsub_f32(v1743, v1749); - float32x2_t v1282 = vrev64_f32(v1276); - float32x2_t v1284 = vadd_f32(v1261, v1275); - float32x2_t v1285 = vsub_f32(v1261, v1275); - float32x2_t v1316 = vadd_f32(v1300, v1314); - float32x2_t v1317 = vsub_f32(v1300, v1314); - float32x2_t v1332 = vmul_f32(v1301, v2024); - float32x2_t v1343 = vmul_f32(v1315, v2035); - float32x2_t v1399 = vrev64_f32(v1393); - float32x2_t v1401 = vadd_f32(v1378, v1392); - float32x2_t v1402 = vsub_f32(v1378, v1392); - float32x2_t v1438 = vrev64_f32(v1432); - float32x2_t v1440 = vadd_f32(v1417, v1431); - float32x2_t v1441 = vsub_f32(v1417, v1431); - float32x2_t v1601 = vrev64_f32(v1595); - float32x2_t v1603 = vadd_f32(v1580, v1594); - float32x2_t v1604 = vsub_f32(v1580, v1594); - float32x2_t v1635 = vadd_f32(v1619, v1633); - float32x2_t v1636 = vsub_f32(v1619, v1633); - float32x2_t v1651 = vmul_f32(v1620, v2024); - float32x2_t v1662 = vmul_f32(v1634, v2035); - float32x2_t v1718 = vrev64_f32(v1712); - float32x2_t v1720 = vadd_f32(v1697, v1711); - float32x2_t v1721 = vsub_f32(v1697, v1711); - float32x2_t v1752 = 
vadd_f32(v1736, v1750); - float32x2_t v1753 = vsub_f32(v1736, v1750); - float32x2_t v1768 = vmul_f32(v1737, v2024); - float32x2_t v1779 = vmul_f32(v1751, v2035); - float32x2_t v1283 = vmul_f32(v1282, v2041); - float32x2_t v1323 = vrev64_f32(v1317); - float32x2_t v1325 = vadd_f32(v1284, v1316); - float32x2_t v1326 = vsub_f32(v1284, v1316); - float32x2_t v1338 = vrev64_f32(v1332); - float32x2_t v1349 = vrev64_f32(v1343); - float32x2_t v1400 = vmul_f32(v1399, v2041); - float32x2_t v1439 = vmul_f32(v1438, v2041); - float32x2_t v1444 = vadd_f32(v1401, v1440); - float32x2_t v1445 = vsub_f32(v1401, v1440); - float32x2_t v1497 = vmul_f32(v1402, v2024); - float32x2_t v1508 = vmul_f32(v1441, v2035); - float32x2_t v1602 = vmul_f32(v1601, v2041); - float32x2_t v1642 = vrev64_f32(v1636); - float32x2_t v1644 = vadd_f32(v1603, v1635); - float32x2_t v1645 = vsub_f32(v1603, v1635); - float32x2_t v1657 = vrev64_f32(v1651); - float32x2_t v1668 = vrev64_f32(v1662); - float32x2_t v1719 = vmul_f32(v1718, v2041); - float32x2_t v1759 = vrev64_f32(v1753); - float32x2_t v1761 = vadd_f32(v1720, v1752); - float32x2_t v1762 = vsub_f32(v1720, v1752); - float32x2_t v1774 = vrev64_f32(v1768); - float32x2_t v1785 = vrev64_f32(v1779); - float32x2_t v1286 = vsub_f32(v1262, v1283); - float32x2_t v1287 = vadd_f32(v1262, v1283); - float32x2_t v1324 = vmul_f32(v1323, v2041); - float32x2_t v1339 = vmul_f32(v1338, v2235); - float32x2_t v1350 = vmul_f32(v1349, v2041); - float32x2_t v1403 = vsub_f32(v1379, v1400); - float32x2_t v1404 = vadd_f32(v1379, v1400); - float32x2_t v1442 = vsub_f32(v1418, v1439); - float32x2_t v1443 = vadd_f32(v1418, v1439); - float32x2_t v1451 = vrev64_f32(v1445); - float32x2_t v1453 = vadd_f32(v1325, v1444); - float32x2_t v1454 = vsub_f32(v1325, v1444); - float32x2_t v1503 = vrev64_f32(v1497); - float32x2_t v1514 = vrev64_f32(v1508); - float32x2_t v1605 = vsub_f32(v1581, v1602); - float32x2_t v1606 = vadd_f32(v1581, v1602); - float32x2_t v1643 = vmul_f32(v1642, v2041); - 
float32x2_t v1658 = vmul_f32(v1657, v2235); - float32x2_t v1669 = vmul_f32(v1668, v2041); - float32x2_t v1722 = vsub_f32(v1698, v1719); - float32x2_t v1723 = vadd_f32(v1698, v1719); - float32x2_t v1760 = vmul_f32(v1759, v2041); - float32x2_t v1775 = vmul_f32(v1774, v2235); - float32x2_t v1786 = vmul_f32(v1785, v2041); - float32x2_t v1802 = vadd_f32(v1644, v1761); - float32x2_t v1803 = vsub_f32(v1644, v1761); - float32x2_t v2025 = vmul_f32(v1645, v2024); - float32x2_t v2036 = vmul_f32(v1762, v2035); - float32x2_t v1327 = vsub_f32(v1285, v1324); - float32x2_t v1328 = vadd_f32(v1285, v1324); - float32x2_t v1351 = vadd_f32(v1332, v1339); - float32x2_t v1352 = vadd_f32(v1343, v1350); - float32x2_t v1452 = vmul_f32(v1451, v2041); - float32x2_t v1460 = vmul_f32(v1403, v1902); - float32x2_t v1466 = vrev64_f32(v1403); - float32x2_t v1471 = vmul_f32(v1442, v2146); - float32x2_t v1477 = vrev64_f32(v1442); - float32x2_t v1504 = vmul_f32(v1503, v2235); - float32x2_t v1515 = vmul_f32(v1514, v2041); - float32x2_t v1534 = vmul_f32(v1404, v2146); - float32x2_t v1540 = vrev64_f32(v1404); - float32x2_t v1545 = vmul_f32(v1443, v2157); - float32x2_t v1551 = vrev64_f32(v1443); - float32x2_t v1646 = vsub_f32(v1604, v1643); - float32x2_t v1647 = vadd_f32(v1604, v1643); - float32x2_t v1670 = vadd_f32(v1651, v1658); - float32x2_t v1671 = vadd_f32(v1662, v1669); - float32x2_t v1763 = vsub_f32(v1721, v1760); - float32x2_t v1764 = vadd_f32(v1721, v1760); - float32x2_t v1787 = vadd_f32(v1768, v1775); - float32x2_t v1788 = vadd_f32(v1779, v1786); - float32x2_t v1809 = vrev64_f32(v1803); - float32x2_t v1811 = vadd_f32(v1453, v1802); - float32x2_t v1812 = vsub_f32(v1453, v1802); - float32x2_t v2031 = vrev64_f32(v2025); - float32x2_t v2042 = vrev64_f32(v2036); - float32x2_t v1353 = vadd_f32(v1351, v1352); - float32x2_t v1354 = vsub_f32(v1352, v1351); - float32x2_t v1455 = vsub_f32(v1326, v1452); - float32x2_t v1456 = vadd_f32(v1326, v1452); - float32x2_t v1516 = vadd_f32(v1497, v1504); - 
float32x2_t v1517 = vadd_f32(v1508, v1515); - float32x2_t v1672 = vadd_f32(v1670, v1671); - float32x2_t v1673 = vsub_f32(v1671, v1670); - float32x2_t v1789 = vadd_f32(v1787, v1788); - float32x2_t v1790 = vsub_f32(v1788, v1787); - float32x2_t v1810 = vmul_f32(v1809, v2041); - int16x4_t v1817 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1811, 15), (int32x2_t){0, 0})); - int16x4_t v1829 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1812, 15), (int32x2_t){0, 0})); - float32x2_t v1903 = vmul_f32(v1646, v1902); - float32x2_t v1909 = vrev64_f32(v1646); - float32x2_t v1914 = vmul_f32(v1763, v2146); - float32x2_t v1920 = vrev64_f32(v1763); - float32x2_t v2032 = vmul_f32(v2031, v2235); - float32x2_t v2043 = vmul_f32(v2042, v2041); - float32x2_t v2147 = vmul_f32(v1647, v2146); - float32x2_t v2153 = vrev64_f32(v1647); - float32x2_t v2158 = vmul_f32(v1764, v2157); - float32x2_t v2164 = vrev64_f32(v1764); - float32x2_t v1360 = vrev64_f32(v1354); - float32x2_t v1362 = vadd_f32(v1286, v1353); - float32x2_t v1363 = vsub_f32(v1286, v1353); - float32x2_t v1479 = vfma_f32(v1460, v1466, v1908); - float32x2_t v1480 = vfma_f32(v1471, v1477, v2152); - float32x2_t v1518 = vadd_f32(v1516, v1517); - float32x2_t v1519 = vsub_f32(v1517, v1516); - float32x2_t v1553 = vfma_f32(v1534, v1540, v2152); - float32x2_t v1554 = vfma_f32(v1545, v1551, v2163); - float32x2_t v1679 = vrev64_f32(v1673); - float32x2_t v1681 = vadd_f32(v1605, v1672); - float32x2_t v1682 = vsub_f32(v1605, v1672); - float32x2_t v1796 = vrev64_f32(v1790); - float32x2_t v1798 = vadd_f32(v1722, v1789); - float32x2_t v1799 = vsub_f32(v1722, v1789); - float32x2_t v1813 = vsub_f32(v1454, v1810); - float32x2_t v1814 = vadd_f32(v1454, v1810); - v6[0] = vget_lane_s32(vreinterpret_s32_s16(v1817), 0); - v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1829), 0); - float32x2_t v2044 = vadd_f32(v2025, v2032); - float32x2_t v2045 = vadd_f32(v2036, v2043); - float32x2_t v1361 = vmul_f32(v1360, v2235); - float32x2_t v1481 = vadd_f32(v1479, 
v1480); - float32x2_t v1482 = vsub_f32(v1480, v1479); - float32x2_t v1525 = vrev64_f32(v1519); - float32x2_t v1527 = vadd_f32(v1327, v1518); - float32x2_t v1528 = vsub_f32(v1327, v1518); - float32x2_t v1555 = vadd_f32(v1553, v1554); - float32x2_t v1556 = vsub_f32(v1554, v1553); - float32x2_t v1680 = vmul_f32(v1679, v2235); - float32x2_t v1797 = vmul_f32(v1796, v2235); - int16x4_t v1823 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1813, 15), (int32x2_t){0, 0})); - int16x4_t v1835 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1814, 15), (int32x2_t){0, 0})); - float32x2_t v1842 = vmul_f32(v1681, v1841); - float32x2_t v1848 = vrev64_f32(v1681); - float32x2_t v1853 = vmul_f32(v1798, v1963); - float32x2_t v1859 = vrev64_f32(v1798); - float32x2_t v1922 = vfma_f32(v1903, v1909, v1908); - float32x2_t v1923 = vfma_f32(v1914, v1920, v2152); - float32x2_t v2046 = vadd_f32(v2044, v2045); - float32x2_t v2047 = vsub_f32(v2045, v2044); - float32x2_t v2086 = vmul_f32(v1682, v2085); - float32x2_t v2092 = vrev64_f32(v1682); - float32x2_t v2097 = vmul_f32(v1799, v2096); - float32x2_t v2103 = vrev64_f32(v1799); - float32x2_t v2166 = vfma_f32(v2147, v2153, v2152); - float32x2_t v2167 = vfma_f32(v2158, v2164, v2163); - float32x2_t v1364 = vsub_f32(v1287, v1361); - float32x2_t v1365 = vadd_f32(v1287, v1361); - float32x2_t v1488 = vrev64_f32(v1482); - float32x2_t v1490 = vadd_f32(v1362, v1481); - float32x2_t v1491 = vsub_f32(v1362, v1481); - float32x2_t v1526 = vmul_f32(v1525, v2235); - float32x2_t v1562 = vrev64_f32(v1556); - float32x2_t v1683 = vsub_f32(v1606, v1680); - float32x2_t v1684 = vadd_f32(v1606, v1680); - float32x2_t v1800 = vsub_f32(v1723, v1797); - float32x2_t v1801 = vadd_f32(v1723, v1797); - v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1823), 0); - v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v1835), 0); - float32x2_t v1924 = vadd_f32(v1922, v1923); - float32x2_t v1925 = vsub_f32(v1923, v1922); - float32x2_t v2053 = vrev64_f32(v2047); - float32x2_t v2055 = 
vadd_f32(v1455, v2046); - float32x2_t v2056 = vsub_f32(v1455, v2046); - float32x2_t v2168 = vadd_f32(v2166, v2167); - float32x2_t v2169 = vsub_f32(v2167, v2166); - float32x2_t v1489 = vmul_f32(v1488, v2235); - float32x2_t v1529 = vsub_f32(v1328, v1526); - float32x2_t v1530 = vadd_f32(v1328, v1526); - float32x2_t v1563 = vmul_f32(v1562, v2235); - float32x2_t v1564 = vadd_f32(v1364, v1555); - float32x2_t v1565 = vsub_f32(v1364, v1555); - float32x2_t v1861 = vfma_f32(v1842, v1848, v2102); - float32x2_t v1862 = vfma_f32(v1853, v1859, v1969); - float32x2_t v1931 = vrev64_f32(v1925); - float32x2_t v1933 = vadd_f32(v1527, v1924); - float32x2_t v1934 = vsub_f32(v1527, v1924); - float32x2_t v1964 = vmul_f32(v1683, v1963); - float32x2_t v1970 = vrev64_f32(v1683); - float32x2_t v1975 = vmul_f32(v1800, v1974); - float32x2_t v1981 = vrev64_f32(v1800); - float32x2_t v2054 = vmul_f32(v2053, v2235); - int16x4_t v2061 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2055, 15), (int32x2_t){0, 0})); - int16x4_t v2073 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2056, 15), (int32x2_t){0, 0})); - float32x2_t v2105 = vfma_f32(v2086, v2092, v2091); - float32x2_t v2106 = vfma_f32(v2097, v2103, v2102); - float32x2_t v2175 = vrev64_f32(v2169); - float32x2_t v2208 = vmul_f32(v1684, v2207); - float32x2_t v2214 = vrev64_f32(v1684); - float32x2_t v2219 = vmul_f32(v1801, v2218); - float32x2_t v2225 = vrev64_f32(v1801); - float32x2_t v1492 = vsub_f32(v1363, v1489); - float32x2_t v1493 = vadd_f32(v1363, v1489); - float32x2_t v1566 = vsub_f32(v1365, v1563); - float32x2_t v1567 = vadd_f32(v1365, v1563); - float32x2_t v1863 = vadd_f32(v1861, v1862); - float32x2_t v1864 = vsub_f32(v1862, v1861); - float32x2_t v1932 = vmul_f32(v1931, v2235); - int16x4_t v1939 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1933, 15), (int32x2_t){0, 0})); - int16x4_t v1951 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1934, 15), (int32x2_t){0, 0})); - float32x2_t v2057 = vsub_f32(v1456, v2054); - float32x2_t v2058 = vadd_f32(v1456, 
v2054); - v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v2061), 0); - v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v2073), 0); - float32x2_t v2107 = vadd_f32(v2105, v2106); - float32x2_t v2108 = vsub_f32(v2106, v2105); - float32x2_t v2176 = vmul_f32(v2175, v2235); - float32x2_t v2177 = vadd_f32(v1529, v2168); - float32x2_t v2178 = vsub_f32(v1529, v2168); - float32x2_t v1870 = vrev64_f32(v1864); - float32x2_t v1872 = vadd_f32(v1490, v1863); - float32x2_t v1873 = vsub_f32(v1490, v1863); - float32x2_t v1935 = vsub_f32(v1528, v1932); - float32x2_t v1936 = vadd_f32(v1528, v1932); - v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1939), 0); - v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1951), 0); - float32x2_t v1983 = vfma_f32(v1964, v1970, v1969); - float32x2_t v1984 = vfma_f32(v1975, v1981, v2213); - int16x4_t v2067 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2057, 15), (int32x2_t){0, 0})); - int16x4_t v2079 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2058, 15), (int32x2_t){0, 0})); - float32x2_t v2114 = vrev64_f32(v2108); - float32x2_t v2116 = vadd_f32(v1492, v2107); - float32x2_t v2117 = vsub_f32(v1492, v2107); - float32x2_t v2179 = vsub_f32(v1530, v2176); - float32x2_t v2180 = vadd_f32(v1530, v2176); - int16x4_t v2183 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2177, 15), (int32x2_t){0, 0})); - int16x4_t v2195 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2178, 15), (int32x2_t){0, 0})); - float32x2_t v2227 = vfma_f32(v2208, v2214, v2213); - float32x2_t v2228 = vfma_f32(v2219, v2225, v2224); - float32x2_t v1871 = vmul_f32(v1870, v2235); - int16x4_t v1878 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1872, 15), (int32x2_t){0, 0})); - int16x4_t v1890 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1873, 15), (int32x2_t){0, 0})); - int16x4_t v1945 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1935, 15), (int32x2_t){0, 0})); - int16x4_t v1957 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1936, 15), (int32x2_t){0, 0})); - float32x2_t v1985 = 
vadd_f32(v1983, v1984); - float32x2_t v1986 = vsub_f32(v1984, v1983); - v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v2067), 0); - v6[ostride * 28] = vget_lane_s32(vreinterpret_s32_s16(v2079), 0); - float32x2_t v2115 = vmul_f32(v2114, v2235); - int16x4_t v2122 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2116, 15), (int32x2_t){0, 0})); - int16x4_t v2134 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2117, 15), (int32x2_t){0, 0})); - v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v2183), 0); - int16x4_t v2189 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2179, 15), (int32x2_t){0, 0})); - v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v2195), 0); - int16x4_t v2201 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2180, 15), (int32x2_t){0, 0})); - float32x2_t v2229 = vadd_f32(v2227, v2228); - float32x2_t v2230 = vsub_f32(v2228, v2227); - float32x2_t v1874 = vsub_f32(v1491, v1871); - float32x2_t v1875 = vadd_f32(v1491, v1871); - v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v1878), 0); - v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1890), 0); - v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v1945), 0); - v6[ostride * 26] = vget_lane_s32(vreinterpret_s32_s16(v1957), 0); - float32x2_t v1992 = vrev64_f32(v1986); - float32x2_t v1994 = vadd_f32(v1564, v1985); - float32x2_t v1995 = vsub_f32(v1564, v1985); - float32x2_t v2118 = vsub_f32(v1493, v2115); - float32x2_t v2119 = vadd_f32(v1493, v2115); - v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v2122), 0); - v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v2134), 0); - v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v2189), 0); - v6[ostride * 30] = vget_lane_s32(vreinterpret_s32_s16(v2201), 0); - float32x2_t v2236 = vrev64_f32(v2230); - float32x2_t v2238 = vadd_f32(v1566, v2229); - float32x2_t v2239 = vsub_f32(v1566, v2229); - int16x4_t v1884 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1874, 15), (int32x2_t){0, 0})); - int16x4_t v1896 = - 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1875, 15), (int32x2_t){0, 0})); - float32x2_t v1993 = vmul_f32(v1992, v2235); - int16x4_t v2000 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1994, 15), (int32x2_t){0, 0})); - int16x4_t v2012 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1995, 15), (int32x2_t){0, 0})); - int16x4_t v2128 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2118, 15), (int32x2_t){0, 0})); - int16x4_t v2140 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2119, 15), (int32x2_t){0, 0})); - float32x2_t v2237 = vmul_f32(v2236, v2235); - int16x4_t v2244 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2238, 15), (int32x2_t){0, 0})); - int16x4_t v2256 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2239, 15), (int32x2_t){0, 0})); - v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1884), 0); - v6[ostride * 25] = vget_lane_s32(vreinterpret_s32_s16(v1896), 0); - float32x2_t v1996 = vsub_f32(v1565, v1993); - float32x2_t v1997 = vadd_f32(v1565, v1993); - v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v2000), 0); - v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v2012), 0); - v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v2128), 0); - v6[ostride * 29] = vget_lane_s32(vreinterpret_s32_s16(v2140), 0); - float32x2_t v2240 = vsub_f32(v1567, v2237); - float32x2_t v2241 = vadd_f32(v1567, v2237); - v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v2244), 0); - v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v2256), 0); - int16x4_t v2006 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1996, 15), (int32x2_t){0, 0})); - int16x4_t v2018 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1997, 15), (int32x2_t){0, 0})); - int16x4_t v2250 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2240, 15), (int32x2_t){0, 0})); - int16x4_t v2262 = - vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v2241, 15), (int32x2_t){0, 0})); - v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v2006), 0); - v6[ostride * 27] = vget_lane_s32(vreinterpret_s32_s16(v2018), 0); - v6[ostride * 15] = 
vget_lane_s32(vreinterpret_s32_s16(v2250), 0); - v6[ostride * 31] = vget_lane_s32(vreinterpret_s32_s16(v2262), 0); - v5 += 1 * 1; - v6 += 1 * 1; - } -} -#endif - -#ifdef ARMRAL_ARCH_SVE -void armral_fft_cs16_cf32_cs16_ac_n_uu32(const armral_cmplx_int16_t *restrict x, - armral_cmplx_int16_t *restrict y, - int istride, int ostride, int howmany, - float dir) { - int64_t v0 = istride; - int64_t v2 = ostride; - float v4 = dir; - const int32_t *v5 = (const int32_t *)x; - int32_t *v6 = (int32_t *)y; - int64_t v8 = howmany; - int64_t v10 = svcntd(); - int64_t v11 = v10 * 1; - int64_t v12 = v10 * 1; - for (int j = 0; j < v8; j += v10) { - svbool_t pred_full = svwhilelt_b32(j * 2, howmany * 2); - float v847 = -1.9509032201612819e-01F; - float v906 = 7.0710678118654757e-01F; - float v918 = -7.0710678118654746e-01F; - float v923 = -1.0000000000000000e+00F; - float v977 = 5.5557023301960229e-01F; - float v982 = 8.3146961230254524e-01F; - float v989 = -9.8078528040323043e-01F; - float v1048 = 3.8268343236508984e-01F; - float v1053 = 9.2387953251128674e-01F; - float v1060 = -9.2387953251128685e-01F; - float v1065 = -3.8268343236508967e-01F; - float v1119 = 1.9509032201612833e-01F; - float v1124 = 9.8078528040323043e-01F; - float v1131 = -5.5557023301960218e-01F; - float v1136 = -8.3146961230254524e-01F; - float v1147 = 1.0000000000000000e+00F; - const int32_t *v1365 = &v5[v0]; - int32_t *v1566 = &v6[v2]; - int64_t v27 = v0 * 16; - int64_t v37 = v0 * 8; - int64_t v45 = v0 * 24; - int64_t v66 = v0 * 4; - int64_t v74 = v0 * 20; - int64_t v84 = v0 * 12; - int64_t v92 = v0 * 28; - int64_t v154 = v0 * 2; - int64_t v162 = v0 * 18; - int64_t v172 = v0 * 10; - int64_t v180 = v0 * 26; - int64_t v201 = v0 * 6; - int64_t v209 = v0 * 22; - int64_t v219 = v0 * 14; - int64_t v227 = v0 * 30; - int64_t v386 = v0 * 17; - int64_t v396 = v0 * 9; - int64_t v404 = v0 * 25; - int64_t v425 = v0 * 5; - int64_t v433 = v0 * 21; - int64_t v443 = v0 * 13; - int64_t v451 = v0 * 29; - int64_t v513 = v0 * 3; - 
int64_t v521 = v0 * 19; - int64_t v531 = v0 * 11; - int64_t v539 = v0 * 27; - int64_t v560 = v0 * 7; - int64_t v568 = v0 * 23; - int64_t v578 = v0 * 15; - int64_t v586 = v0 * 31; - int64_t v669 = v2 * 8; - int64_t v677 = v2 * 16; - int64_t v685 = v2 * 24; - int64_t v740 = v2 * 9; - int64_t v748 = v2 * 17; - int64_t v756 = v2 * 25; - float v772 = v4 * v1048; - int64_t v803 = v2 * 2; - int64_t v811 = v2 * 10; - int64_t v819 = v2 * 18; - int64_t v827 = v2 * 26; - float v843 = v4 * v977; - int64_t v874 = v2 * 3; - int64_t v882 = v2 * 11; - int64_t v890 = v2 * 19; - int64_t v898 = v2 * 27; - float v926 = v4 * v923; - int64_t v945 = v2 * 4; - int64_t v953 = v2 * 12; - int64_t v961 = v2 * 20; - int64_t v969 = v2 * 28; - float v985 = v4 * v982; - float v997 = v4 * v1119; - int64_t v1016 = v2 * 5; - int64_t v1024 = v2 * 13; - int64_t v1032 = v2 * 21; - int64_t v1040 = v2 * 29; - float v1056 = v4 * v1053; - float v1068 = v4 * v1065; - int64_t v1087 = v2 * 6; - int64_t v1095 = v2 * 14; - int64_t v1103 = v2 * 22; - int64_t v1111 = v2 * 30; - float v1127 = v4 * v1124; - float v1139 = v4 * v1136; - float v1150 = v4 * v1147; - int64_t v1158 = v2 * 7; - int64_t v1166 = v2 * 15; - int64_t v1174 = v2 * 23; - int64_t v1182 = v2 * 31; - const int32_t *v1196 = &v5[0]; - int32_t *v1525 = &v6[0]; - svfloat32_t v1555 = svdup_n_f32(v1124); - svfloat32_t v1596 = svdup_n_f32(v1053); - svfloat32_t v1637 = svdup_n_f32(v982); - svfloat32_t v1639 = svdup_n_f32(v847); - svfloat32_t v1678 = svdup_n_f32(v906); - svfloat32_t v1680 = svdup_n_f32(v918); - svfloat32_t v1719 = svdup_n_f32(v977); - svfloat32_t v1721 = svdup_n_f32(v989); - svfloat32_t v1760 = svdup_n_f32(v1048); - svfloat32_t v1762 = svdup_n_f32(v1060); - svfloat32_t v1801 = svdup_n_f32(v1119); - svfloat32_t v1803 = svdup_n_f32(v1131); - svfloat32_t v384 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1365[0])), - 1.F / (1ULL << 15ULL)); - const int32_t *v1205 = &v5[v27]; - const 
int32_t *v1214 = &v5[v37]; - const int32_t *v1223 = &v5[v45]; - const int32_t *v1233 = &v5[v66]; - const int32_t *v1242 = &v5[v74]; - const int32_t *v1251 = &v5[v84]; - const int32_t *v1260 = &v5[v92]; - const int32_t *v1275 = &v5[v154]; - const int32_t *v1284 = &v5[v162]; - const int32_t *v1293 = &v5[v172]; - const int32_t *v1302 = &v5[v180]; - const int32_t *v1312 = &v5[v201]; - const int32_t *v1321 = &v5[v209]; - const int32_t *v1330 = &v5[v219]; - const int32_t *v1339 = &v5[v227]; - const int32_t *v1374 = &v5[v386]; - const int32_t *v1383 = &v5[v396]; - const int32_t *v1392 = &v5[v404]; - const int32_t *v1402 = &v5[v425]; - const int32_t *v1411 = &v5[v433]; - const int32_t *v1420 = &v5[v443]; - const int32_t *v1429 = &v5[v451]; - const int32_t *v1444 = &v5[v513]; - const int32_t *v1453 = &v5[v521]; - const int32_t *v1462 = &v5[v531]; - const int32_t *v1471 = &v5[v539]; - const int32_t *v1481 = &v5[v560]; - const int32_t *v1490 = &v5[v568]; - const int32_t *v1499 = &v5[v578]; - const int32_t *v1508 = &v5[v586]; - int32_t *v1534 = &v6[v669]; - int32_t *v1543 = &v6[v677]; - int32_t *v1552 = &v6[v685]; - int32_t *v1575 = &v6[v740]; - int32_t *v1584 = &v6[v748]; - int32_t *v1593 = &v6[v756]; - svfloat32_t v1597 = svdup_n_f32(v772); - int32_t *v1607 = &v6[v803]; - int32_t *v1616 = &v6[v811]; - int32_t *v1625 = &v6[v819]; - int32_t *v1634 = &v6[v827]; - svfloat32_t v1638 = svdup_n_f32(v843); - int32_t *v1648 = &v6[v874]; - int32_t *v1657 = &v6[v882]; - int32_t *v1666 = &v6[v890]; - int32_t *v1675 = &v6[v898]; - svfloat32_t v1681 = svdup_n_f32(v926); - int32_t *v1689 = &v6[v945]; - int32_t *v1698 = &v6[v953]; - int32_t *v1707 = &v6[v961]; - int32_t *v1716 = &v6[v969]; - svfloat32_t v1720 = svdup_n_f32(v985); - svfloat32_t v1722 = svdup_n_f32(v997); - int32_t *v1730 = &v6[v1016]; - int32_t *v1739 = &v6[v1024]; - int32_t *v1748 = &v6[v1032]; - int32_t *v1757 = &v6[v1040]; - svfloat32_t v1761 = svdup_n_f32(v1056); - svfloat32_t v1763 = svdup_n_f32(v1068); - int32_t *v1771 
= &v6[v1087]; - int32_t *v1780 = &v6[v1095]; - int32_t *v1789 = &v6[v1103]; - int32_t *v1798 = &v6[v1111]; - svfloat32_t v1802 = svdup_n_f32(v1127); - svfloat32_t v1804 = svdup_n_f32(v1139); - svfloat32_t v1805 = svdup_n_f32(v1150); - int32_t *v1812 = &v6[v1158]; - int32_t *v1821 = &v6[v1166]; - int32_t *v1830 = &v6[v1174]; - int32_t *v1839 = &v6[v1182]; - svfloat32_t v25 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1196[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v33 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1205[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v43 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1214[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v51 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1223[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v72 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1233[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v80 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1242[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v90 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1251[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v98 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1260[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v160 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1275[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v168 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1284[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v178 = svmul_n_f32_x( - pred_full, - 
svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1293[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v186 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1302[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v207 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1312[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v215 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1321[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v225 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1330[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v233 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1339[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v392 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1374[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v402 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1383[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v410 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1392[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v431 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1402[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v439 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1411[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v449 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1420[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v457 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1429[0])), - 1.F / 
(1ULL << 15ULL)); - svfloat32_t v519 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1444[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v527 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1453[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v537 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1462[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v545 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1471[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v566 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1481[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v574 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1490[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v584 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1499[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v592 = svmul_n_f32_x( - pred_full, - svcvt_f32_s32_x(pred_full, - svld1sh_s32(pred_full, (const int16_t *)&v1508[0])), - 1.F / (1ULL << 15ULL)); - svfloat32_t v34; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v34) : "w"(v25), "w"(v33)); - svfloat32_t v35; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v35) : "w"(v25), "w"(v33)); - svfloat32_t v52; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v52) : "w"(v43), "w"(v51)); - svfloat32_t v53; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v53) : "w"(v43), "w"(v51)); - svfloat32_t v81; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v81) : "w"(v72), "w"(v80)); - svfloat32_t v82; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v82) : "w"(v72), "w"(v80)); - svfloat32_t v99; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v99) : "w"(v90), "w"(v98)); - svfloat32_t v100; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v100) : "w"(v90), "w"(v98)); - svfloat32_t v169; - asm("fadd 
%0.s, %1.s, %2.s" : "=w"(v169) : "w"(v160), "w"(v168)); - svfloat32_t v170; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v170) : "w"(v160), "w"(v168)); - svfloat32_t v187; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v187) : "w"(v178), "w"(v186)); - svfloat32_t v188; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v188) : "w"(v178), "w"(v186)); - svfloat32_t v216; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v216) : "w"(v207), "w"(v215)); - svfloat32_t v217; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v217) : "w"(v207), "w"(v215)); - svfloat32_t v234; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v234) : "w"(v225), "w"(v233)); - svfloat32_t v235; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v235) : "w"(v225), "w"(v233)); - svfloat32_t v393; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v393) : "w"(v384), "w"(v392)); - svfloat32_t v394; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v394) : "w"(v384), "w"(v392)); - svfloat32_t v411; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v411) : "w"(v402), "w"(v410)); - svfloat32_t v412; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v412) : "w"(v402), "w"(v410)); - svfloat32_t v440; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v440) : "w"(v431), "w"(v439)); - svfloat32_t v441; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v441) : "w"(v431), "w"(v439)); - svfloat32_t v458; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v458) : "w"(v449), "w"(v457)); - svfloat32_t v459; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v459) : "w"(v449), "w"(v457)); - svfloat32_t v528; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v528) : "w"(v519), "w"(v527)); - svfloat32_t v529; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v529) : "w"(v519), "w"(v527)); - svfloat32_t v546; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v546) : "w"(v537), "w"(v545)); - svfloat32_t v547; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v547) : "w"(v537), "w"(v545)); - svfloat32_t v575; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v575) : "w"(v566), "w"(v574)); - svfloat32_t v576; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v576) : "w"(v566), "w"(v574)); - svfloat32_t v593; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v593) : "w"(v584), "w"(v592)); - 
svfloat32_t v594; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v594) : "w"(v584), "w"(v592)); - svfloat32_t zero60; - asm volatile("mov %0.s, #0" : "=w"(zero60)); - svfloat32_t v60 = svcmla_f32_x(pred_full, zero60, v1681, v53, 90); - svfloat32_t v61; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v61) : "w"(v34), "w"(v52)); - svfloat32_t v62; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v62) : "w"(v34), "w"(v52)); - svfloat32_t v101; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v101) : "w"(v81), "w"(v99)); - svfloat32_t v102; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v102) : "w"(v81), "w"(v99)); - svfloat32_t v118; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v118) : "w"(v82), "w"(v1678)); - svfloat32_t v130; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v130) : "w"(v100), "w"(v1680)); - svfloat32_t zero195; - asm volatile("mov %0.s, #0" : "=w"(zero195)); - svfloat32_t v195 = svcmla_f32_x(pred_full, zero195, v1681, v188, 90); - svfloat32_t v196; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v196) : "w"(v169), "w"(v187)); - svfloat32_t v197; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v197) : "w"(v169), "w"(v187)); - svfloat32_t zero242; - asm volatile("mov %0.s, #0" : "=w"(zero242)); - svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v1681, v235, 90); - svfloat32_t v243; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v243) : "w"(v216), "w"(v234)); - svfloat32_t v244; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v244) : "w"(v216), "w"(v234)); - svfloat32_t zero419; - asm volatile("mov %0.s, #0" : "=w"(zero419)); - svfloat32_t v419 = svcmla_f32_x(pred_full, zero419, v1681, v412, 90); - svfloat32_t v420; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v420) : "w"(v393), "w"(v411)); - svfloat32_t v421; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v421) : "w"(v393), "w"(v411)); - svfloat32_t v460; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v460) : "w"(v440), "w"(v458)); - svfloat32_t v461; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v461) : "w"(v440), "w"(v458)); - svfloat32_t v477; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v477) : "w"(v441), "w"(v1678)); - svfloat32_t v489; - asm("fmul 
%0.s, %1.s, %2.s" : "=w"(v489) : "w"(v459), "w"(v1680)); - svfloat32_t zero554; - asm volatile("mov %0.s, #0" : "=w"(zero554)); - svfloat32_t v554 = svcmla_f32_x(pred_full, zero554, v1681, v547, 90); - svfloat32_t v555; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v555) : "w"(v528), "w"(v546)); - svfloat32_t v556; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v556) : "w"(v528), "w"(v546)); - svfloat32_t v595; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v595) : "w"(v575), "w"(v593)); - svfloat32_t v596; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v596) : "w"(v575), "w"(v593)); - svfloat32_t v612; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v612) : "w"(v576), "w"(v1678)); - svfloat32_t v624; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v624) : "w"(v594), "w"(v1680)); - svfloat32_t v63; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v63) : "w"(v35), "w"(v60)); - svfloat32_t v64; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v64) : "w"(v35), "w"(v60)); - svfloat32_t zero109; - asm volatile("mov %0.s, #0" : "=w"(zero109)); - svfloat32_t v109 = svcmla_f32_x(pred_full, zero109, v1681, v102, 90); - svfloat32_t v110; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v110) : "w"(v61), "w"(v101)); - svfloat32_t v111; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v111) : "w"(v61), "w"(v101)); - svfloat32_t v198; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v198) : "w"(v170), "w"(v195)); - svfloat32_t v199; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v199) : "w"(v170), "w"(v195)); - svfloat32_t v245; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v245) : "w"(v217), "w"(v242)); - svfloat32_t v246; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v246) : "w"(v217), "w"(v242)); - svfloat32_t v247; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v247) : "w"(v196), "w"(v243)); - svfloat32_t v248; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v248) : "w"(v196), "w"(v243)); - svfloat32_t v303; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v303) : "w"(v197), "w"(v1678)); - svfloat32_t v315; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v315) : "w"(v244), "w"(v1680)); - svfloat32_t v422; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v422) : "w"(v394), 
"w"(v419)); - svfloat32_t v423; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v423) : "w"(v394), "w"(v419)); - svfloat32_t zero468; - asm volatile("mov %0.s, #0" : "=w"(zero468)); - svfloat32_t v468 = svcmla_f32_x(pred_full, zero468, v1681, v461, 90); - svfloat32_t v469; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v469) : "w"(v420), "w"(v460)); - svfloat32_t v470; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v470) : "w"(v420), "w"(v460)); - svfloat32_t v557; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v557) : "w"(v529), "w"(v554)); - svfloat32_t v558; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v558) : "w"(v529), "w"(v554)); - svfloat32_t zero603; - asm volatile("mov %0.s, #0" : "=w"(zero603)); - svfloat32_t v603 = svcmla_f32_x(pred_full, zero603, v1681, v596, 90); - svfloat32_t v604; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v604) : "w"(v555), "w"(v595)); - svfloat32_t v605; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v605) : "w"(v555), "w"(v595)); - svfloat32_t v112; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v112) : "w"(v62), "w"(v109)); - svfloat32_t v113; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v113) : "w"(v62), "w"(v109)); - svfloat32_t v138 = svcmla_f32_x(pred_full, v118, v1805, v118, 90); - svfloat32_t v139 = svcmla_f32_x(pred_full, v130, v1681, v130, 90); - svfloat32_t zero255; - asm volatile("mov %0.s, #0" : "=w"(zero255)); - svfloat32_t v255 = svcmla_f32_x(pred_full, zero255, v1681, v248, 90); - svfloat32_t v256; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v256) : "w"(v110), "w"(v247)); - svfloat32_t v257; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v257) : "w"(v110), "w"(v247)); - svfloat32_t v264; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v264) : "w"(v198), "w"(v1596)); - svfloat32_t v276; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v276) : "w"(v245), "w"(v1760)); - svfloat32_t v342; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v342) : "w"(v199), "w"(v1760)); - svfloat32_t v354; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v354) : "w"(v246), "w"(v1762)); - svfloat32_t v471; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v471) : "w"(v421), "w"(v468)); - 
svfloat32_t v472; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v472) : "w"(v421), "w"(v468)); - svfloat32_t v497 = svcmla_f32_x(pred_full, v477, v1805, v477, 90); - svfloat32_t v498 = svcmla_f32_x(pred_full, v489, v1681, v489, 90); - svfloat32_t v606; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v606) : "w"(v556), "w"(v603)); - svfloat32_t v607; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v607) : "w"(v556), "w"(v603)); - svfloat32_t v632 = svcmla_f32_x(pred_full, v612, v1805, v612, 90); - svfloat32_t v633 = svcmla_f32_x(pred_full, v624, v1681, v624, 90); - svfloat32_t v647; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v647) : "w"(v469), "w"(v604)); - svfloat32_t v648; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v648) : "w"(v469), "w"(v604)); - svfloat32_t v909; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v909) : "w"(v470), "w"(v1678)); - svfloat32_t v921; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v921) : "w"(v605), "w"(v1680)); - svfloat32_t v140; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v140) : "w"(v138), "w"(v139)); - svfloat32_t v141; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v141) : "w"(v139), "w"(v138)); - svfloat32_t v258; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v258) : "w"(v111), "w"(v255)); - svfloat32_t v259; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v259) : "w"(v111), "w"(v255)); - svfloat32_t v284 = svcmla_f32_x(pred_full, v264, v1597, v198, 90); - svfloat32_t v285 = svcmla_f32_x(pred_full, v276, v1761, v245, 90); - svfloat32_t v323 = svcmla_f32_x(pred_full, v303, v1805, v303, 90); - svfloat32_t v324 = svcmla_f32_x(pred_full, v315, v1681, v315, 90); - svfloat32_t v362 = svcmla_f32_x(pred_full, v342, v1761, v199, 90); - svfloat32_t v363 = svcmla_f32_x(pred_full, v354, v1763, v246, 90); - svfloat32_t v499; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v499) : "w"(v497), "w"(v498)); - svfloat32_t v500; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v500) : "w"(v498), "w"(v497)); - svfloat32_t v634; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v634) : "w"(v632), "w"(v633)); - svfloat32_t v635; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v635) : "w"(v633), 
"w"(v632)); - svfloat32_t zero655; - asm volatile("mov %0.s, #0" : "=w"(zero655)); - svfloat32_t v655 = svcmla_f32_x(pred_full, zero655, v1681, v648, 90); - svfloat32_t v656; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v656) : "w"(v256), "w"(v647)); - svfloat32_t v657; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v657) : "w"(v256), "w"(v647)); - svfloat32_t v767; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v767) : "w"(v471), "w"(v1596)); - svfloat32_t v779; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v779) : "w"(v606), "w"(v1760)); - svfloat32_t v1051; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1051) : "w"(v472), "w"(v1760)); - svfloat32_t v1063; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1063) : "w"(v607), "w"(v1762)); - svfloat32_t zero148; - asm volatile("mov %0.s, #0" : "=w"(zero148)); - svfloat32_t v148 = svcmla_f32_x(pred_full, zero148, v1805, v141, 90); - svfloat32_t v149; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v149) : "w"(v63), "w"(v140)); - svfloat32_t v150; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v150) : "w"(v63), "w"(v140)); - svfloat32_t v286; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v286) : "w"(v284), "w"(v285)); - svfloat32_t v287; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v287) : "w"(v285), "w"(v284)); - svfloat32_t v325; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v325) : "w"(v323), "w"(v324)); - svfloat32_t v326; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v326) : "w"(v324), "w"(v323)); - svfloat32_t v364; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v364) : "w"(v362), "w"(v363)); - svfloat32_t v365; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v365) : "w"(v363), "w"(v362)); - svfloat32_t zero507; - asm volatile("mov %0.s, #0" : "=w"(zero507)); - svfloat32_t v507 = svcmla_f32_x(pred_full, zero507, v1805, v500, 90); - svfloat32_t v508; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v508) : "w"(v422), "w"(v499)); - svfloat32_t v509; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v509) : "w"(v422), "w"(v499)); - svfloat32_t zero642; - asm volatile("mov %0.s, #0" : "=w"(zero642)); - svfloat32_t v642 = svcmla_f32_x(pred_full, zero642, v1805, v635, 90); - 
svfloat32_t v643; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v643) : "w"(v557), "w"(v634)); - svfloat32_t v644; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v644) : "w"(v557), "w"(v634)); - svfloat32_t v658; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v658) : "w"(v257), "w"(v655)); - svfloat32_t v659; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v659) : "w"(v257), "w"(v655)); - svint16_t v662 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v656, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v678 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v657, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v787 = svcmla_f32_x(pred_full, v767, v1597, v471, 90); - svfloat32_t v788 = svcmla_f32_x(pred_full, v779, v1761, v606, 90); - svfloat32_t v929 = svcmla_f32_x(pred_full, v909, v1805, v909, 90); - svfloat32_t v930 = svcmla_f32_x(pred_full, v921, v1681, v921, 90); - svfloat32_t v1071 = svcmla_f32_x(pred_full, v1051, v1761, v472, 90); - svfloat32_t v1072 = svcmla_f32_x(pred_full, v1063, v1763, v607, 90); - svfloat32_t v151; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v151) : "w"(v64), "w"(v148)); - svfloat32_t v152; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v152) : "w"(v64), "w"(v148)); - svfloat32_t zero294; - asm volatile("mov %0.s, #0" : "=w"(zero294)); - svfloat32_t v294 = svcmla_f32_x(pred_full, zero294, v1805, v287, 90); - svfloat32_t v295; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v295) : "w"(v149), "w"(v286)); - svfloat32_t v296; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v296) : "w"(v149), "w"(v286)); - svfloat32_t zero333; - asm volatile("mov %0.s, #0" : "=w"(zero333)); - svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v1805, v326, 90); - svfloat32_t v334; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v334) : "w"(v112), "w"(v325)); - svfloat32_t v335; - asm("fsub %0.s, %1.s, 
%2.s" : "=w"(v335) : "w"(v112), "w"(v325)); - svfloat32_t zero372; - asm volatile("mov %0.s, #0" : "=w"(zero372)); - svfloat32_t v372 = svcmla_f32_x(pred_full, zero372, v1805, v365, 90); - svfloat32_t v510; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v510) : "w"(v423), "w"(v507)); - svfloat32_t v511; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v511) : "w"(v423), "w"(v507)); - svfloat32_t v645; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v645) : "w"(v558), "w"(v642)); - svfloat32_t v646; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v646) : "w"(v558), "w"(v642)); - svint16_t v670 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v658, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v686 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v659, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v696; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v696) : "w"(v508), "w"(v1555)); - svfloat32_t v708; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v708) : "w"(v643), "w"(v1637)); - svfloat32_t v789; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v789) : "w"(v787), "w"(v788)); - svfloat32_t v790; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v790) : "w"(v788), "w"(v787)); - svfloat32_t v931; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v931) : "w"(v929), "w"(v930)); - svfloat32_t v932; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v932) : "w"(v930), "w"(v929)); - svfloat32_t v980; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v980) : "w"(v509), "w"(v1719)); - svfloat32_t v992; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v992) : "w"(v644), "w"(v1721)); - svfloat32_t v1073; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1073) : "w"(v1071), "w"(v1072)); - svfloat32_t v1074; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1074) : "w"(v1072), "w"(v1071)); - svst1w_u64(pred_full, (unsigned *)(v1525), svreinterpret_u64_s16(v662)); - svst1w_u64(pred_full, (unsigned 
*)(v1543), svreinterpret_u64_s16(v678)); - svfloat32_t v297; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v297) : "w"(v150), "w"(v294)); - svfloat32_t v298; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v298) : "w"(v150), "w"(v294)); - svfloat32_t v336; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v336) : "w"(v113), "w"(v333)); - svfloat32_t v337; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v337) : "w"(v113), "w"(v333)); - svfloat32_t v373; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v373) : "w"(v151), "w"(v364)); - svfloat32_t v374; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v374) : "w"(v151), "w"(v364)); - svfloat32_t v375; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v375) : "w"(v152), "w"(v372)); - svfloat32_t v376; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v376) : "w"(v152), "w"(v372)); - svfloat32_t v716 = svcmla_f32_x(pred_full, v696, v1722, v508, 90); - svfloat32_t v717 = svcmla_f32_x(pred_full, v708, v1638, v643, 90); - svfloat32_t zero797; - asm volatile("mov %0.s, #0" : "=w"(zero797)); - svfloat32_t v797 = svcmla_f32_x(pred_full, zero797, v1805, v790, 90); - svfloat32_t v798; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v798) : "w"(v334), "w"(v789)); - svfloat32_t v799; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v799) : "w"(v334), "w"(v789)); - svfloat32_t v838; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v838) : "w"(v510), "w"(v1637)); - svfloat32_t v850; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v850) : "w"(v645), "w"(v1639)); - svfloat32_t zero939; - asm volatile("mov %0.s, #0" : "=w"(zero939)); - svfloat32_t v939 = svcmla_f32_x(pred_full, zero939, v1805, v932, 90); - svfloat32_t v940; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v940) : "w"(v258), "w"(v931)); - svfloat32_t v941; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v941) : "w"(v258), "w"(v931)); - svfloat32_t v1000 = svcmla_f32_x(pred_full, v980, v1720, v509, 90); - svfloat32_t v1001 = svcmla_f32_x(pred_full, v992, v1722, v644, 90); - svfloat32_t zero1081; - asm volatile("mov %0.s, #0" : "=w"(zero1081)); - svfloat32_t v1081 = svcmla_f32_x(pred_full, zero1081, v1805, v1074, 90); - 
svfloat32_t v1122; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1122) : "w"(v511), "w"(v1801)); - svfloat32_t v1134; - asm("fmul %0.s, %1.s, %2.s" : "=w"(v1134) : "w"(v646), "w"(v1803)); - svst1w_u64(pred_full, (unsigned *)(v1534), svreinterpret_u64_s16(v670)); - svst1w_u64(pred_full, (unsigned *)(v1552), svreinterpret_u64_s16(v686)); - svfloat32_t v718; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v718) : "w"(v716), "w"(v717)); - svfloat32_t v719; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v719) : "w"(v717), "w"(v716)); - svfloat32_t v800; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v800) : "w"(v335), "w"(v797)); - svfloat32_t v801; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v801) : "w"(v335), "w"(v797)); - svint16_t v804 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v798, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v820 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v799, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v858 = svcmla_f32_x(pred_full, v838, v1638, v510, 90); - svfloat32_t v859 = svcmla_f32_x(pred_full, v850, v1802, v645, 90); - svfloat32_t v942; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v942) : "w"(v259), "w"(v939)); - svfloat32_t v943; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v943) : "w"(v259), "w"(v939)); - svint16_t v946 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v940, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v962 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v941, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v1002; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1002) : "w"(v1000), 
"w"(v1001)); - svfloat32_t v1003; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1003) : "w"(v1001), "w"(v1000)); - svfloat32_t v1082; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1082) : "w"(v336), "w"(v1073)); - svfloat32_t v1083; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1083) : "w"(v336), "w"(v1073)); - svfloat32_t v1084; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1084) : "w"(v337), "w"(v1081)); - svfloat32_t v1085; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1085) : "w"(v337), "w"(v1081)); - svfloat32_t v1142 = svcmla_f32_x(pred_full, v1122, v1802, v511, 90); - svfloat32_t v1143 = svcmla_f32_x(pred_full, v1134, v1804, v646, 90); - svfloat32_t zero726; - asm volatile("mov %0.s, #0" : "=w"(zero726)); - svfloat32_t v726 = svcmla_f32_x(pred_full, zero726, v1805, v719, 90); - svfloat32_t v727; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v727) : "w"(v295), "w"(v718)); - svfloat32_t v728; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v728) : "w"(v295), "w"(v718)); - svint16_t v812 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v800, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v828 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v801, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v860; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v860) : "w"(v858), "w"(v859)); - svfloat32_t v861; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v861) : "w"(v859), "w"(v858)); - svint16_t v954 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v942, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v970 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v943, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); - svfloat32_t zero1010; - asm volatile("mov %0.s, #0" : "=w"(zero1010)); - svfloat32_t v1010 = svcmla_f32_x(pred_full, zero1010, v1805, v1003, 90); - svfloat32_t v1011; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1011) : "w"(v297), "w"(v1002)); - svfloat32_t v1012; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1012) : "w"(v297), "w"(v1002)); - svint16_t v1088 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1082, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1096 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1084, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1104 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1083, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1112 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1085, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1144; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1144) : "w"(v1142), "w"(v1143)); - svfloat32_t v1145; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1145) : "w"(v1143), "w"(v1142)); - svst1w_u64(pred_full, (unsigned *)(v1607), svreinterpret_u64_s16(v804)); - svst1w_u64(pred_full, (unsigned *)(v1625), svreinterpret_u64_s16(v820)); - svst1w_u64(pred_full, (unsigned *)(v1689), svreinterpret_u64_s16(v946)); - svst1w_u64(pred_full, (unsigned *)(v1707), svreinterpret_u64_s16(v962)); - svfloat32_t v729; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v729) : "w"(v296), "w"(v726)); - svfloat32_t v730; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v730) : "w"(v296), "w"(v726)); - svint16_t v733 = svtbl_s16( - 
svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v727, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v749 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v728, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t zero868; - asm volatile("mov %0.s, #0" : "=w"(zero868)); - svfloat32_t v868 = svcmla_f32_x(pred_full, zero868, v1805, v861, 90); - svfloat32_t v869; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v869) : "w"(v373), "w"(v860)); - svfloat32_t v870; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v870) : "w"(v373), "w"(v860)); - svfloat32_t v1013; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1013) : "w"(v298), "w"(v1010)); - svfloat32_t v1014; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1014) : "w"(v298), "w"(v1010)); - svint16_t v1017 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1011, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1033 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1012, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t zero1152; - asm volatile("mov %0.s, #0" : "=w"(zero1152)); - svfloat32_t v1152 = svcmla_f32_x(pred_full, zero1152, v1805, v1145, 90); - svfloat32_t v1153; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1153) : "w"(v375), "w"(v1144)); - svfloat32_t v1154; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1154) : "w"(v375), "w"(v1144)); - svst1w_u64(pred_full, (unsigned *)(v1616), svreinterpret_u64_s16(v812)); - svst1w_u64(pred_full, (unsigned *)(v1634), svreinterpret_u64_s16(v828)); - svst1w_u64(pred_full, (unsigned *)(v1698), svreinterpret_u64_s16(v954)); - svst1w_u64(pred_full, (unsigned *)(v1716), 
svreinterpret_u64_s16(v970)); - svst1w_u64(pred_full, (unsigned *)(v1771), svreinterpret_u64_s16(v1088)); - svst1w_u64(pred_full, (unsigned *)(v1780), svreinterpret_u64_s16(v1096)); - svst1w_u64(pred_full, (unsigned *)(v1789), svreinterpret_u64_s16(v1104)); - svst1w_u64(pred_full, (unsigned *)(v1798), svreinterpret_u64_s16(v1112)); - svint16_t v741 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v729, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v757 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v730, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svfloat32_t v871; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v871) : "w"(v374), "w"(v868)); - svfloat32_t v872; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v872) : "w"(v374), "w"(v868)); - svint16_t v875 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v869, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v891 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v870, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v1025 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1013, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1041 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1014, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svfloat32_t v1155; - asm("fsub %0.s, %1.s, %2.s" : "=w"(v1155) : "w"(v376), "w"(v1152)); - svfloat32_t 
v1156; - asm("fadd %0.s, %1.s, %2.s" : "=w"(v1156) : "w"(v376), "w"(v1152)); - svint16_t v1159 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1153, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1175 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1154, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v1566), svreinterpret_u64_s16(v733)); - svst1w_u64(pred_full, (unsigned *)(v1584), svreinterpret_u64_s16(v749)); - svst1w_u64(pred_full, (unsigned *)(v1730), svreinterpret_u64_s16(v1017)); - svst1w_u64(pred_full, (unsigned *)(v1748), svreinterpret_u64_s16(v1033)); - svint16_t v883 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v871, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v899 = svtbl_s16( - svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, svmul_n_f32_x(pred_full, v872, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64( - svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); - svint16_t v1167 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1155, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svint16_t v1183 = - svtbl_s16(svreinterpret_s16_s32(svcvt_s32_f32_x( - pred_full, - svmul_n_f32_x(pred_full, v1156, (float)(1ULL << 31ULL)))), - svreinterpret_u16_u64(svindex_u64(0xffffffff00030001ULL, - 0x0000000000040004ULL))); - svst1w_u64(pred_full, (unsigned *)(v1575), svreinterpret_u64_s16(v741)); - svst1w_u64(pred_full, (unsigned *)(v1593), svreinterpret_u64_s16(v757)); - svst1w_u64(pred_full, (unsigned *)(v1648), svreinterpret_u64_s16(v875)); - 
svst1w_u64(pred_full, (unsigned *)(v1666), svreinterpret_u64_s16(v891)); - svst1w_u64(pred_full, (unsigned *)(v1739), svreinterpret_u64_s16(v1025)); - svst1w_u64(pred_full, (unsigned *)(v1757), svreinterpret_u64_s16(v1041)); - svst1w_u64(pred_full, (unsigned *)(v1812), svreinterpret_u64_s16(v1159)); - svst1w_u64(pred_full, (unsigned *)(v1830), svreinterpret_u64_s16(v1175)); - svst1w_u64(pred_full, (unsigned *)(v1657), svreinterpret_u64_s16(v883)); - svst1w_u64(pred_full, (unsigned *)(v1675), svreinterpret_u64_s16(v899)); - svst1w_u64(pred_full, (unsigned *)(v1821), svreinterpret_u64_s16(v1167)); - svst1w_u64(pred_full, (unsigned *)(v1839), svreinterpret_u64_s16(v1183)); - v5 += v11; - v6 += v12; - } -} -#endif diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h deleted file mode 100644 index 756cff9a8840fcb83bfe732f088fa2ff850470c2..0000000000000000000000000000000000000000 --- a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates -*/ -#pragma once - -#include "armral.h" -#include "fft_helper.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void(cs16_cf32_cs16_ac_n_uu_fft_t)(const armral_cmplx_int16_t *x, - armral_cmplx_int16_t *y, int istride, - int ostride, int howmany, float dir); - -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu2; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu3; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu4; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu5; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu6; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu7; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu8; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu9; -cs16_cf32_cs16_ac_n_uu_fft_t 
armral_fft_cs16_cf32_cs16_ac_n_uu10; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu11; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu12; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu13; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu14; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu15; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu16; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu17; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu18; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu19; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu20; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu21; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu22; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu24; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu25; -cs16_cf32_cs16_ac_n_uu_fft_t armral_fft_cs16_cf32_cs16_ac_n_uu32; - -#ifdef __cplusplus -} // extern "C" -#endif \ No newline at end of file diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.c b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.c new file mode 100644 index 0000000000000000000000000000000000000000..23f9836ba55987e8f92a10730f9bb61a862f65fb --- /dev/null +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.c @@ -0,0 +1,14222 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "fft_cs16_cf32_cs16_ac_n_uun.h" + +#include +#ifdef ARMRAL_ARCH_SVE +#include +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun2(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + int16x4_t v13 = vld1s_s16(&v5[0]); + 
int16x4_t v19 = vld1s_s16(&v5[istride]); + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + int16x4_t v33 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v21, 15), (int32x2_t){0, 0})); + int16x4_t v39 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v22, 15), (int32x2_t){0, 0})); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v33), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v39), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun2(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + const int32_t *v74 = &v5[v0]; + int32_t *v95 = &v6[v2]; + const int32_t *v65 = &v5[0]; + int32_t *v86 = &v6[0]; + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v74[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v65[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svint16_t v44 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v30, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v52 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v31, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v86), 
svreinterpret_u64_s16(v44)); + svst1w_u64(pred_full, (unsigned *)(v95), svreinterpret_u64_s16(v52)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun3(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v35 = -1.4999999999999998e+00F; + float v38 = 8.6602540378443871e-01F; + float v39 = -8.6602540378443871e-01F; + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v27 = vld1s_s16(&v5[0]); + float32x2_t v36 = (float32x2_t){v35, v35}; + float32x2_t v40 = (float32x2_t){v38, v39}; + float32x2_t v41 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 2]); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v42 = vmul_f32(v41, v40); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v29 = vadd_f32(v21, v28); + float32x2_t v37 = vmul_f32(v21, v36); + float32x2_t v43 = vrev64_f32(v22); + float32x2_t v44 = vmul_f32(v43, v42); + float32x2_t v45 = vadd_f32(v29, v37); + int16x4_t v50 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v29, 15), (int32x2_t){0, 0})); + float32x2_t v46 = vadd_f32(v45, v44); + float32x2_t v47 = vsub_f32(v45, v44); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v50), 0); + int16x4_t v56 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v47, 15), (int32x2_t){0, 0})); + int16x4_t v62 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v46, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v56), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v62), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun3(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + 
int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v47 = -1.4999999999999998e+00F; + float v52 = -8.6602540378443871e-01F; + const int32_t *v91 = &v5[v0]; + int32_t *v132 = &v6[v2]; + int64_t v23 = v0 * 2; + float v55 = v4 * v52; + int64_t v78 = v2 * 2; + const int32_t *v110 = &v5[0]; + svfloat32_t v114 = svdup_n_f32(v47); + int32_t *v123 = &v6[0]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v91[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v100 = &v5[v23]; + svfloat32_t v115 = svdup_n_f32(v55); + int32_t *v141 = &v6[v78]; + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v110[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v100[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v40 = svadd_f32_x(svptrue_b32(), v30, v39); + svfloat32_t zero57 = svdup_n_f32(0); + svfloat32_t v57 = svcmla_f32_x(pred_full, zero57, v115, v31, 90); + svfloat32_t v58 = svmla_f32_x(pred_full, v40, v30, v114); + svint16_t v63 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v40, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v59 = svadd_f32_x(svptrue_b32(), v58, v57); + svfloat32_t v60 = svsub_f32_x(svptrue_b32(), v58, v57); + svst1w_u64(pred_full, (unsigned *)(v123), svreinterpret_u64_s16(v63)); + svint16_t v71 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v60, 
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v79 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v59, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v132), svreinterpret_u64_s16(v71)); + svst1w_u64(pred_full, (unsigned *)(v141), svreinterpret_u64_s16(v79)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun4(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v51 = 1.0000000000000000e+00F; + float v52 = -1.0000000000000000e+00F; + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v27 = vld1s_s16(&v5[istride]); + float32x2_t v53 = (float32x2_t){v51, v52}; + float32x2_t v54 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 2]); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + int16x4_t v33 = vld1s_s16(&v5[istride * 3]); + float32x2_t v55 = vmul_f32(v54, v53); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v37 = vadd_f32(v21, v35); + float32x2_t v38 = vsub_f32(v21, v35); + float32x2_t v56 = vrev64_f32(v36); + float32x2_t v57 = vmul_f32(v56, v55); + int16x4_t v62 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v37, 15), (int32x2_t){0, 0})); + int16x4_t v74 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v38, 15), (int32x2_t){0, 0})); + float32x2_t v58 = vadd_f32(v22, v57); + 
float32x2_t v59 = vsub_f32(v22, v57); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v62), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v74), 0); + int16x4_t v68 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v59, 15), (int32x2_t){0, 0})); + int16x4_t v80 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v58, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v68), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v80), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun4(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v68 = -1.0000000000000000e+00F; + const int32_t *v133 = &v5[v0]; + int32_t *v165 = &v6[v2]; + int64_t v23 = v0 * 2; + int64_t v41 = v0 * 3; + float v71 = v4 * v68; + int64_t v93 = v2 * 2; + int64_t v101 = v2 * 3; + const int32_t *v115 = &v5[0]; + int32_t *v156 = &v6[0]; + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v133[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v124 = &v5[v23]; + const int32_t *v142 = &v5[v41]; + svfloat32_t v148 = svdup_n_f32(v71); + int32_t *v174 = &v6[v93]; + int32_t *v183 = &v6[v101]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v115[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v124[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v142[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + 
svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t zero73 = svdup_n_f32(0); + svfloat32_t v73 = svcmla_f32_x(pred_full, zero73, v148, v49, 90); + svfloat32_t v74 = svadd_f32_x(svptrue_b32(), v31, v73); + svfloat32_t v75 = svsub_f32_x(svptrue_b32(), v31, v73); + svint16_t v78 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v50, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v94 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v51, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v86 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v75, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v102 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v74, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v156), svreinterpret_u64_s16(v78)); + svst1w_u64(pred_full, (unsigned *)(v174), svreinterpret_u64_s16(v94)); + svst1w_u64(pred_full, (unsigned *)(v165), svreinterpret_u64_s16(v86)); + svst1w_u64(pred_full, (unsigned *)(v183), svreinterpret_u64_s16(v102)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun5(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t 
*v6 = (int32_t *)y; + float v52 = -1.2500000000000000e+00F; + float v56 = 5.5901699437494745e-01F; + float v59 = 1.5388417685876268e+00F; + float v60 = -1.5388417685876268e+00F; + float v66 = 5.8778525229247325e-01F; + float v67 = -5.8778525229247325e-01F; + float v73 = 3.6327126400268028e-01F; + float v74 = -3.6327126400268028e-01F; + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v44 = vld1s_s16(&v5[0]); + float32x2_t v53 = (float32x2_t){v52, v52}; + float32x2_t v57 = (float32x2_t){v56, v56}; + float32x2_t v61 = (float32x2_t){v59, v60}; + float32x2_t v68 = (float32x2_t){v66, v67}; + float32x2_t v75 = (float32x2_t){v73, v74}; + float32x2_t v76 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 4]); + int16x4_t v27 = vld1s_s16(&v5[istride * 3]); + int16x4_t v33 = vld1s_s16(&v5[istride * 2]); + float32x2_t v45 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v44)), 15); + float32x2_t v63 = vmul_f32(v76, v61); + float32x2_t v70 = vmul_f32(v76, v68); + float32x2_t v77 = vmul_f32(v76, v75); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v37 = vadd_f32(v21, v35); + float32x2_t v38 = vsub_f32(v21, v35); + float32x2_t v39 = vadd_f32(v22, v36); + float32x2_t v64 = vrev64_f32(v22); + float32x2_t v78 = vrev64_f32(v36); + float32x2_t v46 = vadd_f32(v37, v45); + float32x2_t v54 = vmul_f32(v37, v53); + float32x2_t v58 = vmul_f32(v38, v57); + float32x2_t v65 = vmul_f32(v64, v63); + float32x2_t v71 = vrev64_f32(v39); + float32x2_t v79 = vmul_f32(v78, v77); + float32x2_t v72 = vmul_f32(v71, v70); + float32x2_t v80 = vadd_f32(v46, v54); + int16x4_t v91 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v46, 15), (int32x2_t){0, 0})); + float32x2_t v81 = vadd_f32(v80, v58); + float32x2_t v82 = vsub_f32(v80, v58); + float32x2_t v83 = vsub_f32(v65, v72); + float32x2_t v84 = vadd_f32(v72, v79); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v91), 0); + float32x2_t v85 = vadd_f32(v81, v83); + float32x2_t v86 = vsub_f32(v81, v83); + float32x2_t v87 = vadd_f32(v82, v84); + float32x2_t v88 = vsub_f32(v82, v84); + int16x4_t v97 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v86, 15), (int32x2_t){0, 0})); + int16x4_t v103 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v88, 15), (int32x2_t){0, 0})); + int16x4_t v109 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v87, 15), (int32x2_t){0, 0})); + int16x4_t v115 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v85, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v97), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v103), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v109), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v115), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun5(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v68 = -1.2500000000000000e+00F; + float v73 = 5.5901699437494745e-01F; + float v78 = -1.5388417685876268e+00F; + float v85 = -5.8778525229247325e-01F; + float v92 = -3.6327126400268028e-01F; + const int32_t *v153 = &v5[v0]; + int32_t *v215 = &v6[v2]; + int64_t v23 = v0 * 4; + int64_t v33 = v0 * 3; + int64_t v41 = v0 * 2; + float v81 = v4 * v78; + float v88 = v4 * v85; + float v95 = v4 * v92; + int64_t v124 = v2 * 2; + int64_t v132 = v2 * 3; + int64_t v140 = v2 * 4; + const int32_t *v190 = &v5[0]; + svfloat32_t v194 = 
svdup_n_f32(v68); + svfloat32_t v195 = svdup_n_f32(v73); + int32_t *v206 = &v6[0]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v153[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v162 = &v5[v23]; + const int32_t *v171 = &v5[v33]; + const int32_t *v180 = &v5[v41]; + svfloat32_t v196 = svdup_n_f32(v81); + svfloat32_t v197 = svdup_n_f32(v88); + svfloat32_t v198 = svdup_n_f32(v95); + int32_t *v224 = &v6[v124]; + int32_t *v233 = &v6[v132]; + int32_t *v242 = &v6[v140]; + svfloat32_t v60 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v190[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v162[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v171[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v180[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v52 = svadd_f32_x(svptrue_b32(), v31, v49); + svfloat32_t zero83 = svdup_n_f32(0); + svfloat32_t v83 = svcmla_f32_x(pred_full, zero83, v196, v31, 90); + svfloat32_t v61 = svadd_f32_x(svptrue_b32(), v50, v60); + svfloat32_t zero90 = svdup_n_f32(0); + svfloat32_t v90 = svcmla_f32_x(pred_full, zero90, v197, v52, 90); + svfloat32_t v98 = svmla_f32_x(pred_full, v61, v50, v194); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v83, v90); + svfloat32_t v102 = 
svcmla_f32_x(pred_full, v90, v198, v49, 90); + svint16_t v109 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v61, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v99 = svmla_f32_x(pred_full, v98, v51, v195); + svfloat32_t v100 = svmls_f32_x(pred_full, v98, v51, v195); + svst1w_u64(pred_full, (unsigned *)(v206), svreinterpret_u64_s16(v109)); + svfloat32_t v103 = svadd_f32_x(svptrue_b32(), v99, v101); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v99, v101); + svfloat32_t v105 = svadd_f32_x(svptrue_b32(), v100, v102); + svfloat32_t v106 = svsub_f32_x(svptrue_b32(), v100, v102); + svint16_t v117 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v104, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v125 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v106, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v133 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v105, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v141 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v103, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v215), svreinterpret_u64_s16(v117)); + svst1w_u64(pred_full, (unsigned *)(v224), svreinterpret_u64_s16(v125)); + svst1w_u64(pred_full, (unsigned *)(v233), svreinterpret_u64_s16(v133)); + svst1w_u64(pred_full, (unsigned *)(v242), svreinterpret_u64_s16(v141)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void 
armral_fft_cs16_cf32_cs16_ac_n_uun6(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v80 = -1.4999999999999998e+00F; + float v83 = 8.6602540378443871e-01F; + float v84 = -8.6602540378443871e-01F; + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v47 = vld1s_s16(&v5[istride]); + float32x2_t v81 = (float32x2_t){v80, v80}; + float32x2_t v85 = (float32x2_t){v83, v84}; + float32x2_t v86 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 3]); + int16x4_t v27 = vld1s_s16(&v5[istride * 2]); + int16x4_t v33 = vld1s_s16(&v5[istride * 5]); + int16x4_t v41 = vld1s_s16(&v5[istride * 4]); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v87 = vmul_f32(v86, v85); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v51 = vadd_f32(v35, v49); + float32x2_t v52 = vsub_f32(v35, v49); + float32x2_t v72 = vadd_f32(v36, v50); + float32x2_t v73 = vsub_f32(v36, v50); + float32x2_t v53 = vadd_f32(v51, v21); + float32x2_t v61 = vmul_f32(v51, v81); + float32x2_t v67 = vrev64_f32(v52); + float32x2_t v74 = vadd_f32(v72, v22); + float32x2_t v82 = vmul_f32(v72, v81); + float32x2_t v88 = vrev64_f32(v73); + float32x2_t v68 = vmul_f32(v67, v87); + float32x2_t v69 = vadd_f32(v53, v61); + float32x2_t v89 = vmul_f32(v88, v87); + 
float32x2_t v90 = vadd_f32(v74, v82); + int16x4_t v95 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v53, 15), (int32x2_t){0, 0})); + int16x4_t v101 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v74, 15), (int32x2_t){0, 0})); + float32x2_t v70 = vadd_f32(v69, v68); + float32x2_t v71 = vsub_f32(v69, v68); + float32x2_t v91 = vadd_f32(v90, v89); + float32x2_t v92 = vsub_f32(v90, v89); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v95), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v101), 0); + int16x4_t v107 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v71, 15), (int32x2_t){0, 0})); + int16x4_t v113 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v92, 15), (int32x2_t){0, 0})); + int16x4_t v119 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v70, 15), (int32x2_t){0, 0})); + int16x4_t v125 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v91, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v107), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v113), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v119), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v125), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun6(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v100 = -1.4999999999999998e+00F; + float v105 = -8.6602540378443871e-01F; + const int32_t *v214 = &v5[v0]; + int32_t *v257 = &v6[v2]; + int64_t v23 = v0 * 3; + int64_t v33 = v0 * 2; + int64_t v41 = v0 * 5; + int64_t v51 = v0 * 4; + float v108 = v4 * v105; + int64_t v123 = v2 * 3; + int64_t v131 = v2 * 4; + int64_t v147 = v2 * 2; + int64_t v155 = v2 * 5; + const int32_t *v169 = &v5[0]; + svfloat32_t v221 = svdup_n_f32(v100); + int32_t *v230 = &v6[0]; + svfloat32_t 
v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v214[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v178 = &v5[v23]; + const int32_t *v187 = &v5[v33]; + const int32_t *v196 = &v5[v41]; + const int32_t *v205 = &v5[v51]; + svfloat32_t v222 = svdup_n_f32(v108); + int32_t *v239 = &v6[v123]; + int32_t *v248 = &v6[v131]; + int32_t *v266 = &v6[v147]; + int32_t *v275 = &v6[v155]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v169[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v178[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v187[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v196[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v205[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v48, v66); + svfloat32_t v69 = svsub_f32_x(svptrue_b32(), v48, v66); + svfloat32_t v91 = svadd_f32_x(svptrue_b32(), v49, v67); + svfloat32_t v92 = svsub_f32_x(svptrue_b32(), v49, v67); + svfloat32_t v70 = svadd_f32_x(svptrue_b32(), v68, v30); + svfloat32_t zero87 = svdup_n_f32(0); + svfloat32_t v87 = svcmla_f32_x(pred_full, zero87, v222, v69, 90); + svfloat32_t v93 = 
svadd_f32_x(svptrue_b32(), v91, v31); + svfloat32_t zero110 = svdup_n_f32(0); + svfloat32_t v110 = svcmla_f32_x(pred_full, zero110, v222, v92, 90); + svfloat32_t v88 = svmla_f32_x(pred_full, v70, v68, v221); + svfloat32_t v111 = svmla_f32_x(pred_full, v93, v91, v221); + svint16_t v116 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v70, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v124 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v93, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v89 = svadd_f32_x(svptrue_b32(), v88, v87); + svfloat32_t v90 = svsub_f32_x(svptrue_b32(), v88, v87); + svfloat32_t v112 = svadd_f32_x(svptrue_b32(), v111, v110); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v111, v110); + svst1w_u64(pred_full, (unsigned *)(v230), svreinterpret_u64_s16(v116)); + svst1w_u64(pred_full, (unsigned *)(v239), svreinterpret_u64_s16(v124)); + svint16_t v132 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v90, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v140 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v113, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v148 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v89, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v156 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v112, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v248), svreinterpret_u64_s16(v132)); + svst1w_u64(pred_full, (unsigned *)(v257), svreinterpret_u64_s16(v140)); + svst1w_u64(pred_full, (unsigned *)(v266), svreinterpret_u64_s16(v148)); + svst1w_u64(pred_full, (unsigned *)(v275), svreinterpret_u64_s16(v156)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun7(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v73 = -1.1666666666666665e+00F; + float v77 = 7.9015646852540022e-01F; + float v81 = 5.5854267289647742e-02F; + float v85 = 7.3430220123575241e-01F; + float v88 = 4.4095855184409838e-01F; + float v89 = -4.4095855184409838e-01F; + float v95 = 3.4087293062393137e-01F; + float v96 = -3.4087293062393137e-01F; + float v102 = -5.3396936033772524e-01F; + float v103 = 5.3396936033772524e-01F; + float v109 = 8.7484229096165667e-01F; + float v110 = -8.7484229096165667e-01F; + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v57 = vld1s_s16(&v5[0]); + float32x2_t v74 = (float32x2_t){v73, v73}; + float32x2_t v78 = (float32x2_t){v77, v77}; + float32x2_t v82 = (float32x2_t){v81, v81}; + float32x2_t v86 = (float32x2_t){v85, v85}; + float32x2_t v90 = (float32x2_t){v88, v89}; + float32x2_t v97 = (float32x2_t){v95, v96}; + float32x2_t v104 = (float32x2_t){v102, v103}; + float32x2_t v111 = (float32x2_t){v109, v110}; + float32x2_t v112 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 6]); + int16x4_t v27 = vld1s_s16(&v5[istride * 4]); + int16x4_t v33 = vld1s_s16(&v5[istride * 3]); + int16x4_t v41 = vld1s_s16(&v5[istride * 2]); + int16x4_t v47 = vld1s_s16(&v5[istride * 5]); + float32x2_t v58 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v57)), 
15); + float32x2_t v92 = vmul_f32(v112, v90); + float32x2_t v99 = vmul_f32(v112, v97); + float32x2_t v106 = vmul_f32(v112, v104); + float32x2_t v113 = vmul_f32(v112, v111); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v51 = vadd_f32(v21, v35); + float32x2_t v60 = vsub_f32(v21, v35); + float32x2_t v61 = vsub_f32(v35, v49); + float32x2_t v62 = vsub_f32(v49, v21); + float32x2_t v63 = vadd_f32(v22, v36); + float32x2_t v65 = vsub_f32(v22, v36); + float32x2_t v66 = vsub_f32(v36, v50); + float32x2_t v67 = vsub_f32(v50, v22); + float32x2_t v52 = vadd_f32(v51, v49); + float32x2_t v64 = vadd_f32(v63, v50); + float32x2_t v79 = vmul_f32(v60, v78); + float32x2_t v83 = vmul_f32(v61, v82); + float32x2_t v87 = vmul_f32(v62, v86); + float32x2_t v100 = vrev64_f32(v65); + float32x2_t v107 = vrev64_f32(v66); + float32x2_t v114 = vrev64_f32(v67); + float32x2_t v59 = vadd_f32(v52, v58); + float32x2_t v75 = vmul_f32(v52, v74); + float32x2_t v93 = vrev64_f32(v64); + float32x2_t v101 = vmul_f32(v100, v99); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v115 = vmul_f32(v114, v113); + float32x2_t v94 = vmul_f32(v93, v92); + float32x2_t v116 = vadd_f32(v59, v75); + int16x4_t v137 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v59, 15), (int32x2_t){0, 0})); + float32x2_t v117 = vadd_f32(v116, v79); + float32x2_t v119 = vsub_f32(v116, v79); + float32x2_t v121 = vsub_f32(v116, v83); + float32x2_t v123 = vadd_f32(v94, v101); + float32x2_t 
v125 = vsub_f32(v94, v101); + float32x2_t v127 = vsub_f32(v94, v108); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v137), 0); + float32x2_t v118 = vadd_f32(v117, v83); + float32x2_t v120 = vsub_f32(v119, v87); + float32x2_t v122 = vadd_f32(v121, v87); + float32x2_t v124 = vadd_f32(v123, v108); + float32x2_t v126 = vsub_f32(v125, v115); + float32x2_t v128 = vadd_f32(v127, v115); + float32x2_t v129 = vadd_f32(v118, v124); + float32x2_t v130 = vsub_f32(v118, v124); + float32x2_t v131 = vadd_f32(v120, v126); + float32x2_t v132 = vsub_f32(v120, v126); + float32x2_t v133 = vadd_f32(v122, v128); + float32x2_t v134 = vsub_f32(v122, v128); + int16x4_t v143 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v130, 15), (int32x2_t){0, 0})); + int16x4_t v149 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v132, 15), (int32x2_t){0, 0})); + int16x4_t v155 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v133, 15), (int32x2_t){0, 0})); + int16x4_t v161 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v134, 15), (int32x2_t){0, 0})); + int16x4_t v167 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v131, 15), (int32x2_t){0, 0})); + int16x4_t v173 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v129, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v143), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v149), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v155), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v161), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v167), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v173), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun7(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v93 = 
-1.1666666666666665e+00F; + float v98 = 7.9015646852540022e-01F; + float v103 = 5.5854267289647742e-02F; + float v108 = 7.3430220123575241e-01F; + float v113 = -4.4095855184409838e-01F; + float v120 = -3.4087293062393137e-01F; + float v127 = 5.3396936033772524e-01F; + float v134 = -8.7484229096165667e-01F; + const int32_t *v221 = &v5[v0]; + int32_t *v304 = &v6[v2]; + int64_t v23 = v0 * 6; + int64_t v33 = v0 * 4; + int64_t v41 = v0 * 3; + int64_t v51 = v0 * 2; + int64_t v59 = v0 * 5; + float v116 = v4 * v113; + float v123 = v4 * v120; + float v130 = v4 * v127; + float v137 = v4 * v134; + int64_t v176 = v2 * 2; + int64_t v184 = v2 * 3; + int64_t v192 = v2 * 4; + int64_t v200 = v2 * 5; + int64_t v208 = v2 * 6; + const int32_t *v276 = &v5[0]; + svfloat32_t v280 = svdup_n_f32(v93); + svfloat32_t v281 = svdup_n_f32(v98); + svfloat32_t v282 = svdup_n_f32(v103); + svfloat32_t v283 = svdup_n_f32(v108); + int32_t *v295 = &v6[0]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v221[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v230 = &v5[v23]; + const int32_t *v239 = &v5[v33]; + const int32_t *v248 = &v5[v41]; + const int32_t *v257 = &v5[v51]; + const int32_t *v266 = &v5[v59]; + svfloat32_t v284 = svdup_n_f32(v116); + svfloat32_t v285 = svdup_n_f32(v123); + svfloat32_t v286 = svdup_n_f32(v130); + svfloat32_t v287 = svdup_n_f32(v137); + int32_t *v313 = &v6[v176]; + int32_t *v322 = &v6[v184]; + int32_t *v331 = &v6[v192]; + int32_t *v340 = &v6[v200]; + int32_t *v349 = &v6[v208]; + svfloat32_t v77 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v276[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v230[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t 
*)&v239[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v248[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v257[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v266[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v79 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v80 = svsub_f32_x(svptrue_b32(), v48, v66); + svfloat32_t v81 = svsub_f32_x(svptrue_b32(), v66, v30); + svfloat32_t v82 = svadd_f32_x(svptrue_b32(), v31, v49); + svfloat32_t v84 = svsub_f32_x(svptrue_b32(), v31, v49); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v49, v67); + svfloat32_t v86 = svsub_f32_x(svptrue_b32(), v67, v31); + svfloat32_t v69 = svadd_f32_x(svptrue_b32(), v68, v66); + svfloat32_t v83 = svadd_f32_x(svptrue_b32(), v82, v67); + svfloat32_t zero125 = svdup_n_f32(0); + svfloat32_t v125 = svcmla_f32_x(pred_full, zero125, v285, v84, 90); + svfloat32_t zero132 = svdup_n_f32(0); + svfloat32_t v132 = svcmla_f32_x(pred_full, zero132, v286, v85, 90); + svfloat32_t zero139 = svdup_n_f32(0); + svfloat32_t v139 = svcmla_f32_x(pred_full, zero139, v287, v86, 90); + svfloat32_t v78 = svadd_f32_x(svptrue_b32(), v69, v77); + svfloat32_t zero118 = svdup_n_f32(0); + svfloat32_t v118 = svcmla_f32_x(pred_full, zero118, v284, v83, 90); + svfloat32_t v140 = svmla_f32_x(pred_full, v78, v69, v280); + 
svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v118, v125); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v118, v125); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v118, v132); + svint16_t v161 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v78, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v141 = svmla_f32_x(pred_full, v140, v79, v281); + svfloat32_t v143 = svmls_f32_x(pred_full, v140, v79, v281); + svfloat32_t v145 = svmls_f32_x(pred_full, v140, v80, v282); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v147, v132); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v149, v139); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v151, v139); + svst1w_u64(pred_full, (unsigned *)(v295), svreinterpret_u64_s16(v161)); + svfloat32_t v142 = svmla_f32_x(pred_full, v141, v80, v282); + svfloat32_t v144 = svmls_f32_x(pred_full, v143, v81, v283); + svfloat32_t v146 = svmla_f32_x(pred_full, v145, v81, v283); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v142, v148); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v142, v148); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v144, v150); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v144, v150); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v146, v152); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v146, v152); + svint16_t v169 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v154, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v177 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v156, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v185 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v157, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v193 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v158, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v201 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v155, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v209 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v153, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v304), svreinterpret_u64_s16(v169)); + svst1w_u64(pred_full, (unsigned *)(v313), svreinterpret_u64_s16(v177)); + svst1w_u64(pred_full, (unsigned *)(v322), svreinterpret_u64_s16(v185)); + svst1w_u64(pred_full, (unsigned *)(v331), svreinterpret_u64_s16(v193)); + svst1w_u64(pred_full, (unsigned *)(v340), svreinterpret_u64_s16(v201)); + svst1w_u64(pred_full, (unsigned *)(v349), svreinterpret_u64_s16(v209)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun8(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v96 = 1.0000000000000000e+00F; + float v97 = -1.0000000000000000e+00F; + float v104 = -7.0710678118654746e-01F; + float v111 = 7.0710678118654757e-01F; + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v41 = vld1s_s16(&v5[istride]); + float32x2_t v98 = (float32x2_t){v96, v97}; + float32x2_t v105 = (float32x2_t){v111, v104}; + float32x2_t v106 = (float32x2_t){v4, v4}; + float32x2_t v112 = (float32x2_t){v111, v111}; + float32x2_t v14 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 4]); + int16x4_t v27 = vld1s_s16(&v5[istride * 2]); + int16x4_t v33 = vld1s_s16(&v5[istride * 6]); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + int16x4_t v47 = vld1s_s16(&v5[istride * 5]); + int16x4_t v55 = vld1s_s16(&v5[istride * 3]); + int16x4_t v61 = vld1s_s16(&v5[istride * 7]); + float32x2_t v100 = vmul_f32(v106, v98); + float32x2_t v107 = vmul_f32(v106, v105); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v65 = vadd_f32(v21, v35); + float32x2_t v66 = vsub_f32(v21, v35); + float32x2_t v67 = vadd_f32(v49, v63); + float32x2_t v68 = vsub_f32(v49, v63); + float32x2_t v71 = vadd_f32(v50, v64); + float32x2_t v72 = vsub_f32(v50, v64); + float32x2_t v101 = vrev64_f32(v36); + float32x2_t v69 = vadd_f32(v65, v67); + float32x2_t v70 = vsub_f32(v65, v67); + float32x2_t v90 = vrev64_f32(v68); + float32x2_t v102 = vmul_f32(v101, v100); + float32x2_t v108 = vrev64_f32(v71); + float32x2_t v113 = vmul_f32(v72, v112); + float32x2_t v91 = vmul_f32(v90, v100); + float32x2_t v109 = vmul_f32(v108, v107); + float32x2_t v116 = vadd_f32(v22, v113); + float32x2_t v117 = vsub_f32(v22, v113); + int16x4_t v126 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v69, 15), (int32x2_t){0, 
0})); + int16x4_t v150 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v70, 15), (int32x2_t){0, 0})); + float32x2_t v114 = vadd_f32(v66, v91); + float32x2_t v115 = vsub_f32(v66, v91); + float32x2_t v118 = vadd_f32(v102, v109); + float32x2_t v119 = vsub_f32(v102, v109); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v126), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v150), 0); + float32x2_t v120 = vadd_f32(v116, v118); + float32x2_t v121 = vsub_f32(v116, v118); + float32x2_t v122 = vadd_f32(v117, v119); + float32x2_t v123 = vsub_f32(v117, v119); + int16x4_t v138 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v115, 15), (int32x2_t){0, 0})); + int16x4_t v162 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v114, 15), (int32x2_t){0, 0})); + int16x4_t v132 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v121, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v138), 0); + int16x4_t v144 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v122, 15), (int32x2_t){0, 0})); + int16x4_t v156 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v123, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v162), 0); + int16x4_t v168 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v120, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v132), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v144), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v156), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v168), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun8(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v122 = -1.0000000000000000e+00F; + float v129 = -7.0710678118654746e-01F; + float v136 = 
7.0710678118654757e-01F; + const int32_t *v257 = &v5[v0]; + int32_t *v311 = &v6[v2]; + int64_t v23 = v0 * 4; + int64_t v33 = v0 * 2; + int64_t v41 = v0 * 6; + int64_t v59 = v0 * 5; + int64_t v69 = v0 * 3; + int64_t v77 = v0 * 7; + float v125 = v4 * v122; + float v132 = v4 * v129; + int64_t v167 = v2 * 2; + int64_t v175 = v2 * 3; + int64_t v183 = v2 * 4; + int64_t v191 = v2 * 5; + int64_t v199 = v2 * 6; + int64_t v207 = v2 * 7; + const int32_t *v221 = &v5[0]; + svfloat32_t v294 = svdup_n_f32(v136); + int32_t *v302 = &v6[0]; + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v257[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v230 = &v5[v23]; + const int32_t *v239 = &v5[v33]; + const int32_t *v248 = &v5[v41]; + const int32_t *v266 = &v5[v59]; + const int32_t *v275 = &v5[v69]; + const int32_t *v284 = &v5[v77]; + svfloat32_t v292 = svdup_n_f32(v125); + svfloat32_t v293 = svdup_n_f32(v132); + int32_t *v320 = &v6[v167]; + int32_t *v329 = &v6[v175]; + int32_t *v338 = &v6[v183]; + int32_t *v347 = &v6[v191]; + int32_t *v356 = &v6[v199]; + int32_t *v365 = &v6[v207]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v221[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v230[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v239[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v248[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v266[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v275[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v284[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v87 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v66, v84); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v66, v84); + svfloat32_t v92 = svadd_f32_x(svptrue_b32(), v67, v85); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v67, v85); + svfloat32_t zero127 = svdup_n_f32(0); + svfloat32_t v127 = svcmla_f32_x(pred_full, zero127, v292, v49, 90); + svfloat32_t v90 = svadd_f32_x(svptrue_b32(), v86, v88); + svfloat32_t v91 = svsub_f32_x(svptrue_b32(), v86, v88); + svfloat32_t zero115 = svdup_n_f32(0); + svfloat32_t v115 = svcmla_f32_x(pred_full, zero115, v292, v89, 90); + svfloat32_t zero134 = svdup_n_f32(0); + svfloat32_t v134 = svcmla_f32_x(pred_full, zero134, v293, v92, 90); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v87, v115); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v87, v115); + svfloat32_t v142 = svmla_f32_x(pred_full, v31, v93, v294); + svfloat32_t v143 = svmls_f32_x(pred_full, v31, v93, v294); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v127, v134); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v127, v134); + svint16_t v152 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v90, 
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v184 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v91, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v142, v144); + svfloat32_t v147 = svsub_f32_x(svptrue_b32(), v142, v144); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v143, v145); + svint16_t v168 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v141, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v200 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v140, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v302), svreinterpret_u64_s16(v152)); + svst1w_u64(pred_full, (unsigned *)(v338), svreinterpret_u64_s16(v184)); + svint16_t v160 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v147, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v176 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v148, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v192 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v149, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v208 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, 
v146, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v320), svreinterpret_u64_s16(v168)); + svst1w_u64(pred_full, (unsigned *)(v356), svreinterpret_u64_s16(v200)); + svst1w_u64(pred_full, (unsigned *)(v311), svreinterpret_u64_s16(v160)); + svst1w_u64(pred_full, (unsigned *)(v329), svreinterpret_u64_s16(v176)); + svst1w_u64(pred_full, (unsigned *)(v347), svreinterpret_u64_s16(v192)); + svst1w_u64(pred_full, (unsigned *)(v365), svreinterpret_u64_s16(v208)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun9(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v88 = -5.0000000000000000e-01F; + float v99 = -1.4999999999999998e+00F; + float v102 = 8.6602540378443871e-01F; + float v103 = -8.6602540378443871e-01F; + float v110 = 7.6604444311897801e-01F; + float v114 = 9.3969262078590832e-01F; + float v118 = -1.7364817766693039e-01F; + float v121 = 6.4278760968653925e-01F; + float v122 = -6.4278760968653925e-01F; + float v128 = -3.4202014332566888e-01F; + float v129 = 3.4202014332566888e-01F; + float v135 = 9.8480775301220802e-01F; + float v136 = -9.8480775301220802e-01F; + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v72 = vld1s_s16(&v5[0]); + float32x2_t v89 = (float32x2_t){v88, v88}; + float32x2_t v100 = (float32x2_t){v99, v99}; + float32x2_t v104 = (float32x2_t){v102, v103}; + float32x2_t v111 = (float32x2_t){v110, v110}; + float32x2_t v115 = (float32x2_t){v114, v114}; + float32x2_t v119 = (float32x2_t){v118, v118}; + float32x2_t v123 = (float32x2_t){v121, v122}; + float32x2_t v130 = (float32x2_t){v128, v129}; + float32x2_t v137 = (float32x2_t){v135, v136}; + float32x2_t v138 = (float32x2_t){v4, v4}; + float32x2_t v14 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 8]); + int16x4_t v27 = vld1s_s16(&v5[istride * 7]); + int16x4_t v33 = vld1s_s16(&v5[istride * 2]); + int16x4_t v41 = vld1s_s16(&v5[istride * 3]); + int16x4_t v47 = vld1s_s16(&v5[istride * 6]); + int16x4_t v55 = vld1s_s16(&v5[istride * 4]); + int16x4_t v61 = vld1s_s16(&v5[istride * 5]); + float32x2_t v73 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v72)), 15); + float32x2_t v106 = vmul_f32(v138, v104); + float32x2_t v125 = vmul_f32(v138, v123); + float32x2_t v132 = vmul_f32(v138, v130); + float32x2_t v139 = vmul_f32(v138, v137); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v65 = vadd_f32(v21, v35); + float32x2_t v75 = vadd_f32(v22, v36); + float32x2_t v77 = vsub_f32(v21, v35); + float32x2_t v78 = vsub_f32(v35, v63); + float32x2_t v79 = vsub_f32(v63, v21); + float32x2_t v80 = vsub_f32(v22, v36); + float32x2_t v81 = vsub_f32(v36, v64); + float32x2_t v82 = vsub_f32(v64, v22); + float32x2_t v101 = vmul_f32(v49, v100); + float32x2_t v107 = vrev64_f32(v50); + float32x2_t v66 = vadd_f32(v65, v63); + float32x2_t v76 = vadd_f32(v75, v64); + float32x2_t v108 = vmul_f32(v107, v106); + float32x2_t v112 = vmul_f32(v77, v111); + 
float32x2_t v116 = vmul_f32(v78, v115); + float32x2_t v120 = vmul_f32(v79, v119); + float32x2_t v126 = vrev64_f32(v80); + float32x2_t v133 = vrev64_f32(v81); + float32x2_t v140 = vrev64_f32(v82); + float32x2_t v67 = vadd_f32(v66, v49); + float32x2_t v90 = vmul_f32(v66, v89); + float32x2_t v96 = vrev64_f32(v76); + float32x2_t v127 = vmul_f32(v126, v125); + float32x2_t v134 = vmul_f32(v133, v132); + float32x2_t v141 = vmul_f32(v140, v139); + float32x2_t v74 = vadd_f32(v67, v73); + float32x2_t v97 = vmul_f32(v96, v106); + float32x2_t v142 = vadd_f32(v90, v90); + float32x2_t v155 = vadd_f32(v108, v127); + float32x2_t v157 = vsub_f32(v108, v134); + float32x2_t v159 = vsub_f32(v108, v127); + float32x2_t v143 = vadd_f32(v142, v90); + float32x2_t v147 = vadd_f32(v74, v101); + float32x2_t v156 = vadd_f32(v155, v134); + float32x2_t v158 = vadd_f32(v157, v141); + float32x2_t v160 = vsub_f32(v159, v141); + int16x4_t v169 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v74, 15), (int32x2_t){0, 0})); + float32x2_t v144 = vadd_f32(v74, v143); + float32x2_t v148 = vadd_f32(v147, v142); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v169), 0); + float32x2_t v145 = vadd_f32(v144, v97); + float32x2_t v146 = vsub_f32(v144, v97); + float32x2_t v149 = vadd_f32(v148, v112); + float32x2_t v151 = vsub_f32(v148, v116); + float32x2_t v153 = vsub_f32(v148, v112); + float32x2_t v150 = vadd_f32(v149, v116); + float32x2_t v152 = vadd_f32(v151, v120); + float32x2_t v154 = vsub_f32(v153, v120); + int16x4_t v187 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v146, 15), (int32x2_t){0, 0})); + int16x4_t v205 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v145, 15), (int32x2_t){0, 0})); + float32x2_t v161 = vadd_f32(v150, v156); + float32x2_t v162 = vsub_f32(v150, v156); + float32x2_t v163 = vadd_f32(v152, v158); + float32x2_t v164 = vsub_f32(v152, v158); + float32x2_t v165 = vadd_f32(v154, v160); + float32x2_t v166 = vsub_f32(v154, v160); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v187), 0); + 
v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v205), 0); + int16x4_t v175 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v162, 15), (int32x2_t){0, 0})); + int16x4_t v181 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v163, 15), (int32x2_t){0, 0})); + int16x4_t v193 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v166, 15), (int32x2_t){0, 0})); + int16x4_t v199 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v165, 15), (int32x2_t){0, 0})); + int16x4_t v211 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v164, 15), (int32x2_t){0, 0})); + int16x4_t v217 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v161, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v175), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v181), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v193), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v199), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v211), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v217), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun9(const armral_cmplx_int16_t *restrict x, + armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, + float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v112 = -5.0000000000000000e-01F; + float v124 = -1.4999999999999998e+00F; + float v129 = -8.6602540378443871e-01F; + float v136 = 7.6604444311897801e-01F; + float v141 = 9.3969262078590832e-01F; + float v146 = -1.7364817766693039e-01F; + float v151 = -6.4278760968653925e-01F; + float v158 = 3.4202014332566888e-01F; + float v165 = -9.8480775301220802e-01F; + const int32_t *v274 = &v5[v0]; + int32_t *v377 = &v6[v2]; + int64_t v23 = v0 * 8; + int64_t v33 = v0 * 7; + int64_t v41 = v0 * 2; + int64_t v51 = v0 * 3; + int64_t v59 = v0 * 6; + int64_t v69 = v0 * 4; + int64_t v77 = v0 * 5; 
+ float v132 = v4 * v129; + float v154 = v4 * v151; + float v161 = v4 * v158; + float v168 = v4 * v165; + int64_t v213 = v2 * 2; + int64_t v221 = v2 * 3; + int64_t v229 = v2 * 4; + int64_t v237 = v2 * 5; + int64_t v245 = v2 * 6; + int64_t v253 = v2 * 7; + int64_t v261 = v2 * 8; + const int32_t *v347 = &v5[0]; + svfloat32_t v351 = svdup_n_f32(v112); + svfloat32_t v353 = svdup_n_f32(v124); + svfloat32_t v355 = svdup_n_f32(v136); + svfloat32_t v356 = svdup_n_f32(v141); + svfloat32_t v357 = svdup_n_f32(v146); + int32_t *v368 = &v6[0]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v274[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v283 = &v5[v23]; + const int32_t *v292 = &v5[v33]; + const int32_t *v301 = &v5[v41]; + const int32_t *v310 = &v5[v51]; + const int32_t *v319 = &v5[v59]; + const int32_t *v328 = &v5[v69]; + const int32_t *v337 = &v5[v77]; + svfloat32_t v354 = svdup_n_f32(v132); + svfloat32_t v358 = svdup_n_f32(v154); + svfloat32_t v359 = svdup_n_f32(v161); + svfloat32_t v360 = svdup_n_f32(v168); + int32_t *v386 = &v6[v213]; + int32_t *v395 = &v6[v221]; + int32_t *v404 = &v6[v229]; + int32_t *v413 = &v6[v237]; + int32_t *v422 = &v6[v245]; + int32_t *v431 = &v6[v253]; + int32_t *v440 = &v6[v261]; + svfloat32_t v96 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v347[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v283[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v292[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v301[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, 
+ svld1sh_s32(pred_full, (const int16_t *)&v310[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v319[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v328[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v337[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v31, v49); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v48, v84); + svfloat32_t v102 = svsub_f32_x(svptrue_b32(), v84, v30); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v31, v49); + svfloat32_t v104 = svsub_f32_x(svptrue_b32(), v49, v85); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v85, v31); + svfloat32_t zero134 = svdup_n_f32(0); + svfloat32_t v134 = svcmla_f32_x(pred_full, zero134, v354, v67, 90); + svfloat32_t v87 = svadd_f32_x(svptrue_b32(), v86, v84); + svfloat32_t v99 = svadd_f32_x(svptrue_b32(), v98, v85); + svfloat32_t zero156 = svdup_n_f32(0); + svfloat32_t v156 = svcmla_f32_x(pred_full, zero156, v358, v103, 90); + svfloat32_t zero163 = svdup_n_f32(0); + svfloat32_t v163 = svcmla_f32_x(pred_full, zero163, v359, v104, 90); + svfloat32_t zero170 = svdup_n_f32(0); + svfloat32_t v170 = 
svcmla_f32_x(pred_full, zero170, v360, v105, 90); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v87, v66); + svfloat32_t v115 = svmul_f32_x(svptrue_b32(), v87, v351); + svfloat32_t zero122 = svdup_n_f32(0); + svfloat32_t v122 = svcmla_f32_x(pred_full, zero122, v354, v99, 90); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v134, v156); + svfloat32_t v186 = svsub_f32_x(svptrue_b32(), v134, v163); + svfloat32_t v188 = svsub_f32_x(svptrue_b32(), v134, v156); + svfloat32_t v97 = svadd_f32_x(svptrue_b32(), v88, v96); + svfloat32_t v171 = svadd_f32_x(svptrue_b32(), v115, v115); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v184, v163); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v186, v170); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v188, v170); + svfloat32_t v172 = svmla_f32_x(pred_full, v171, v87, v351); + svfloat32_t v176 = svmla_f32_x(pred_full, v97, v66, v353); + svint16_t v198 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v97, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v97, v172); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v176, v171); + svst1w_u64(pred_full, (unsigned *)(v368), svreinterpret_u64_s16(v198)); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v173, v122); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v173, v122); + svfloat32_t v178 = svmla_f32_x(pred_full, v177, v100, v355); + svfloat32_t v180 = svmls_f32_x(pred_full, v177, v101, v356); + svfloat32_t v182 = svmls_f32_x(pred_full, v177, v100, v355); + svfloat32_t v179 = svmla_f32_x(pred_full, v178, v101, v356); + svfloat32_t v181 = svmla_f32_x(pred_full, v180, v102, v357); + svfloat32_t v183 = svmls_f32_x(pred_full, v182, v102, v357); + svint16_t v222 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v175, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v246 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v174, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v179, v185); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v179, v185); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v181, v187); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v181, v187); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v183, v189); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v183, v189); + svst1w_u64(pred_full, (unsigned *)(v395), svreinterpret_u64_s16(v222)); + svst1w_u64(pred_full, (unsigned *)(v422), svreinterpret_u64_s16(v246)); + svint16_t v206 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v191, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v214 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v192, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v230 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v195, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v238 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v194, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v254 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v193, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v262 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v190, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v377), svreinterpret_u64_s16(v206)); + svst1w_u64(pred_full, (unsigned *)(v386), svreinterpret_u64_s16(v214)); + svst1w_u64(pred_full, (unsigned *)(v404), svreinterpret_u64_s16(v230)); + svst1w_u64(pred_full, (unsigned *)(v413), svreinterpret_u64_s16(v238)); + svst1w_u64(pred_full, (unsigned *)(v431), svreinterpret_u64_s16(v254)); + svst1w_u64(pred_full, (unsigned *)(v440), svreinterpret_u64_s16(v262)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun10( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v142 = -1.2500000000000000e+00F; + float v146 = 5.5901699437494745e-01F; + float v149 = 1.5388417685876268e+00F; + float v150 = -1.5388417685876268e+00F; + float v156 = 5.8778525229247325e-01F; + float v157 = -5.8778525229247325e-01F; + float v163 = 3.6327126400268028e-01F; + float v164 = -3.6327126400268028e-01F; + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v61 = vld1s_s16(&v5[istride]); + float32x2_t v143 = (float32x2_t){v142, v142}; + float32x2_t v147 = (float32x2_t){v146, v146}; + float32x2_t v151 = (float32x2_t){v149, v150}; + float32x2_t v158 = (float32x2_t){v156, v157}; + float32x2_t v165 = (float32x2_t){v163, v164}; + float32x2_t v166 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 5]); + int16x4_t v27 = vld1s_s16(&v5[istride * 2]); + int16x4_t v33 = vld1s_s16(&v5[istride * 7]); + int16x4_t v41 = vld1s_s16(&v5[istride * 4]); + int16x4_t v47 = vld1s_s16(&v5[istride * 9]); + int16x4_t v55 = vld1s_s16(&v5[istride * 6]); + float32x2_t 
v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + int16x4_t v69 = vld1s_s16(&v5[istride * 8]); + int16x4_t v75 = vld1s_s16(&v5[istride * 3]); + float32x2_t v153 = vmul_f32(v166, v151); + float32x2_t v160 = vmul_f32(v166, v158); + float32x2_t v167 = vmul_f32(v166, v165); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v76 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v75)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v77 = vadd_f32(v70, v76); + float32x2_t v78 = vsub_f32(v70, v76); + float32x2_t v79 = vadd_f32(v35, v77); + float32x2_t v80 = vsub_f32(v35, v77); + float32x2_t v81 = vadd_f32(v63, v49); + float32x2_t v82 = vsub_f32(v63, v49); + float32x2_t v129 = vadd_f32(v36, v78); + float32x2_t v130 = vsub_f32(v36, v78); + float32x2_t v131 = vadd_f32(v64, v50); + float32x2_t v132 = vsub_f32(v64, v50); + float32x2_t v83 = vadd_f32(v79, v81); + float32x2_t v84 = vsub_f32(v79, v81); + float32x2_t v85 = vadd_f32(v80, v82); + float32x2_t v104 = vrev64_f32(v80); + float32x2_t v118 = vrev64_f32(v82); + float32x2_t v133 = vadd_f32(v129, v131); + float32x2_t v134 = vsub_f32(v129, v131); + float32x2_t v135 = vadd_f32(v130, v132); + float32x2_t v154 = vrev64_f32(v130); + float32x2_t v168 = vrev64_f32(v132); + float32x2_t v86 = vadd_f32(v83, 
v21); + float32x2_t v94 = vmul_f32(v83, v143); + float32x2_t v98 = vmul_f32(v84, v147); + float32x2_t v105 = vmul_f32(v104, v153); + float32x2_t v111 = vrev64_f32(v85); + float32x2_t v119 = vmul_f32(v118, v167); + float32x2_t v136 = vadd_f32(v133, v22); + float32x2_t v144 = vmul_f32(v133, v143); + float32x2_t v148 = vmul_f32(v134, v147); + float32x2_t v155 = vmul_f32(v154, v153); + float32x2_t v161 = vrev64_f32(v135); + float32x2_t v169 = vmul_f32(v168, v167); + float32x2_t v112 = vmul_f32(v111, v160); + float32x2_t v120 = vadd_f32(v86, v94); + float32x2_t v162 = vmul_f32(v161, v160); + float32x2_t v170 = vadd_f32(v136, v144); + int16x4_t v181 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v86, 15), (int32x2_t){0, 0})); + int16x4_t v187 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v136, 15), (int32x2_t){0, 0})); + float32x2_t v121 = vadd_f32(v120, v98); + float32x2_t v122 = vsub_f32(v120, v98); + float32x2_t v123 = vsub_f32(v105, v112); + float32x2_t v124 = vadd_f32(v112, v119); + float32x2_t v171 = vadd_f32(v170, v148); + float32x2_t v172 = vsub_f32(v170, v148); + float32x2_t v173 = vsub_f32(v155, v162); + float32x2_t v174 = vadd_f32(v162, v169); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v181), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v187), 0); + float32x2_t v125 = vadd_f32(v121, v123); + float32x2_t v126 = vsub_f32(v121, v123); + float32x2_t v127 = vadd_f32(v122, v124); + float32x2_t v128 = vsub_f32(v122, v124); + float32x2_t v175 = vadd_f32(v171, v173); + float32x2_t v176 = vsub_f32(v171, v173); + float32x2_t v177 = vadd_f32(v172, v174); + float32x2_t v178 = vsub_f32(v172, v174); + int16x4_t v193 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v126, 15), (int32x2_t){0, 0})); + int16x4_t v199 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v176, 15), (int32x2_t){0, 0})); + int16x4_t v205 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v128, 15), (int32x2_t){0, 0})); + int16x4_t v211 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v178, 15), (int32x2_t){0, 0})); + 
int16x4_t v217 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v127, 15), (int32x2_t){0, 0})); + int16x4_t v223 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v177, 15), (int32x2_t){0, 0})); + int16x4_t v229 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v125, 15), (int32x2_t){0, 0})); + int16x4_t v235 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v175, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v193), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v199), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v205), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v211), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v217), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v223), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v229), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v235), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun10( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v171 = -1.2500000000000000e+00F; + float v176 = 5.5901699437494745e-01F; + float v181 = -1.5388417685876268e+00F; + float v188 = -5.8778525229247325e-01F; + float v195 = -3.6327126400268028e-01F; + const int32_t *v360 = &v5[v0]; + int32_t *v427 = &v6[v2]; + int64_t v23 = v0 * 5; + int64_t v33 = v0 * 2; + int64_t v41 = v0 * 7; + int64_t v51 = v0 * 4; + int64_t v59 = v0 * 9; + int64_t v69 = v0 * 6; + int64_t v87 = v0 * 8; + int64_t v95 = v0 * 3; + float v184 = v4 * v181; + float v191 = v4 * v188; + float v198 = v4 * v195; + int64_t v219 = v2 * 5; + int64_t v227 = v2 * 6; + int64_t v243 = v2 * 2; + int64_t v251 = v2 * 7; + int64_t v259 = v2 * 8; + int64_t v267 = v2 * 3; + int64_t v275 = v2 * 4; + 
int64_t v283 = v2 * 9; + const int32_t *v297 = &v5[0]; + svfloat32_t v388 = svdup_n_f32(v171); + svfloat32_t v389 = svdup_n_f32(v176); + int32_t *v400 = &v6[0]; + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v360[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v306 = &v5[v23]; + const int32_t *v315 = &v5[v33]; + const int32_t *v324 = &v5[v41]; + const int32_t *v333 = &v5[v51]; + const int32_t *v342 = &v5[v59]; + const int32_t *v351 = &v5[v69]; + const int32_t *v369 = &v5[v87]; + const int32_t *v378 = &v5[v95]; + svfloat32_t v390 = svdup_n_f32(v184); + svfloat32_t v391 = svdup_n_f32(v191); + svfloat32_t v392 = svdup_n_f32(v198); + int32_t *v409 = &v6[v219]; + int32_t *v418 = &v6[v227]; + int32_t *v436 = &v6[v243]; + int32_t *v445 = &v6[v251]; + int32_t *v454 = &v6[v259]; + int32_t *v463 = &v6[v267]; + int32_t *v472 = &v6[v275]; + int32_t *v481 = &v6[v283]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v297[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v306[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v315[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v324[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v333[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v342[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const 
int16_t *)&v351[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v369[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v378[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v48, v102); + svfloat32_t v105 = svsub_f32_x(svptrue_b32(), v48, v102); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v84, v66); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v84, v66); + svfloat32_t v157 = svadd_f32_x(svptrue_b32(), v49, v103); + svfloat32_t v158 = svsub_f32_x(svptrue_b32(), v49, v103); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v85, v67); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v85, v67); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v109 = svsub_f32_x(svptrue_b32(), v104, v106); + svfloat32_t v110 = svadd_f32_x(svptrue_b32(), v105, v107); + svfloat32_t zero133 = svdup_n_f32(0); + svfloat32_t v133 = svcmla_f32_x(pred_full, zero133, v390, v105, 90); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v157, v159); + svfloat32_t v162 = svsub_f32_x(svptrue_b32(), v157, v159); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v158, v160); + svfloat32_t zero186 = svdup_n_f32(0); + svfloat32_t v186 = svcmla_f32_x(pred_full, zero186, v390, 
v158, 90); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v108, v30); + svfloat32_t zero140 = svdup_n_f32(0); + svfloat32_t v140 = svcmla_f32_x(pred_full, zero140, v391, v110, 90); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v161, v31); + svfloat32_t zero193 = svdup_n_f32(0); + svfloat32_t v193 = svcmla_f32_x(pred_full, zero193, v391, v163, 90); + svfloat32_t v148 = svmla_f32_x(pred_full, v111, v108, v388); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v133, v140); + svfloat32_t v152 = svcmla_f32_x(pred_full, v140, v392, v107, 90); + svfloat32_t v201 = svmla_f32_x(pred_full, v164, v161, v388); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v186, v193); + svfloat32_t v205 = svcmla_f32_x(pred_full, v193, v392, v160, 90); + svint16_t v212 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v111, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v220 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v164, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v149 = svmla_f32_x(pred_full, v148, v109, v389); + svfloat32_t v150 = svmls_f32_x(pred_full, v148, v109, v389); + svfloat32_t v202 = svmla_f32_x(pred_full, v201, v162, v389); + svfloat32_t v203 = svmls_f32_x(pred_full, v201, v162, v389); + svst1w_u64(pred_full, (unsigned *)(v400), svreinterpret_u64_s16(v212)); + svst1w_u64(pred_full, (unsigned *)(v409), svreinterpret_u64_s16(v220)); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v149, v151); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v149, v151); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v150, v152); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v202, v204); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v202, v204); + svfloat32_t v208 = 
svadd_f32_x(svptrue_b32(), v203, v205); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v203, v205); + svint16_t v228 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v154, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v236 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v207, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v244 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v156, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v252 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v209, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v260 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v155, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v268 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v208, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v276 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v153, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v284 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v206, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v418), 
svreinterpret_u64_s16(v228)); + svst1w_u64(pred_full, (unsigned *)(v427), svreinterpret_u64_s16(v236)); + svst1w_u64(pred_full, (unsigned *)(v436), svreinterpret_u64_s16(v244)); + svst1w_u64(pred_full, (unsigned *)(v445), svreinterpret_u64_s16(v252)); + svst1w_u64(pred_full, (unsigned *)(v454), svreinterpret_u64_s16(v260)); + svst1w_u64(pred_full, (unsigned *)(v463), svreinterpret_u64_s16(v268)); + svst1w_u64(pred_full, (unsigned *)(v472), svreinterpret_u64_s16(v276)); + svst1w_u64(pred_full, (unsigned *)(v481), svreinterpret_u64_s16(v284)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun11( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v117 = 1.1000000000000001e+00F; + float v120 = 3.3166247903554003e-01F; + float v121 = -3.3166247903554003e-01F; + float v128 = 5.1541501300188641e-01F; + float v132 = 9.4125353283118118e-01F; + float v136 = 1.4143537075597825e+00F; + float v140 = 8.5949297361449750e-01F; + float v144 = 4.2314838273285138e-02F; + float v148 = 3.8639279888589606e-01F; + float v152 = 5.1254589567200015e-01F; + float v156 = 1.0702757469471715e+00F; + float v160 = 5.5486073394528512e-01F; + float v163 = 1.2412944743900585e+00F; + float v164 = -1.2412944743900585e+00F; + float v170 = 2.0897833842005756e-01F; + float v171 = -2.0897833842005756e-01F; + float v177 = 3.7415717312460811e-01F; + float v178 = -3.7415717312460811e-01F; + float v184 = 4.9929922194110327e-02F; + float v185 = -4.9929922194110327e-02F; + float v191 = 6.5815896284539266e-01F; + float v192 = -6.5815896284539266e-01F; + float v198 = 6.3306543373877577e-01F; + float v199 = -6.3306543373877577e-01F; + float v205 = 1.0822460581641109e+00F; + float v206 = -1.0822460581641109e+00F; + float v212 = 8.1720737907134022e-01F; + float v213 = -8.1720737907134022e-01F; + float v219 = 
4.2408709531871824e-01F; + float v220 = -4.2408709531871824e-01F; + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v89 = vld1s_s16(&v5[0]); + float32x2_t v118 = (float32x2_t){v117, v117}; + float32x2_t v122 = (float32x2_t){v120, v121}; + float32x2_t v129 = (float32x2_t){v128, v128}; + float32x2_t v133 = (float32x2_t){v132, v132}; + float32x2_t v137 = (float32x2_t){v136, v136}; + float32x2_t v141 = (float32x2_t){v140, v140}; + float32x2_t v145 = (float32x2_t){v144, v144}; + float32x2_t v149 = (float32x2_t){v148, v148}; + float32x2_t v153 = (float32x2_t){v152, v152}; + float32x2_t v157 = (float32x2_t){v156, v156}; + float32x2_t v161 = (float32x2_t){v160, v160}; + float32x2_t v165 = (float32x2_t){v163, v164}; + float32x2_t v172 = (float32x2_t){v170, v171}; + float32x2_t v179 = (float32x2_t){v177, v178}; + float32x2_t v186 = (float32x2_t){v184, v185}; + float32x2_t v193 = (float32x2_t){v191, v192}; + float32x2_t v200 = (float32x2_t){v198, v199}; + float32x2_t v207 = (float32x2_t){v205, v206}; + float32x2_t v214 = (float32x2_t){v212, v213}; + float32x2_t v221 = (float32x2_t){v219, v220}; + float32x2_t v222 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 10]); + int16x4_t v26 = vld1s_s16(&v5[istride * 2]); + int16x4_t v32 = vld1s_s16(&v5[istride * 9]); + int16x4_t v39 = vld1s_s16(&v5[istride * 3]); + int16x4_t v45 = vld1s_s16(&v5[istride * 8]); + int16x4_t v52 = vld1s_s16(&v5[istride * 4]); + int16x4_t v58 = vld1s_s16(&v5[istride * 7]); + int16x4_t v65 = vld1s_s16(&v5[istride * 5]); + int16x4_t v71 = vld1s_s16(&v5[istride * 6]); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v124 = vmul_f32(v222, v122); + float32x2_t v167 = vmul_f32(v222, v165); + float32x2_t v174 = vmul_f32(v222, v172); + float32x2_t v181 = vmul_f32(v222, v179); + float32x2_t v188 = vmul_f32(v222, v186); + float32x2_t v195 = vmul_f32(v222, v193); + float32x2_t v202 = 
vmul_f32(v222, v200); + float32x2_t v209 = vmul_f32(v222, v207); + float32x2_t v216 = vmul_f32(v222, v214); + float32x2_t v223 = vmul_f32(v222, v221); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v27 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v33 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v32)), 15); + float32x2_t v40 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v39)), 15); + float32x2_t v46 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v45)), 15); + float32x2_t v53 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v52)), 15); + float32x2_t v59 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v58)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v72 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v71)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v34 = vadd_f32(v27, v33); + float32x2_t v47 = vadd_f32(v40, v46); + float32x2_t v60 = vadd_f32(v53, v59); + float32x2_t v73 = vadd_f32(v66, v72); + float32x2_t v74 = vsub_f32(v14, v20); + float32x2_t v75 = vsub_f32(v27, v33); + float32x2_t v76 = vsub_f32(v40, v46); + float32x2_t v77 = vsub_f32(v53, v59); + float32x2_t v78 = vsub_f32(v66, v72); + float32x2_t v79 = vadd_f32(v21, v34); + float32x2_t v80 = vadd_f32(v47, v73); + float32x2_t v82 = vsub_f32(v75, v76); + float32x2_t v83 = vadd_f32(v74, v78); + float32x2_t v94 = vsub_f32(v34, v60); + float32x2_t v95 = vsub_f32(v21, v60); + float32x2_t v96 = vsub_f32(v34, v21); + float32x2_t v97 = vsub_f32(v73, v60); + float32x2_t v98 = vsub_f32(v47, v60); + float32x2_t v99 = vsub_f32(v73, v47); + float32x2_t v100 = vsub_f32(v34, v73); + float32x2_t v101 = vsub_f32(v21, v47); + float32x2_t v103 = vadd_f32(v75, v77); + float32x2_t v104 = vsub_f32(v74, v77); + float32x2_t v105 = vadd_f32(v74, v75); + float32x2_t v106 = vsub_f32(v77, v78); + float32x2_t v107 = vsub_f32(v76, v77); + float32x2_t v108 = vsub_f32(v76, v78); + float32x2_t v109 = vadd_f32(v75, v78); + float32x2_t v110 = vsub_f32(v74, v76); + 
float32x2_t v81 = vadd_f32(v60, v79); + float32x2_t v92 = vsub_f32(v82, v83); + float32x2_t v102 = vsub_f32(v80, v79); + float32x2_t v111 = vadd_f32(v82, v83); + float32x2_t v130 = vmul_f32(v94, v129); + float32x2_t v134 = vmul_f32(v95, v133); + float32x2_t v138 = vmul_f32(v96, v137); + float32x2_t v142 = vmul_f32(v97, v141); + float32x2_t v146 = vmul_f32(v98, v145); + float32x2_t v150 = vmul_f32(v99, v149); + float32x2_t v154 = vmul_f32(v100, v153); + float32x2_t v158 = vmul_f32(v101, v157); + float32x2_t v168 = vrev64_f32(v103); + float32x2_t v175 = vrev64_f32(v104); + float32x2_t v182 = vrev64_f32(v105); + float32x2_t v189 = vrev64_f32(v106); + float32x2_t v196 = vrev64_f32(v107); + float32x2_t v203 = vrev64_f32(v108); + float32x2_t v210 = vrev64_f32(v109); + float32x2_t v217 = vrev64_f32(v110); + float32x2_t v84 = vadd_f32(v81, v80); + float32x2_t v93 = vsub_f32(v92, v77); + float32x2_t v162 = vmul_f32(v102, v161); + float32x2_t v169 = vmul_f32(v168, v167); + float32x2_t v176 = vmul_f32(v175, v174); + float32x2_t v183 = vmul_f32(v182, v181); + float32x2_t v190 = vmul_f32(v189, v188); + float32x2_t v197 = vmul_f32(v196, v195); + float32x2_t v204 = vmul_f32(v203, v202); + float32x2_t v211 = vmul_f32(v210, v209); + float32x2_t v218 = vmul_f32(v217, v216); + float32x2_t v224 = vrev64_f32(v111); + float32x2_t v227 = vadd_f32(v130, v134); + float32x2_t v228 = vadd_f32(v134, v138); + float32x2_t v229 = vsub_f32(v130, v138); + float32x2_t v230 = vadd_f32(v142, v146); + float32x2_t v231 = vadd_f32(v146, v150); + float32x2_t v232 = vsub_f32(v142, v150); + float32x2_t v91 = vadd_f32(v90, v84); + float32x2_t v119 = vmul_f32(v84, v118); + float32x2_t v125 = vrev64_f32(v93); + float32x2_t v225 = vmul_f32(v224, v223); + float32x2_t v233 = vadd_f32(v158, v162); + float32x2_t v234 = vadd_f32(v154, v162); + float32x2_t v235 = vadd_f32(v176, v183); + float32x2_t v236 = vsub_f32(v169, v183); + float32x2_t v237 = vadd_f32(v197, v204); + float32x2_t v238 = vsub_f32(v190, v204); + 
float32x2_t v126 = vmul_f32(v125, v124); + float32x2_t v226 = vsub_f32(v91, v119); + float32x2_t v239 = vadd_f32(v218, v225); + float32x2_t v240 = vsub_f32(v211, v225); + float32x2_t v241 = vadd_f32(v231, v233); + float32x2_t v259 = vadd_f32(v235, v236); + int16x4_t v275 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v91, 15), (int32x2_t){0, 0})); + float32x2_t v242 = vadd_f32(v241, v226); + float32x2_t v243 = vsub_f32(v226, v228); + float32x2_t v245 = vadd_f32(v226, v232); + float32x2_t v247 = vsub_f32(v226, v229); + float32x2_t v249 = vadd_f32(v226, v227); + float32x2_t v251 = vadd_f32(v126, v237); + float32x2_t v253 = vsub_f32(v239, v235); + float32x2_t v255 = vadd_f32(v126, v240); + float32x2_t v257 = vsub_f32(v240, v236); + float32x2_t v260 = vadd_f32(v259, v237); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v275), 0); + float32x2_t v244 = vsub_f32(v243, v233); + float32x2_t v246 = vadd_f32(v245, v234); + float32x2_t v248 = vsub_f32(v247, v234); + float32x2_t v250 = vsub_f32(v249, v230); + float32x2_t v252 = vadd_f32(v251, v239); + float32x2_t v254 = vsub_f32(v253, v126); + float32x2_t v256 = vadd_f32(v255, v238); + float32x2_t v258 = vsub_f32(v257, v126); + float32x2_t v261 = vadd_f32(v260, v238); + float32x2_t v262 = vsub_f32(v261, v126); + float32x2_t v264 = vadd_f32(v242, v252); + float32x2_t v265 = vadd_f32(v244, v254); + float32x2_t v266 = vsub_f32(v246, v256); + float32x2_t v267 = vadd_f32(v248, v258); + float32x2_t v268 = vsub_f32(v248, v258); + float32x2_t v269 = vadd_f32(v246, v256); + float32x2_t v270 = vsub_f32(v244, v254); + float32x2_t v271 = vsub_f32(v242, v252); + float32x2_t v263 = vadd_f32(v250, v262); + float32x2_t v272 = vsub_f32(v250, v262); + int16x4_t v287 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v264, 15), (int32x2_t){0, 0})); + int16x4_t v293 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v265, 15), (int32x2_t){0, 0})); + int16x4_t v299 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v266, 15), (int32x2_t){0, 0})); + int16x4_t v305 = + 
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v267, 15), (int32x2_t){0, 0})); + int16x4_t v311 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v268, 15), (int32x2_t){0, 0})); + int16x4_t v317 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v269, 15), (int32x2_t){0, 0})); + int16x4_t v323 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v270, 15), (int32x2_t){0, 0})); + int16x4_t v329 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v271, 15), (int32x2_t){0, 0})); + int16x4_t v281 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v263, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v287), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v293), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v299), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v305), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v311), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v317), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v323), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v329), 0); + int16x4_t v335 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v272, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v281), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v335), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun11( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v145 = 1.1000000000000001e+00F; + float v150 = -3.3166247903554003e-01F; + float v157 = 5.1541501300188641e-01F; + float v162 = 9.4125353283118118e-01F; + float v167 = 1.4143537075597825e+00F; + float v172 = 8.5949297361449750e-01F; + float v177 = 4.2314838273285138e-02F; + float v182 = 3.8639279888589606e-01F; + 
float v187 = 5.1254589567200015e-01F; + float v192 = 1.0702757469471715e+00F; + float v197 = 5.5486073394528512e-01F; + float v202 = -1.2412944743900585e+00F; + float v209 = -2.0897833842005756e-01F; + float v216 = -3.7415717312460811e-01F; + float v223 = -4.9929922194110327e-02F; + float v230 = -6.5815896284539266e-01F; + float v237 = -6.3306543373877577e-01F; + float v244 = -1.0822460581641109e+00F; + float v251 = -8.1720737907134022e-01F; + float v258 = -4.2408709531871824e-01F; + const int32_t *v405 = &v5[v0]; + int32_t *v617 = &v6[v2]; + int64_t v23 = v0 * 10; + int64_t v32 = v0 * 2; + int64_t v40 = v0 * 9; + int64_t v49 = v0 * 3; + int64_t v57 = v0 * 8; + int64_t v66 = v0 * 4; + int64_t v74 = v0 * 7; + int64_t v83 = v0 * 5; + int64_t v91 = v0 * 6; + float v153 = v4 * v150; + float v205 = v4 * v202; + float v212 = v4 * v209; + float v219 = v4 * v216; + float v226 = v4 * v223; + float v233 = v4 * v230; + float v240 = v4 * v237; + float v247 = v4 * v244; + float v254 = v4 * v251; + float v261 = v4 * v258; + int64_t v320 = v2 * 10; + int64_t v328 = v2 * 9; + int64_t v336 = v2 * 8; + int64_t v344 = v2 * 7; + int64_t v352 = v2 * 6; + int64_t v360 = v2 * 5; + int64_t v368 = v2 * 4; + int64_t v376 = v2 * 3; + int64_t v384 = v2 * 2; + const int32_t *v496 = &v5[0]; + svfloat32_t v500 = svdup_n_f32(v145); + svfloat32_t v502 = svdup_n_f32(v157); + svfloat32_t v503 = svdup_n_f32(v162); + svfloat32_t v504 = svdup_n_f32(v167); + svfloat32_t v505 = svdup_n_f32(v172); + svfloat32_t v506 = svdup_n_f32(v177); + svfloat32_t v507 = svdup_n_f32(v182); + svfloat32_t v508 = svdup_n_f32(v187); + svfloat32_t v509 = svdup_n_f32(v192); + svfloat32_t v510 = svdup_n_f32(v197); + int32_t *v527 = &v6[0]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v405[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v414 = &v5[v23]; + const int32_t *v423 = &v5[v32]; + const int32_t *v432 = &v5[v40]; + const int32_t *v441 = 
&v5[v49]; + const int32_t *v450 = &v5[v57]; + const int32_t *v459 = &v5[v66]; + const int32_t *v468 = &v5[v74]; + const int32_t *v477 = &v5[v83]; + const int32_t *v486 = &v5[v91]; + svfloat32_t v501 = svdup_n_f32(v153); + svfloat32_t v511 = svdup_n_f32(v205); + svfloat32_t v512 = svdup_n_f32(v212); + svfloat32_t v513 = svdup_n_f32(v219); + svfloat32_t v514 = svdup_n_f32(v226); + svfloat32_t v515 = svdup_n_f32(v233); + svfloat32_t v516 = svdup_n_f32(v240); + svfloat32_t v517 = svdup_n_f32(v247); + svfloat32_t v518 = svdup_n_f32(v254); + svfloat32_t v519 = svdup_n_f32(v261); + int32_t *v536 = &v6[v320]; + int32_t *v545 = &v6[v328]; + int32_t *v554 = &v6[v336]; + int32_t *v563 = &v6[v344]; + int32_t *v572 = &v6[v352]; + int32_t *v581 = &v6[v360]; + int32_t *v590 = &v6[v368]; + int32_t *v599 = &v6[v376]; + int32_t *v608 = &v6[v384]; + svfloat32_t v117 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v496[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v414[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v38 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v423[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v46 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v432[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v55 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v441[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v63 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v450[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v72 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v459[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v80 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v468[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v89 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v477[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v486[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v38, v46); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v55, v63); + svfloat32_t v81 = svadd_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v89, v97); + svfloat32_t v99 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v100 = svsub_f32_x(svptrue_b32(), v38, v46); + svfloat32_t v101 = svsub_f32_x(svptrue_b32(), v55, v63); + svfloat32_t v102 = svsub_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v89, v97); + svfloat32_t v104 = svadd_f32_x(svptrue_b32(), v30, v47); + svfloat32_t v105 = svadd_f32_x(svptrue_b32(), v64, v98); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v100, v101); + svfloat32_t v108 = svadd_f32_x(svptrue_b32(), v99, v103); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v47, v81); + svfloat32_t v122 = svsub_f32_x(svptrue_b32(), v30, v81); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v47, v30); + svfloat32_t v124 = svsub_f32_x(svptrue_b32(), v98, v81); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v64, v81); + svfloat32_t v126 = svsub_f32_x(svptrue_b32(), v98, v64); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v47, v98); + svfloat32_t v128 = svsub_f32_x(svptrue_b32(), v30, v64); + svfloat32_t v130 = svadd_f32_x(svptrue_b32(), v100, v102); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v99, v102); + svfloat32_t v132 = svadd_f32_x(svptrue_b32(), v99, v100); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v102, v103); + svfloat32_t v134 = 
svsub_f32_x(svptrue_b32(), v101, v102); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v101, v103); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v100, v103); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v99, v101); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v81, v104); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v107, v108); + svfloat32_t v129 = svsub_f32_x(svptrue_b32(), v105, v104); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v107, v108); + svfloat32_t v165 = svmul_f32_x(svptrue_b32(), v122, v503); + svfloat32_t v170 = svmul_f32_x(svptrue_b32(), v123, v504); + svfloat32_t v180 = svmul_f32_x(svptrue_b32(), v125, v506); + svfloat32_t v185 = svmul_f32_x(svptrue_b32(), v126, v507); + svfloat32_t zero207 = svdup_n_f32(0); + svfloat32_t v207 = svcmla_f32_x(pred_full, zero207, v511, v130, 90); + svfloat32_t zero221 = svdup_n_f32(0); + svfloat32_t v221 = svcmla_f32_x(pred_full, zero221, v513, v132, 90); + svfloat32_t zero228 = svdup_n_f32(0); + svfloat32_t v228 = svcmla_f32_x(pred_full, zero228, v514, v133, 90); + svfloat32_t zero242 = svdup_n_f32(0); + svfloat32_t v242 = svcmla_f32_x(pred_full, zero242, v516, v135, 90); + svfloat32_t zero249 = svdup_n_f32(0); + svfloat32_t v249 = svcmla_f32_x(pred_full, zero249, v517, v136, 90); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v106, v105); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v119, v102); + svfloat32_t v200 = svmul_f32_x(svptrue_b32(), v129, v510); + svfloat32_t zero263 = svdup_n_f32(0); + svfloat32_t v263 = svcmla_f32_x(pred_full, zero263, v519, v138, 90); + svfloat32_t v265 = svmla_f32_x(pred_full, v165, v121, v502); + svfloat32_t v266 = svmla_f32_x(pred_full, v170, v122, v503); + svfloat32_t v267 = svnmls_f32_x(pred_full, v170, v121, v502); + svfloat32_t v268 = svmla_f32_x(pred_full, v180, v124, v505); + svfloat32_t v269 = svmla_f32_x(pred_full, v185, v125, v506); + svfloat32_t v270 = svnmls_f32_x(pred_full, v185, v124, v505); + svfloat32_t v273 = svcmla_f32_x(pred_full, v221, v512, 
v131, 90); + svfloat32_t v274 = svsub_f32_x(svptrue_b32(), v207, v221); + svfloat32_t v275 = svcmla_f32_x(pred_full, v242, v515, v134, 90); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v228, v242); + svfloat32_t v118 = svadd_f32_x(svptrue_b32(), v117, v109); + svfloat32_t zero155 = svdup_n_f32(0); + svfloat32_t v155 = svcmla_f32_x(pred_full, zero155, v501, v120, 90); + svfloat32_t v271 = svmla_f32_x(pred_full, v200, v128, v509); + svfloat32_t v272 = svmla_f32_x(pred_full, v200, v127, v508); + svfloat32_t v277 = svcmla_f32_x(pred_full, v263, v518, v137, 90); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v249, v263); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v273, v274); + svfloat32_t v264 = svmls_f32_x(pred_full, v118, v109, v500); + svfloat32_t v279 = svadd_f32_x(svptrue_b32(), v269, v271); + svfloat32_t v289 = svadd_f32_x(svptrue_b32(), v155, v275); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v277, v273); + svfloat32_t v293 = svadd_f32_x(svptrue_b32(), v155, v278); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v278, v274); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v275); + svint16_t v313 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v118, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v279, v264); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v264, v266); + svfloat32_t v283 = svadd_f32_x(svptrue_b32(), v264, v270); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v264, v267); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v264, v265); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v289, v277); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v291, v155); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v293, v276); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v295, v155); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v298, v276); + svst1w_u64(pred_full, (unsigned 
*)(v527), svreinterpret_u64_s16(v313)); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v281, v271); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v283, v272); + svfloat32_t v286 = svsub_f32_x(svptrue_b32(), v285, v272); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v287, v268); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v299, v155); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v280, v290); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v280, v290); + svfloat32_t v301 = svadd_f32_x(svptrue_b32(), v288, v300); + svfloat32_t v303 = svadd_f32_x(svptrue_b32(), v282, v292); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v286, v296); + svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v286, v296); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v284, v294); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v282, v292); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v288, v300); + svint16_t v329 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v302, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v385 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v309, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v321 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v301, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v337 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v303, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v345 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v304, 
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v353 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v305, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v361 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v306, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v369 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v307, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v377 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v308, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v393 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v310, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v545), svreinterpret_u64_s16(v329)); + svst1w_u64(pred_full, (unsigned *)(v608), svreinterpret_u64_s16(v385)); + svst1w_u64(pred_full, (unsigned *)(v536), svreinterpret_u64_s16(v321)); + svst1w_u64(pred_full, (unsigned *)(v554), svreinterpret_u64_s16(v337)); + svst1w_u64(pred_full, (unsigned *)(v563), svreinterpret_u64_s16(v345)); + svst1w_u64(pred_full, (unsigned *)(v572), svreinterpret_u64_s16(v353)); + svst1w_u64(pred_full, (unsigned *)(v581), svreinterpret_u64_s16(v361)); + svst1w_u64(pred_full, (unsigned *)(v590), svreinterpret_u64_s16(v369)); + svst1w_u64(pred_full, (unsigned *)(v599), svreinterpret_u64_s16(v377)); + svst1w_u64(pred_full, (unsigned *)(v617), 
svreinterpret_u64_s16(v393)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun12( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v111 = 1.0000000000000000e+00F; + float v112 = -1.0000000000000000e+00F; + float v138 = -1.4999999999999998e+00F; + float v139 = 1.4999999999999998e+00F; + float v167 = 8.6602540378443871e-01F; + float v175 = -8.6602540378443871e-01F; + int16x4_t v27 = vld1s_s16(&v5[0]); + int16x4_t v76 = vld1s_s16(&v5[istride]); + float32x2_t v113 = (float32x2_t){v111, v112}; + float32x2_t v136 = (float32x2_t){v138, v138}; + float32x2_t v140 = (float32x2_t){v138, v139}; + float32x2_t v169 = (float32x2_t){v167, v175}; + float32x2_t v170 = (float32x2_t){v4, v4}; + float32x2_t v176 = (float32x2_t){v175, v175}; + int16x4_t v13 = vld1s_s16(&v5[istride * 4]); + int16x4_t v19 = vld1s_s16(&v5[istride * 8]); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + int16x4_t v34 = vld1s_s16(&v5[istride * 7]); + int16x4_t v40 = vld1s_s16(&v5[istride * 11]); + int16x4_t v48 = vld1s_s16(&v5[istride * 3]); + int16x4_t v55 = vld1s_s16(&v5[istride * 10]); + int16x4_t v61 = vld1s_s16(&v5[istride * 2]); + int16x4_t v69 = vld1s_s16(&v5[istride * 6]); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + int16x4_t v82 = vld1s_s16(&v5[istride * 5]); + int16x4_t v90 = vld1s_s16(&v5[istride * 9]); + float32x2_t v115 = vmul_f32(v170, v113); + float32x2_t v142 = vmul_f32(v170, v140); + float32x2_t v171 = vmul_f32(v170, v169); + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v29 = vadd_f32(v21, v28); + float32x2_t v50 = vadd_f32(v42, v49); + float32x2_t v71 = vadd_f32(v63, v70); + float32x2_t v92 = vadd_f32(v84, v91); + float32x2_t v120 = vadd_f32(v21, v63); + float32x2_t v121 = vsub_f32(v21, v63); + float32x2_t v122 = vadd_f32(v42, v84); + float32x2_t v123 = vsub_f32(v42, v84); + float32x2_t v147 = vadd_f32(v22, v64); + float32x2_t v148 = vsub_f32(v22, v64); + float32x2_t v149 = vadd_f32(v43, v85); + float32x2_t v150 = vsub_f32(v43, v85); + float32x2_t v93 = vadd_f32(v29, v71); + float32x2_t v94 = vsub_f32(v29, v71); + float32x2_t v95 = vadd_f32(v50, v92); + float32x2_t v96 = vsub_f32(v50, v92); + float32x2_t v124 = vadd_f32(v120, v122); + float32x2_t v125 = vsub_f32(v120, v122); + float32x2_t v137 = vmul_f32(v121, v136); + float32x2_t v143 = vrev64_f32(v123); + float32x2_t v151 = vadd_f32(v147, v149); + float32x2_t v152 = vsub_f32(v147, v149); + float32x2_t v172 = vrev64_f32(v148); + float32x2_t v177 = vmul_f32(v150, v176); + float32x2_t v97 = vadd_f32(v93, v95); + float32x2_t v98 = vsub_f32(v93, v95); + float32x2_t v116 = vrev64_f32(v96); + float32x2_t v129 = vmul_f32(v124, v136); + float32x2_t v133 = vmul_f32(v125, v136); + float32x2_t v144 = vmul_f32(v143, v142); + float32x2_t v158 = vrev64_f32(v151); + 
float32x2_t v165 = vrev64_f32(v152); + float32x2_t v173 = vmul_f32(v172, v171); + float32x2_t v117 = vmul_f32(v116, v115); + float32x2_t v145 = vadd_f32(v137, v144); + float32x2_t v146 = vsub_f32(v137, v144); + float32x2_t v159 = vmul_f32(v158, v171); + float32x2_t v166 = vmul_f32(v165, v171); + float32x2_t v178 = vadd_f32(v173, v177); + float32x2_t v179 = vsub_f32(v173, v177); + float32x2_t v180 = vadd_f32(v97, v129); + int16x4_t v185 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v97, 15), (int32x2_t){0, 0})); + float32x2_t v222 = vadd_f32(v98, v133); + int16x4_t v227 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v98, 15), (int32x2_t){0, 0})); + float32x2_t v118 = vadd_f32(v94, v117); + float32x2_t v119 = vsub_f32(v94, v117); + float32x2_t v181 = vadd_f32(v180, v159); + float32x2_t v182 = vsub_f32(v180, v159); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v185), 0); + float32x2_t v223 = vadd_f32(v222, v166); + float32x2_t v224 = vsub_f32(v222, v166); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v227), 0); + int16x4_t v191 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v182, 15), (int32x2_t){0, 0})); + int16x4_t v197 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v181, 15), (int32x2_t){0, 0})); + float32x2_t v201 = vadd_f32(v119, v146); + int16x4_t v206 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v119, 15), (int32x2_t){0, 0})); + int16x4_t v233 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v224, 15), (int32x2_t){0, 0})); + int16x4_t v239 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v223, 15), (int32x2_t){0, 0})); + float32x2_t v243 = vadd_f32(v118, v145); + int16x4_t v248 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v118, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v191), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v197), 0); + float32x2_t v202 = vadd_f32(v201, v179); + float32x2_t v203 = vsub_f32(v201, v179); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v206), 0); + v6[ostride * 10] = 
vget_lane_s32(vreinterpret_s32_s16(v233), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v239), 0); + float32x2_t v244 = vadd_f32(v243, v178); + float32x2_t v245 = vsub_f32(v243, v178); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v248), 0); + int16x4_t v212 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v203, 15), (int32x2_t){0, 0})); + int16x4_t v218 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v202, 15), (int32x2_t){0, 0})); + int16x4_t v254 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v245, 15), (int32x2_t){0, 0})); + int16x4_t v260 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v244, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v212), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v218), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v254), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v260), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun12( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v144 = -1.0000000000000000e+00F; + float v169 = -1.4999999999999998e+00F; + float v174 = 1.4999999999999998e+00F; + float v210 = -8.6602540378443871e-01F; + const int32_t *v412 = &v5[v0]; + int32_t *v488 = &v6[v2]; + int64_t v15 = v0 * 4; + int64_t v23 = v0 * 8; + int64_t v42 = v0 * 7; + int64_t v50 = v0 * 11; + int64_t v60 = v0 * 3; + int64_t v69 = v0 * 10; + int64_t v77 = v0 * 2; + int64_t v87 = v0 * 6; + int64_t v104 = v0 * 5; + int64_t v114 = v0 * 9; + float v147 = v4 * v144; + float v177 = v4 * v174; + float v206 = v4 * v210; + int64_t v228 = v2 * 4; + int64_t v236 = v2 * 8; + int64_t v247 = v2 * 9; + int64_t v263 = v2 * 5; + int64_t v274 = v2 * 6; + int64_t v282 = v2 * 10; + int64_t 
v290 = v2 * 2; + int64_t v301 = v2 * 3; + int64_t v309 = v2 * 7; + int64_t v317 = v2 * 11; + const int32_t *v349 = &v5[0]; + svfloat32_t v439 = svdup_n_f32(v169); + svfloat32_t v444 = svdup_n_f32(v210); + int32_t *v452 = &v6[0]; + svfloat32_t v102 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v412[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v330 = &v5[v15]; + const int32_t *v339 = &v5[v23]; + const int32_t *v358 = &v5[v42]; + const int32_t *v367 = &v5[v50]; + const int32_t *v376 = &v5[v60]; + const int32_t *v385 = &v5[v69]; + const int32_t *v394 = &v5[v77]; + const int32_t *v403 = &v5[v87]; + const int32_t *v421 = &v5[v104]; + const int32_t *v430 = &v5[v114]; + svfloat32_t v436 = svdup_n_f32(v147); + svfloat32_t v440 = svdup_n_f32(v177); + svfloat32_t v443 = svdup_n_f32(v206); + int32_t *v461 = &v6[v228]; + int32_t *v470 = &v6[v236]; + int32_t *v479 = &v6[v247]; + int32_t *v497 = &v6[v263]; + int32_t *v506 = &v6[v274]; + int32_t *v515 = &v6[v282]; + int32_t *v524 = &v6[v290]; + int32_t *v533 = &v6[v301]; + int32_t *v542 = &v6[v309]; + int32_t *v551 = &v6[v317]; + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v349[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v330[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v339[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v48 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v358[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v56 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v367[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v66 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v376[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v385[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v394[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v403[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v110 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v421[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v120 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v430[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v48, v56); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), v48, v56); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v40 = svadd_f32_x(svptrue_b32(), v30, v39); + svfloat32_t v67 = svadd_f32_x(svptrue_b32(), v57, v66); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v84, v93); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v111, v120); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v30, v84); + svfloat32_t v153 = svsub_f32_x(svptrue_b32(), v30, v84); + svfloat32_t v154 = svadd_f32_x(svptrue_b32(), v57, v111); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v57, v111); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v31, v85); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v31, v85); + svfloat32_t v184 = 
svadd_f32_x(svptrue_b32(), v58, v112); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v58, v112); + svfloat32_t v122 = svadd_f32_x(svptrue_b32(), v40, v94); + svfloat32_t v123 = svsub_f32_x(svptrue_b32(), v40, v94); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v67, v121); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v67, v121); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v152, v154); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v152, v154); + svfloat32_t zero179 = svdup_n_f32(0); + svfloat32_t v179 = svcmla_f32_x(pred_full, zero179, v440, v155, 90); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v182, v184); + svfloat32_t v187 = svsub_f32_x(svptrue_b32(), v182, v184); + svfloat32_t zero208 = svdup_n_f32(0); + svfloat32_t v208 = svcmla_f32_x(pred_full, zero208, v443, v183, 90); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v122, v124); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v122, v124); + svfloat32_t zero149 = svdup_n_f32(0); + svfloat32_t v149 = svcmla_f32_x(pred_full, zero149, v436, v125, 90); + svfloat32_t v180 = svmla_f32_x(pred_full, v179, v153, v439); + svfloat32_t v181 = svnmls_f32_x(pred_full, v179, v153, v439); + svfloat32_t zero194 = svdup_n_f32(0); + svfloat32_t v194 = svcmla_f32_x(pred_full, zero194, v443, v186, 90); + svfloat32_t zero201 = svdup_n_f32(0); + svfloat32_t v201 = svcmla_f32_x(pred_full, zero201, v443, v187, 90); + svfloat32_t v214 = svmla_f32_x(pred_full, v208, v185, v444); + svfloat32_t v215 = svmls_f32_x(pred_full, v208, v185, v444); + svfloat32_t v150 = svadd_f32_x(svptrue_b32(), v123, v149); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v123, v149); + svfloat32_t v216 = svmla_f32_x(pred_full, v126, v156, v439); + svint16_t v221 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v126, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v270 = svmla_f32_x(pred_full, v127, v157, v439); + 
svint16_t v275 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v127, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v216, v194); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v216, v194); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v151, v181); + svint16_t v248 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v151, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v271 = svadd_f32_x(svptrue_b32(), v270, v201); + svfloat32_t v272 = svsub_f32_x(svptrue_b32(), v270, v201); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v150, v180); + svint16_t v302 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v150, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v452), svreinterpret_u64_s16(v221)); + svst1w_u64(pred_full, (unsigned *)(v506), svreinterpret_u64_s16(v275)); + svint16_t v229 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v218, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v237 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v217, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v243, v215); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v243, v215); + svint16_t v283 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v272, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v291 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v271, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v297, v214); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v297, v214); + svst1w_u64(pred_full, (unsigned *)(v479), svreinterpret_u64_s16(v248)); + svst1w_u64(pred_full, (unsigned *)(v533), svreinterpret_u64_s16(v302)); + svint16_t v256 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v245, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v264 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v244, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v310 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v299, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v318 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v298, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v461), svreinterpret_u64_s16(v229)); + svst1w_u64(pred_full, (unsigned *)(v470), svreinterpret_u64_s16(v237)); + svst1w_u64(pred_full, (unsigned *)(v515), svreinterpret_u64_s16(v283)); + svst1w_u64(pred_full, (unsigned *)(v524), svreinterpret_u64_s16(v291)); + svst1w_u64(pred_full, (unsigned *)(v488), svreinterpret_u64_s16(v256)); + svst1w_u64(pred_full, (unsigned *)(v497), svreinterpret_u64_s16(v264)); + svst1w_u64(pred_full, (unsigned *)(v542), svreinterpret_u64_s16(v310)); + svst1w_u64(pred_full, (unsigned 
*)(v551), svreinterpret_u64_s16(v318)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun13( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v135 = 1.0833333333333333e+00F; + float v139 = -3.0046260628866578e-01F; + float v142 = 7.4927933062613905e-01F; + float v143 = -7.4927933062613905e-01F; + float v149 = 4.0100212832186721e-01F; + float v150 = -4.0100212832186721e-01F; + float v156 = 5.7514072947400308e-01F; + float v157 = -5.7514072947400308e-01F; + float v164 = 5.2422663952658211e-01F; + float v168 = 5.1652078062348972e-01F; + float v172 = 7.7058589030924258e-03F; + float v176 = 4.2763404682656941e-01F; + float v180 = 1.5180597207438440e-01F; + float v184 = 5.7944001890096386e-01F; + float v187 = 1.1543953381323635e+00F; + float v188 = -1.1543953381323635e+00F; + float v194 = 9.0655220171271012e-01F; + float v195 = -9.0655220171271012e-01F; + float v201 = 8.1857027294591811e-01F; + float v202 = -8.1857027294591811e-01F; + float v208 = 1.1971367726043427e+00F; + float v209 = -1.1971367726043427e+00F; + float v215 = 8.6131170741789742e-01F; + float v216 = -8.6131170741789742e-01F; + float v222 = 1.1091548438375507e+00F; + float v223 = -1.1091548438375507e+00F; + float v229 = 4.2741434471979367e-02F; + float v230 = -4.2741434471979367e-02F; + float v236 = -4.5240494294812715e-02F; + float v237 = 4.5240494294812715e-02F; + float v243 = 2.9058457089163264e-01F; + float v244 = -2.9058457089163264e-01F; + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v120 = vld1s_s16(&v5[0]); + float32x2_t v136 = (float32x2_t){v135, v135}; + float32x2_t v140 = (float32x2_t){v139, v139}; + float32x2_t v144 = (float32x2_t){v142, v143}; + float32x2_t v151 = (float32x2_t){v149, v150}; + float32x2_t v158 = (float32x2_t){v156, v157}; + float32x2_t v165 = (float32x2_t){v164, 
v164}; + float32x2_t v169 = (float32x2_t){v168, v168}; + float32x2_t v173 = (float32x2_t){v172, v172}; + float32x2_t v177 = (float32x2_t){v176, v176}; + float32x2_t v181 = (float32x2_t){v180, v180}; + float32x2_t v185 = (float32x2_t){v184, v184}; + float32x2_t v189 = (float32x2_t){v187, v188}; + float32x2_t v196 = (float32x2_t){v194, v195}; + float32x2_t v203 = (float32x2_t){v201, v202}; + float32x2_t v210 = (float32x2_t){v208, v209}; + float32x2_t v217 = (float32x2_t){v215, v216}; + float32x2_t v224 = (float32x2_t){v222, v223}; + float32x2_t v231 = (float32x2_t){v229, v230}; + float32x2_t v238 = (float32x2_t){v236, v237}; + float32x2_t v245 = (float32x2_t){v243, v244}; + float32x2_t v246 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 12]); + int16x4_t v26 = vld1s_s16(&v5[istride * 2]); + int16x4_t v32 = vld1s_s16(&v5[istride * 11]); + int16x4_t v39 = vld1s_s16(&v5[istride * 3]); + int16x4_t v45 = vld1s_s16(&v5[istride * 10]); + int16x4_t v52 = vld1s_s16(&v5[istride * 4]); + int16x4_t v58 = vld1s_s16(&v5[istride * 9]); + int16x4_t v65 = vld1s_s16(&v5[istride * 5]); + int16x4_t v71 = vld1s_s16(&v5[istride * 8]); + int16x4_t v78 = vld1s_s16(&v5[istride * 6]); + int16x4_t v84 = vld1s_s16(&v5[istride * 7]); + float32x2_t v121 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v120)), 15); + float32x2_t v146 = vmul_f32(v246, v144); + float32x2_t v153 = vmul_f32(v246, v151); + float32x2_t v160 = vmul_f32(v246, v158); + float32x2_t v191 = vmul_f32(v246, v189); + float32x2_t v198 = vmul_f32(v246, v196); + float32x2_t v205 = vmul_f32(v246, v203); + float32x2_t v212 = vmul_f32(v246, v210); + float32x2_t v219 = vmul_f32(v246, v217); + float32x2_t v226 = vmul_f32(v246, v224); + float32x2_t v233 = vmul_f32(v246, v231); + float32x2_t v240 = vmul_f32(v246, v238); + float32x2_t v247 = vmul_f32(v246, v245); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v27 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v26)), 15); + float32x2_t v33 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v32)), 15); + float32x2_t v40 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v39)), 15); + float32x2_t v46 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v45)), 15); + float32x2_t v53 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v52)), 15); + float32x2_t v59 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v58)), 15); + float32x2_t v66 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v65)), 15); + float32x2_t v72 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v71)), 15); + float32x2_t v79 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v78)), 15); + float32x2_t v85 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v84)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v34 = vadd_f32(v27, v33); + float32x2_t v47 = vadd_f32(v40, v46); + float32x2_t v60 = vadd_f32(v53, v59); + float32x2_t v73 = vadd_f32(v66, v72); + float32x2_t v86 = vadd_f32(v79, v85); + float32x2_t v87 = vsub_f32(v14, v20); + float32x2_t v88 = vsub_f32(v27, v33); + float32x2_t v89 = vsub_f32(v40, v46); + float32x2_t v90 = vsub_f32(v53, v59); + float32x2_t v91 = vsub_f32(v66, v72); + float32x2_t v92 = vsub_f32(v79, v85); + float32x2_t v93 = vadd_f32(v34, v73); + float32x2_t v95 = vadd_f32(v21, v47); + float32x2_t v98 = vadd_f32(v88, v91); + float32x2_t v100 = vadd_f32(v87, v89); + float32x2_t v102 = vsub_f32(v34, v86); + float32x2_t v103 = vsub_f32(v47, v60); + float32x2_t v104 = vsub_f32(v21, v60); + float32x2_t v105 = vsub_f32(v73, v86); + float32x2_t v110 = vsub_f32(v88, v92); + float32x2_t v111 = vsub_f32(v87, v89); + float32x2_t v112 = vsub_f32(v88, v91); + float32x2_t v113 = vadd_f32(v87, v90); + float32x2_t v114 = vsub_f32(v91, v92); + float32x2_t v115 = vadd_f32(v89, v90); + float32x2_t v94 = vadd_f32(v93, v86); + float32x2_t v96 = vadd_f32(v95, v60); + float32x2_t v99 = vadd_f32(v98, v92); + float32x2_t v101 = vsub_f32(v100, v90); + float32x2_t v106 = vsub_f32(v102, v103); + float32x2_t v107 = vsub_f32(v104, v105); + float32x2_t v108 = 
vadd_f32(v102, v103); + float32x2_t v109 = vadd_f32(v104, v105); + float32x2_t v127 = vadd_f32(v110, v111); + float32x2_t v128 = vadd_f32(v112, v113); + float32x2_t v129 = vsub_f32(v114, v115); + float32x2_t v192 = vrev64_f32(v110); + float32x2_t v199 = vrev64_f32(v111); + float32x2_t v213 = vrev64_f32(v112); + float32x2_t v220 = vrev64_f32(v113); + float32x2_t v234 = vrev64_f32(v114); + float32x2_t v241 = vrev64_f32(v115); + float32x2_t v97 = vadd_f32(v94, v96); + float32x2_t v123 = vsub_f32(v96, v94); + float32x2_t v124 = vadd_f32(v99, v101); + float32x2_t v125 = vadd_f32(v106, v107); + float32x2_t v126 = vsub_f32(v108, v109); + float32x2_t v147 = vrev64_f32(v99); + float32x2_t v154 = vrev64_f32(v101); + float32x2_t v166 = vmul_f32(v106, v165); + float32x2_t v170 = vmul_f32(v107, v169); + float32x2_t v178 = vmul_f32(v108, v177); + float32x2_t v182 = vmul_f32(v109, v181); + float32x2_t v193 = vmul_f32(v192, v191); + float32x2_t v200 = vmul_f32(v199, v198); + float32x2_t v206 = vrev64_f32(v127); + float32x2_t v214 = vmul_f32(v213, v212); + float32x2_t v221 = vmul_f32(v220, v219); + float32x2_t v227 = vrev64_f32(v128); + float32x2_t v235 = vmul_f32(v234, v233); + float32x2_t v242 = vmul_f32(v241, v240); + float32x2_t v248 = vrev64_f32(v129); + float32x2_t v122 = vadd_f32(v121, v97); + float32x2_t v137 = vmul_f32(v97, v136); + float32x2_t v141 = vmul_f32(v123, v140); + float32x2_t v148 = vmul_f32(v147, v146); + float32x2_t v155 = vmul_f32(v154, v153); + float32x2_t v161 = vrev64_f32(v124); + float32x2_t v174 = vmul_f32(v125, v173); + float32x2_t v186 = vmul_f32(v126, v185); + float32x2_t v207 = vmul_f32(v206, v205); + float32x2_t v228 = vmul_f32(v227, v226); + float32x2_t v249 = vmul_f32(v248, v247); + float32x2_t v251 = vadd_f32(v170, v166); + float32x2_t v162 = vmul_f32(v161, v160); + float32x2_t v250 = vsub_f32(v122, v137); + float32x2_t v252 = vsub_f32(v251, v141); + float32x2_t v253 = vadd_f32(v170, v174); + float32x2_t v255 = vsub_f32(v174, v166); + float32x2_t 
v263 = vsub_f32(v193, v207); + float32x2_t v264 = vsub_f32(v200, v207); + float32x2_t v265 = vsub_f32(v214, v228); + float32x2_t v266 = vsub_f32(v221, v228); + float32x2_t v267 = vsub_f32(v235, v249); + float32x2_t v268 = vadd_f32(v242, v249); + int16x4_t v303 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v122, 15), (int32x2_t){0, 0})); + float32x2_t v254 = vadd_f32(v253, v141); + float32x2_t v256 = vsub_f32(v255, v141); + float32x2_t v257 = vadd_f32(v250, v178); + float32x2_t v259 = vsub_f32(v250, v182); + float32x2_t v261 = vsub_f32(v250, v178); + float32x2_t v269 = vsub_f32(v148, v162); + float32x2_t v270 = vsub_f32(v155, v162); + float32x2_t v281 = vadd_f32(v263, v267); + float32x2_t v283 = vadd_f32(v265, v267); + float32x2_t v285 = vsub_f32(v264, v268); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v303), 0); + float32x2_t v258 = vadd_f32(v257, v182); + float32x2_t v260 = vsub_f32(v259, v186); + float32x2_t v262 = vadd_f32(v261, v186); + float32x2_t v277 = vsub_f32(v270, v263); + float32x2_t v279 = vsub_f32(v268, v269); + float32x2_t v282 = vadd_f32(v281, v270); + float32x2_t v284 = vsub_f32(v283, v270); + float32x2_t v286 = vsub_f32(v285, v269); + float32x2_t v287 = vadd_f32(v269, v264); + float32x2_t v271 = vadd_f32(v252, v258); + float32x2_t v272 = vadd_f32(v254, v260); + float32x2_t v273 = vsub_f32(v260, v254); + float32x2_t v274 = vadd_f32(v256, v262); + float32x2_t v275 = vsub_f32(v258, v252); + float32x2_t v276 = vsub_f32(v262, v256); + float32x2_t v278 = vadd_f32(v277, v265); + float32x2_t v280 = vsub_f32(v279, v266); + float32x2_t v288 = vsub_f32(v287, v266); + float32x2_t v289 = vsub_f32(v271, v278); + float32x2_t v290 = vadd_f32(v272, v280); + float32x2_t v291 = vsub_f32(v273, v282); + float32x2_t v292 = vsub_f32(v274, v284); + float32x2_t v293 = vadd_f32(v275, v286); + float32x2_t v294 = vsub_f32(v276, v288); + float32x2_t v295 = vadd_f32(v276, v288); + float32x2_t v296 = vsub_f32(v275, v286); + float32x2_t v297 = vadd_f32(v274, v284); + float32x2_t 
v298 = vadd_f32(v273, v282); + float32x2_t v299 = vsub_f32(v272, v280); + float32x2_t v300 = vadd_f32(v271, v278); + int16x4_t v309 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v289, 15), (int32x2_t){0, 0})); + int16x4_t v315 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v290, 15), (int32x2_t){0, 0})); + int16x4_t v321 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v291, 15), (int32x2_t){0, 0})); + int16x4_t v327 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v292, 15), (int32x2_t){0, 0})); + int16x4_t v333 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v293, 15), (int32x2_t){0, 0})); + int16x4_t v339 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v294, 15), (int32x2_t){0, 0})); + int16x4_t v345 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v295, 15), (int32x2_t){0, 0})); + int16x4_t v351 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v296, 15), (int32x2_t){0, 0})); + int16x4_t v357 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v297, 15), (int32x2_t){0, 0})); + int16x4_t v363 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v298, 15), (int32x2_t){0, 0})); + int16x4_t v369 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v299, 15), (int32x2_t){0, 0})); + int16x4_t v375 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v300, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v309), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v315), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v321), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v327), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v333), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v339), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v345), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v351), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v357), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v363), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v369), 0); + v6[ostride] = 
vget_lane_s32(vreinterpret_s32_s16(v375), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +/* SVE variant (compiled when ARMRAL_ARCH_SVE is defined) with the same signature as the non-SVE function of the same name. Reads 13 complex int16 inputs from x at element offsets 0, istride, ..., 12 * istride using sign-extending halfword loads (svld1sh_s32) scaled by 1.F / (1 << 15), applies a fixed straight-line sequence of adds, subtracts, multiply-accumulates and svcmla operations with rotation 90, and stores 13 results to y at element offsets 0, ostride, ..., 12 * ostride. Each result is scaled by (float)(1 << 31), converted to 32-bit integers, passed through svtbl_s16 and stored with svst1w_u64. Presumably one 13-point FFT butterfly, judging by the function name; confirm against the kernel generator. howmany is not referenced in this body. */ void armral_fft_cs16_cf32_cs16_ac_n_uun13( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + /* Strides are widened to 64 bits before being multiplied into element offsets. */ int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + /* x and y are viewed as arrays of 32-bit words so that one index step moves over one complex element (assumes armral_cmplx_int16_t is a packed pair of int16 values; TODO confirm against its declaration). */ const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + /* Governing predicate built from the SV_VL2 pattern; it is passed to every load, conversion and store in this function, while the plain adds and subtracts use svptrue_b32(). */ svbool_t pred_full = svptrue_pat_b32(SV_VL2); + /* Kernel constants. The ones multiplied by dir further down are the ones later broadcast and passed to svcmla with rotation 90; the others are broadcast unchanged. */ float v167 = 1.0833333333333333e+00F; + float v172 = -3.0046260628866578e-01F; + float v177 = -7.4927933062613905e-01F; + float v184 = -4.0100212832186721e-01F; + float v191 = -5.7514072947400308e-01F; + float v198 = 5.2422663952658211e-01F; + float v203 = 5.1652078062348972e-01F; + float v208 = 7.7058589030924258e-03F; + float v213 = 4.2763404682656941e-01F; + float v218 = 1.5180597207438440e-01F; + float v223 = 5.7944001890096386e-01F; + float v228 = -1.1543953381323635e+00F; + float v235 = -9.0655220171271012e-01F; + float v242 = -8.1857027294591811e-01F; + float v249 = -1.1971367726043427e+00F; + float v256 = -8.6131170741789742e-01F; + float v263 = -1.1091548438375507e+00F; + float v270 = -4.2741434471979367e-02F; + float v277 = 4.5240494294812715e-02F; + float v284 = -2.9058457089163264e-01F; + const int32_t *v451 = &v5[v0]; + int32_t *v699 = &v6[v2]; + /* Offsets, in 32-bit words, of input elements 2 to 12; elements 0 and 1 are addressed directly. */ int64_t v23 = v0 * 12; + int64_t v32 = v0 * 2; + int64_t v40 = v0 * 11; + int64_t v49 = v0 * 3; + int64_t v57 = v0 * 10; + int64_t v66 = v0 * 4; + int64_t v74 = v0 * 9; + int64_t v83 = v0 * 5; + int64_t v91 = v0 * 8; + int64_t v100 = v0 * 6; + int64_t v108 = v0 * 7; + /* Apply dir to the constants used as svcmla multipliers. */ float v180 = v4 * v177; + float v187 = v4 * v184; + float v194 = v4 * v191; + float v231 = v4 * v228; + float v238 = v4 * v235; + float v245 = v4 * v242; + float v252 = v4 * v249; + float v259 = v4 * v256; + float v266 = v4 * v263; + float v273 = v4 * v270; + float v280 = v4 * v277; + float v287 = v4 * v284; + /* Offsets, in 32-bit words, of output elements 2 to 12. */ int64_t v350 = v2 * 12; + int64_t v358 = v2 * 11; + int64_t v366 = v2 * 10; + int64_t v374 = v2 * 9;
+ int64_t v382 = v2 * 8; + int64_t v390 = v2 * 7; + int64_t v398 = v2 * 6; + int64_t v406 = v2 * 5; + int64_t v414 = v2 * 4; + int64_t v422 = v2 * 3; + int64_t v430 = v2 * 2; + const int32_t *v560 = &v5[0]; + svfloat32_t v564 = svdup_n_f32(v167); + svfloat32_t v565 = svdup_n_f32(v172); + svfloat32_t v569 = svdup_n_f32(v198); + svfloat32_t v570 = svdup_n_f32(v203); + svfloat32_t v571 = svdup_n_f32(v208); + svfloat32_t v572 = svdup_n_f32(v213); + svfloat32_t v573 = svdup_n_f32(v218); + svfloat32_t v574 = svdup_n_f32(v223); + int32_t *v591 = &v6[0]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v451[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v460 = &v5[v23]; + const int32_t *v469 = &v5[v32]; + const int32_t *v478 = &v5[v40]; + const int32_t *v487 = &v5[v49]; + const int32_t *v496 = &v5[v57]; + const int32_t *v505 = &v5[v66]; + const int32_t *v514 = &v5[v74]; + const int32_t *v523 = &v5[v83]; + const int32_t *v532 = &v5[v91]; + const int32_t *v541 = &v5[v100]; + const int32_t *v550 = &v5[v108]; + svfloat32_t v566 = svdup_n_f32(v180); + svfloat32_t v567 = svdup_n_f32(v187); + svfloat32_t v568 = svdup_n_f32(v194); + svfloat32_t v575 = svdup_n_f32(v231); + svfloat32_t v576 = svdup_n_f32(v238); + svfloat32_t v577 = svdup_n_f32(v245); + svfloat32_t v578 = svdup_n_f32(v252); + svfloat32_t v579 = svdup_n_f32(v259); + svfloat32_t v580 = svdup_n_f32(v266); + svfloat32_t v581 = svdup_n_f32(v273); + svfloat32_t v582 = svdup_n_f32(v280); + svfloat32_t v583 = svdup_n_f32(v287); + int32_t *v600 = &v6[v350]; + int32_t *v609 = &v6[v358]; + int32_t *v618 = &v6[v366]; + int32_t *v627 = &v6[v374]; + int32_t *v636 = &v6[v382]; + int32_t *v645 = &v6[v390]; + int32_t *v654 = &v6[v398]; + int32_t *v663 = &v6[v406]; + int32_t *v672 = &v6[v414]; + int32_t *v681 = &v6[v422]; + int32_t *v690 = &v6[v430]; + svfloat32_t v152 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const 
int16_t *)&v560[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v460[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v38 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v469[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v46 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v478[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v55 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v487[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v63 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v496[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v72 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v505[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v80 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v514[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v89 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v523[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v532[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v106 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v541[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v114 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v550[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v47 = svadd_f32_x(svptrue_b32(), v38, v46); + svfloat32_t v64 = svadd_f32_x(svptrue_b32(), v55, v63); + svfloat32_t v81 = svadd_f32_x(svptrue_b32(), 
v72, v80); + svfloat32_t v98 = svadd_f32_x(svptrue_b32(), v89, v97); + svfloat32_t v115 = svadd_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v116 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v117 = svsub_f32_x(svptrue_b32(), v38, v46); + svfloat32_t v118 = svsub_f32_x(svptrue_b32(), v55, v63); + svfloat32_t v119 = svsub_f32_x(svptrue_b32(), v72, v80); + svfloat32_t v120 = svsub_f32_x(svptrue_b32(), v89, v97); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v106, v114); + svfloat32_t v122 = svadd_f32_x(svptrue_b32(), v47, v98); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v30, v64); + svfloat32_t v127 = svadd_f32_x(svptrue_b32(), v117, v120); + svfloat32_t v129 = svadd_f32_x(svptrue_b32(), v116, v118); + svfloat32_t v131 = svsub_f32_x(svptrue_b32(), v47, v115); + svfloat32_t v132 = svsub_f32_x(svptrue_b32(), v64, v81); + svfloat32_t v133 = svsub_f32_x(svptrue_b32(), v30, v81); + svfloat32_t v134 = svsub_f32_x(svptrue_b32(), v98, v115); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v117, v121); + svfloat32_t v140 = svsub_f32_x(svptrue_b32(), v116, v118); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v117, v120); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v116, v119); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v120, v121); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v118, v119); + svfloat32_t v123 = svadd_f32_x(svptrue_b32(), v122, v115); + svfloat32_t v125 = svadd_f32_x(svptrue_b32(), v124, v81); + svfloat32_t v128 = svadd_f32_x(svptrue_b32(), v127, v121); + svfloat32_t v130 = svsub_f32_x(svptrue_b32(), v129, v119); + svfloat32_t v135 = svsub_f32_x(svptrue_b32(), v131, v132); + svfloat32_t v136 = svsub_f32_x(svptrue_b32(), v133, v134); + svfloat32_t v137 = svadd_f32_x(svptrue_b32(), v131, v132); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v133, v134); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v139, v140); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v141, v142); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v143, v144); 
+ svfloat32_t zero233 = svdup_n_f32(0); + svfloat32_t v233 = svcmla_f32_x(pred_full, zero233, v575, v139, 90); + svfloat32_t zero240 = svdup_n_f32(0); + svfloat32_t v240 = svcmla_f32_x(pred_full, zero240, v576, v140, 90); + svfloat32_t zero254 = svdup_n_f32(0); + svfloat32_t v254 = svcmla_f32_x(pred_full, zero254, v578, v141, 90); + svfloat32_t zero261 = svdup_n_f32(0); + svfloat32_t v261 = svcmla_f32_x(pred_full, zero261, v579, v142, 90); + svfloat32_t zero275 = svdup_n_f32(0); + svfloat32_t v275 = svcmla_f32_x(pred_full, zero275, v581, v143, 90); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v123, v125); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v125, v123); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v128, v130); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v135, v136); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v137, v138); + svfloat32_t zero182 = svdup_n_f32(0); + svfloat32_t v182 = svcmla_f32_x(pred_full, zero182, v566, v128, 90); + svfloat32_t zero189 = svdup_n_f32(0); + svfloat32_t v189 = svcmla_f32_x(pred_full, zero189, v567, v130, 90); + svfloat32_t v201 = svmul_f32_x(svptrue_b32(), v135, v569); + svfloat32_t zero247 = svdup_n_f32(0); + svfloat32_t v247 = svcmla_f32_x(pred_full, zero247, v577, v158, 90); + svfloat32_t zero268 = svdup_n_f32(0); + svfloat32_t v268 = svcmla_f32_x(pred_full, zero268, v580, v159, 90); + svfloat32_t zero289 = svdup_n_f32(0); + svfloat32_t v289 = svcmla_f32_x(pred_full, zero289, v583, v160, 90); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v152, v126); + svfloat32_t zero196 = svdup_n_f32(0); + svfloat32_t v196 = svcmla_f32_x(pred_full, zero196, v568, v155, 90); + svfloat32_t v211 = svmul_f32_x(svptrue_b32(), v156, v571); + svfloat32_t v291 = svmla_f32_x(pred_full, v201, v136, v570); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v233, v247); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v240, v247); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v254, v268); + svfloat32_t v306 = 
svsub_f32_x(svptrue_b32(), v261, v268); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v275, v289); + svfloat32_t v308 = svcmla_f32_x(pred_full, v289, v582, v144, 90); + svfloat32_t v290 = svmls_f32_x(pred_full, v153, v126, v564); + svfloat32_t v292 = svmls_f32_x(pred_full, v291, v154, v565); + svfloat32_t v293 = svmla_f32_x(pred_full, v211, v136, v570); + svfloat32_t v295 = svnmls_f32_x(pred_full, v201, v156, v571); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v182, v196); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v189, v196); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v303, v307); + svfloat32_t v323 = svadd_f32_x(svptrue_b32(), v305, v307); + svfloat32_t v325 = svsub_f32_x(svptrue_b32(), v304, v308); + svint16_t v343 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v153, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v294 = svmla_f32_x(pred_full, v293, v154, v565); + svfloat32_t v296 = svmls_f32_x(pred_full, v295, v154, v565); + svfloat32_t v297 = svmla_f32_x(pred_full, v290, v137, v572); + svfloat32_t v299 = svmls_f32_x(pred_full, v290, v138, v573); + svfloat32_t v301 = svmls_f32_x(pred_full, v290, v137, v572); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v310, v303); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v308, v309); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v321, v310); + svfloat32_t v324 = svsub_f32_x(svptrue_b32(), v323, v310); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v325, v309); + svfloat32_t v327 = svadd_f32_x(svptrue_b32(), v309, v304); + svst1w_u64(pred_full, (unsigned *)(v591), svreinterpret_u64_s16(v343)); + svfloat32_t v298 = svmla_f32_x(pred_full, v297, v138, v573); + svfloat32_t v300 = svmls_f32_x(pred_full, v299, v157, v574); + svfloat32_t v302 = svmla_f32_x(pred_full, v301, v157, v574); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v317, v305); + svfloat32_t v320 = 
svsub_f32_x(svptrue_b32(), v319, v306); + svfloat32_t v328 = svsub_f32_x(svptrue_b32(), v327, v306); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v292, v298); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v294, v300); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v300, v294); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v296, v302); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v298, v292); + svfloat32_t v316 = svsub_f32_x(svptrue_b32(), v302, v296); + svfloat32_t v329 = svsub_f32_x(svptrue_b32(), v311, v318); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v312, v320); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v313, v322); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v314, v324); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v315, v326); + svfloat32_t v334 = svsub_f32_x(svptrue_b32(), v316, v328); + svfloat32_t v335 = svadd_f32_x(svptrue_b32(), v316, v328); + svfloat32_t v336 = svsub_f32_x(svptrue_b32(), v315, v326); + svfloat32_t v337 = svadd_f32_x(svptrue_b32(), v314, v324); + svfloat32_t v338 = svadd_f32_x(svptrue_b32(), v313, v322); + svfloat32_t v339 = svsub_f32_x(svptrue_b32(), v312, v320); + svfloat32_t v340 = svadd_f32_x(svptrue_b32(), v311, v318); + svint16_t v351 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v329, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v359 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v330, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v367 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v331, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v375 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, 
v332, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v383 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v333, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v391 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v334, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v399 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v335, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v407 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v336, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v415 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v337, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v423 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v338, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v431 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v339, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v439 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v340, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned 
*)(v600), svreinterpret_u64_s16(v351)); + svst1w_u64(pred_full, (unsigned *)(v609), svreinterpret_u64_s16(v359)); + svst1w_u64(pred_full, (unsigned *)(v618), svreinterpret_u64_s16(v367)); + svst1w_u64(pred_full, (unsigned *)(v627), svreinterpret_u64_s16(v375)); + svst1w_u64(pred_full, (unsigned *)(v636), svreinterpret_u64_s16(v383)); + svst1w_u64(pred_full, (unsigned *)(v645), svreinterpret_u64_s16(v391)); + svst1w_u64(pred_full, (unsigned *)(v654), svreinterpret_u64_s16(v399)); + svst1w_u64(pred_full, (unsigned *)(v663), svreinterpret_u64_s16(v407)); + svst1w_u64(pred_full, (unsigned *)(v672), svreinterpret_u64_s16(v415)); + svst1w_u64(pred_full, (unsigned *)(v681), svreinterpret_u64_s16(v423)); + svst1w_u64(pred_full, (unsigned *)(v690), svreinterpret_u64_s16(v431)); + svst1w_u64(pred_full, (unsigned *)(v699), svreinterpret_u64_s16(v439)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +/* Non-SVE variant (compiled when ARMRAL_ARCH_SVE is not defined). Reads 14 complex int16 inputs from x at element offsets 0, istride, ..., 13 * istride, converts each from fixed point with 15 fractional bits to float (vcvt_n_f32_s32), applies a fixed straight-line sequence of adds, subtracts and constant multiplies, and writes 14 results to y at element offsets 0, ostride, ..., 13 * ostride after converting back with 15 fractional bits (vcvt_n_s32_f32) and a saturating narrow (vqmovn_s32). Presumably one 14-point FFT butterfly, judging by the function name; confirm against the kernel generator. dir is multiplied only into the constants that are declared as a value and its negation. howmany is not referenced in this body. */ void armral_fft_cs16_cf32_cs16_ac_n_uun14( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + /* x and y are viewed as arrays of 32-bit words so that one index step moves over one complex element (assumes armral_cmplx_int16_t is a packed pair of int16 values; TODO confirm against its declaration). */ const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + /* Kernel constants. A constant declared alone is later duplicated into both vector lanes; a constant declared next to its negation is later packed as the two lanes of one vector and multiplied by dir. */ float v213 = -1.1666666666666665e+00F; + float v217 = 7.9015646852540022e-01F; + float v221 = 5.5854267289647742e-02F; + float v225 = 7.3430220123575241e-01F; + float v228 = 4.4095855184409838e-01F; + float v229 = -4.4095855184409838e-01F; + float v235 = 3.4087293062393137e-01F; + float v236 = -3.4087293062393137e-01F; + float v242 = -5.3396936033772524e-01F; + float v243 = 5.3396936033772524e-01F; + float v249 = 8.7484229096165667e-01F; + float v250 = -8.7484229096165667e-01F; + /* vld1s_s16 is a helper defined outside this chunk; NOTE(review): it appears to load one complex element into an int16x4_t, confirm against its definition. */ int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v75 = vld1s_s16(&v5[istride]); + float32x2_t v214 = (float32x2_t){v213, v213}; + float32x2_t v218 = (float32x2_t){v217, v217}; + float32x2_t v222 = (float32x2_t){v221, v221}; + float32x2_t v226 = (float32x2_t){v225, v225}; + float32x2_t v230 = (float32x2_t){v228, v229}; + float32x2_t v237 =
(float32x2_t){v235, v236}; + float32x2_t v244 = (float32x2_t){v242, v243}; + float32x2_t v251 = (float32x2_t){v249, v250}; + float32x2_t v252 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 7]); + int16x4_t v27 = vld1s_s16(&v5[istride * 2]); + int16x4_t v33 = vld1s_s16(&v5[istride * 9]); + int16x4_t v41 = vld1s_s16(&v5[istride * 4]); + int16x4_t v47 = vld1s_s16(&v5[istride * 11]); + int16x4_t v55 = vld1s_s16(&v5[istride * 6]); + int16x4_t v61 = vld1s_s16(&v5[istride * 13]); + int16x4_t v69 = vld1s_s16(&v5[istride * 8]); + float32x2_t v76 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v75)), 15); + int16x4_t v83 = vld1s_s16(&v5[istride * 10]); + int16x4_t v89 = vld1s_s16(&v5[istride * 3]); + int16x4_t v97 = vld1s_s16(&v5[istride * 12]); + int16x4_t v103 = vld1s_s16(&v5[istride * 5]); + float32x2_t v232 = vmul_f32(v252, v230); + float32x2_t v239 = vmul_f32(v252, v237); + float32x2_t v246 = vmul_f32(v252, v244); + float32x2_t v253 = vmul_f32(v252, v251); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = 
vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v77 = vadd_f32(v70, v76); + float32x2_t v78 = vsub_f32(v70, v76); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v107 = vadd_f32(v35, v105); + float32x2_t v108 = vsub_f32(v35, v105); + float32x2_t v109 = vadd_f32(v77, v63); + float32x2_t v110 = vsub_f32(v77, v63); + float32x2_t v111 = vadd_f32(v49, v91); + float32x2_t v112 = vsub_f32(v49, v91); + float32x2_t v191 = vadd_f32(v36, v106); + float32x2_t v192 = vsub_f32(v36, v106); + float32x2_t v193 = vadd_f32(v78, v64); + float32x2_t v194 = vsub_f32(v78, v64); + float32x2_t v195 = vadd_f32(v50, v92); + float32x2_t v196 = vsub_f32(v50, v92); + float32x2_t v113 = vadd_f32(v107, v109); + float32x2_t v116 = vsub_f32(v107, v109); + float32x2_t v117 = vsub_f32(v109, v111); + float32x2_t v118 = vsub_f32(v111, v107); + float32x2_t v119 = vadd_f32(v108, v110); + float32x2_t v121 = vsub_f32(v108, v110); + float32x2_t v122 = vsub_f32(v110, v112); + float32x2_t v123 = vsub_f32(v112, v108); + float32x2_t v197 = vadd_f32(v191, v193); + float32x2_t v200 = vsub_f32(v191, v193); + float32x2_t v201 = vsub_f32(v193, v195); + float32x2_t v202 = vsub_f32(v195, v191); + float32x2_t v203 = vadd_f32(v192, v194); + float32x2_t v205 = vsub_f32(v192, v194); + float32x2_t v206 = vsub_f32(v194, v196); + float32x2_t v207 = vsub_f32(v196, v192); + float32x2_t v114 = vadd_f32(v113, v111); + float32x2_t v120 = vadd_f32(v119, v112); + float32x2_t v135 = vmul_f32(v116, v218); + float32x2_t v139 = vmul_f32(v117, v222); + float32x2_t v143 = vmul_f32(v118, v226); + float32x2_t v156 = vrev64_f32(v121); + float32x2_t v163 = vrev64_f32(v122); + float32x2_t v170 = vrev64_f32(v123); + float32x2_t v198 
= vadd_f32(v197, v195); + float32x2_t v204 = vadd_f32(v203, v196); + float32x2_t v219 = vmul_f32(v200, v218); + float32x2_t v223 = vmul_f32(v201, v222); + float32x2_t v227 = vmul_f32(v202, v226); + float32x2_t v240 = vrev64_f32(v205); + float32x2_t v247 = vrev64_f32(v206); + float32x2_t v254 = vrev64_f32(v207); + float32x2_t v115 = vadd_f32(v114, v21); + float32x2_t v131 = vmul_f32(v114, v214); + float32x2_t v149 = vrev64_f32(v120); + float32x2_t v157 = vmul_f32(v156, v239); + float32x2_t v164 = vmul_f32(v163, v246); + float32x2_t v171 = vmul_f32(v170, v253); + float32x2_t v199 = vadd_f32(v198, v22); + float32x2_t v215 = vmul_f32(v198, v214); + float32x2_t v233 = vrev64_f32(v204); + float32x2_t v241 = vmul_f32(v240, v239); + float32x2_t v248 = vmul_f32(v247, v246); + float32x2_t v255 = vmul_f32(v254, v253); + float32x2_t v150 = vmul_f32(v149, v232); + float32x2_t v172 = vadd_f32(v115, v131); + float32x2_t v234 = vmul_f32(v233, v232); + float32x2_t v256 = vadd_f32(v199, v215); + int16x4_t v277 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v115, 15), (int32x2_t){0, 0})); + int16x4_t v283 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v199, 15), (int32x2_t){0, 0})); + float32x2_t v173 = vadd_f32(v172, v135); + float32x2_t v175 = vsub_f32(v172, v135); + float32x2_t v177 = vsub_f32(v172, v139); + float32x2_t v179 = vadd_f32(v150, v157); + float32x2_t v181 = vsub_f32(v150, v157); + float32x2_t v183 = vsub_f32(v150, v164); + float32x2_t v257 = vadd_f32(v256, v219); + float32x2_t v259 = vsub_f32(v256, v219); + float32x2_t v261 = vsub_f32(v256, v223); + float32x2_t v263 = vadd_f32(v234, v241); + float32x2_t v265 = vsub_f32(v234, v241); + float32x2_t v267 = vsub_f32(v234, v248); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v277), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v283), 0); + float32x2_t v174 = vadd_f32(v173, v139); + float32x2_t v176 = vsub_f32(v175, v143); + float32x2_t v178 = vadd_f32(v177, v143); + float32x2_t v180 = vadd_f32(v179, v164); + float32x2_t 
v182 = vsub_f32(v181, v171); + float32x2_t v184 = vadd_f32(v183, v171); + float32x2_t v258 = vadd_f32(v257, v223); + float32x2_t v260 = vsub_f32(v259, v227); + float32x2_t v262 = vadd_f32(v261, v227); + float32x2_t v264 = vadd_f32(v263, v248); + float32x2_t v266 = vsub_f32(v265, v255); + float32x2_t v268 = vadd_f32(v267, v255); + float32x2_t v185 = vadd_f32(v174, v180); + float32x2_t v186 = vsub_f32(v174, v180); + float32x2_t v187 = vadd_f32(v176, v182); + float32x2_t v188 = vsub_f32(v176, v182); + float32x2_t v189 = vadd_f32(v178, v184); + float32x2_t v190 = vsub_f32(v178, v184); + float32x2_t v269 = vadd_f32(v258, v264); + float32x2_t v270 = vsub_f32(v258, v264); + float32x2_t v271 = vadd_f32(v260, v266); + float32x2_t v272 = vsub_f32(v260, v266); + float32x2_t v273 = vadd_f32(v262, v268); + float32x2_t v274 = vsub_f32(v262, v268); + int16x4_t v289 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v186, 15), (int32x2_t){0, 0})); + int16x4_t v295 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v270, 15), (int32x2_t){0, 0})); + int16x4_t v301 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v188, 15), (int32x2_t){0, 0})); + int16x4_t v307 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v272, 15), (int32x2_t){0, 0})); + int16x4_t v313 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v189, 15), (int32x2_t){0, 0})); + int16x4_t v319 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v273, 15), (int32x2_t){0, 0})); + int16x4_t v325 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v190, 15), (int32x2_t){0, 0})); + int16x4_t v331 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v274, 15), (int32x2_t){0, 0})); + int16x4_t v337 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v187, 15), (int32x2_t){0, 0})); + int16x4_t v343 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v271, 15), (int32x2_t){0, 0})); + int16x4_t v349 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v185, 15), (int32x2_t){0, 0})); + int16x4_t v355 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v269, 15), (int32x2_t){0, 0})); + v6[ostride * 8] = 
vget_lane_s32(vreinterpret_s32_s16(v289), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v295), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v301), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v307), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v313), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v319), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v325), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v331), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v337), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v343), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v349), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v355), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun14( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) {/* SVE variant (built only when ARMRAL_ARCH_SVE is defined) of the length-14 kernel named armral_fft_cs16_cf32_cs16_ac_n_uun14. It reads the 14 elements x[k * istride] and writes the 14 elements y[k * ostride] for k = 0..13, addressing both arrays as 32-bit words (assumes one armral_cmplx_int16_t is an int16 re/im pair packed in 32 bits - TODO confirm against the type definition). Every input is loaded as int16 values sign-extended into 32-bit lanes, converted to float and scaled by 2^-15; all butterfly arithmetic is done in float. Every result is multiplied by 2^31, converted to int32, and int16 lanes 1 and 3 of the converted vector (the upper halves of the two int32 results on a little-endian target) are picked by a table lookup before being written with svst1w_u64. dir only multiplies the coefficients fed to the 90-degree svcmla rotations - presumably +1 or -1 to select the transform direction, verify with the callers. howmany is not referenced anywhere in this body. */ + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x;/* x and y are indexed in units of 32-bit words from here on */ + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2);/* SV_VL2 pattern: only the first two 32-bit lanes are active wherever pred_full is used */ + float v252 = -1.1666666666666665e+00F;/* v252, v257, v262, v267: coefficients broadcast unchanged and consumed by svmla/svmls further down */ + float v257 = 7.9015646852540022e-01F; + float v262 = 5.5854267289647742e-02F; + float v267 = 7.3430220123575241e-01F; + float v272 = -4.4095855184409838e-01F;/* v272, v279, v286, v293: coefficients that are multiplied by dir (v275..v296) and then applied through svcmla with rotation 90 */ + float v279 = -3.4087293062393137e-01F; + float v286 = 5.3396936033772524e-01F; + float v293 = -8.7484229096165667e-01F; + const int32_t *v518 = &v5[v0]; + int32_t *v609 = &v6[v2]; + int64_t v23 = v0 * 7;/* word offsets of the remaining inputs: multiples of istride */ + int64_t v33 = v0 * 2; + int64_t v41 = v0 * 9; + int64_t v51 = v0 * 4; + int64_t v59 = v0 * 11; + int64_t v69 = v0 * 6; + int64_t v77 = v0 * 13; + int64_t v87 = v0 * 8; + int64_t v105 = v0 * 10; + int64_t v113 = v0 * 3; + int64_t v123 = v0 * 12; + int64_t v131 = v0 * 5; + float v275 = v4 * v272;/* dir-scaled copies of v272..v293 */ + float v282 = v4 * v279; + float v289 = v4 * v286; + float v296 = v4 * v293; + int64_t v327 = v2 * 7;/* word offsets of the outputs: multiples of ostride (list continues below) */ + int64_t v335 = v2 * 
8; + int64_t v351 = v2 * 2; + int64_t v359 = v2 * 9; + int64_t v367 = v2 * 10; + int64_t v375 = v2 * 3; + int64_t v383 = v2 * 4; + int64_t v391 = v2 * 11; + int64_t v399 = v2 * 12; + int64_t v407 = v2 * 5; + int64_t v415 = v2 * 6; + int64_t v423 = v2 * 13; + const int32_t *v437 = &v5[0]; + svfloat32_t v567 = svdup_n_f32(v252); + svfloat32_t v568 = svdup_n_f32(v257); + svfloat32_t v569 = svdup_n_f32(v262); + svfloat32_t v570 = svdup_n_f32(v267); + int32_t *v582 = &v6[0]; + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v518[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v446 = &v5[v23]; + const int32_t *v455 = &v5[v33]; + const int32_t *v464 = &v5[v41]; + const int32_t *v473 = &v5[v51]; + const int32_t *v482 = &v5[v59]; + const int32_t *v491 = &v5[v69]; + const int32_t *v500 = &v5[v77]; + const int32_t *v509 = &v5[v87]; + const int32_t *v527 = &v5[v105]; + const int32_t *v536 = &v5[v113]; + const int32_t *v545 = &v5[v123]; + const int32_t *v554 = &v5[v131]; + svfloat32_t v571 = svdup_n_f32(v275); + svfloat32_t v572 = svdup_n_f32(v282); + svfloat32_t v573 = svdup_n_f32(v289); + svfloat32_t v574 = svdup_n_f32(v296); + int32_t *v591 = &v6[v327]; + int32_t *v600 = &v6[v335]; + int32_t *v618 = &v6[v351]; + int32_t *v627 = &v6[v359]; + int32_t *v636 = &v6[v367]; + int32_t *v645 = &v6[v375]; + int32_t *v654 = &v6[v383]; + int32_t *v663 = &v6[v391]; + int32_t *v672 = &v6[v399]; + int32_t *v681 = &v6[v407]; + int32_t *v690 = &v6[v415]; + int32_t *v699 = &v6[v423]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v437[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v446[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v455[0])), + 
1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v464[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v473[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v482[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v491[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v500[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v509[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v111 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v527[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v536[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v545[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v554[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, 
v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v140 = svadd_f32_x(svptrue_b32(), v48, v138); + svfloat32_t v141 = svsub_f32_x(svptrue_b32(), v48, v138); + svfloat32_t v142 = svadd_f32_x(svptrue_b32(), v102, v84); + svfloat32_t v143 = svsub_f32_x(svptrue_b32(), v102, v84); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v66, v120); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v66, v120); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v49, v139); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v49, v139); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v103, v85); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v103, v85); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v67, v121); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v67, v121); + svfloat32_t v146 = svadd_f32_x(svptrue_b32(), v140, v142); + svfloat32_t v149 = svsub_f32_x(svptrue_b32(), v140, v142); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v142, v144); + svfloat32_t v151 = svsub_f32_x(svptrue_b32(), v144, v140); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v141, v143); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v141, v143); + svfloat32_t v155 = svsub_f32_x(svptrue_b32(), v143, v145); + svfloat32_t v156 = svsub_f32_x(svptrue_b32(), v145, v141); + svfloat32_t v235 = svadd_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v229, v231); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v231, v233); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v233, v229); + svfloat32_t v241 = svadd_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v230, 
v232); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v232, v234); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v234, v230); + svfloat32_t v147 = svadd_f32_x(svptrue_b32(), v146, v144); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v152, v145); + svfloat32_t zero195 = svdup_n_f32(0); + svfloat32_t v195 = svcmla_f32_x(pred_full, zero195, v572, v154, 90); + svfloat32_t zero202 = svdup_n_f32(0); + svfloat32_t v202 = svcmla_f32_x(pred_full, zero202, v573, v155, 90); + svfloat32_t zero209 = svdup_n_f32(0); + svfloat32_t v209 = svcmla_f32_x(pred_full, zero209, v574, v156, 90); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v235, v233); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v241, v234); + svfloat32_t zero284 = svdup_n_f32(0); + svfloat32_t v284 = svcmla_f32_x(pred_full, zero284, v572, v243, 90); + svfloat32_t zero291 = svdup_n_f32(0); + svfloat32_t v291 = svcmla_f32_x(pred_full, zero291, v573, v244, 90); + svfloat32_t zero298 = svdup_n_f32(0); + svfloat32_t v298 = svcmla_f32_x(pred_full, zero298, v574, v245, 90); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v147, v30); + svfloat32_t zero188 = svdup_n_f32(0); + svfloat32_t v188 = svcmla_f32_x(pred_full, zero188, v571, v153, 90); + svfloat32_t v237 = svadd_f32_x(svptrue_b32(), v236, v31); + svfloat32_t zero277 = svdup_n_f32(0); + svfloat32_t v277 = svcmla_f32_x(pred_full, zero277, v571, v242, 90); + svfloat32_t v210 = svmla_f32_x(pred_full, v148, v147, v567); + svfloat32_t v217 = svadd_f32_x(svptrue_b32(), v188, v195); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v188, v195); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v188, v202); + svfloat32_t v299 = svmla_f32_x(pred_full, v237, v236, v567); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v277, v284); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v277, v284); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v277, v291); + svint16_t v320 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v148, 
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v328 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v237, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v211 = svmla_f32_x(pred_full, v210, v149, v568); + svfloat32_t v213 = svmls_f32_x(pred_full, v210, v149, v568); + svfloat32_t v215 = svmls_f32_x(pred_full, v210, v150, v569); + svfloat32_t v218 = svadd_f32_x(svptrue_b32(), v217, v202); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v219, v209); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v221, v209); + svfloat32_t v300 = svmla_f32_x(pred_full, v299, v238, v568); + svfloat32_t v302 = svmls_f32_x(pred_full, v299, v238, v568); + svfloat32_t v304 = svmls_f32_x(pred_full, v299, v239, v569); + svfloat32_t v307 = svadd_f32_x(svptrue_b32(), v306, v291); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v308, v298); + svfloat32_t v311 = svadd_f32_x(svptrue_b32(), v310, v298); + svst1w_u64(pred_full, (unsigned *)(v582), svreinterpret_u64_s16(v320)); + svst1w_u64(pred_full, (unsigned *)(v591), svreinterpret_u64_s16(v328)); + svfloat32_t v212 = svmla_f32_x(pred_full, v211, v150, v569); + svfloat32_t v214 = svmls_f32_x(pred_full, v213, v151, v570); + svfloat32_t v216 = svmla_f32_x(pred_full, v215, v151, v570); + svfloat32_t v301 = svmla_f32_x(pred_full, v300, v239, v569); + svfloat32_t v303 = svmls_f32_x(pred_full, v302, v240, v570); + svfloat32_t v305 = svmla_f32_x(pred_full, v304, v240, v570); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v212, v218); + svfloat32_t v224 = svsub_f32_x(svptrue_b32(), v212, v218); + svfloat32_t v225 = svadd_f32_x(svptrue_b32(), v214, v220); + svfloat32_t v226 = svsub_f32_x(svptrue_b32(), v214, v220); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v228 = svsub_f32_x(svptrue_b32(), v216, v222); + svfloat32_t v312 
= svadd_f32_x(svptrue_b32(), v301, v307); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v301, v307); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v303, v309); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v305, v311); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v305, v311); + svint16_t v336 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v224, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v344 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v313, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v352 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v226, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v360 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v315, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v368 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v227, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v376 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v316, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v384 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v228, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v392 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v317, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v400 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v225, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v408 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v314, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v416 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v223, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v424 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v312, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v600), svreinterpret_u64_s16(v336)); + svst1w_u64(pred_full, (unsigned *)(v609), svreinterpret_u64_s16(v344)); + svst1w_u64(pred_full, (unsigned *)(v618), svreinterpret_u64_s16(v352)); + svst1w_u64(pred_full, (unsigned *)(v627), svreinterpret_u64_s16(v360)); + svst1w_u64(pred_full, (unsigned *)(v636), svreinterpret_u64_s16(v368)); + svst1w_u64(pred_full, (unsigned *)(v645), svreinterpret_u64_s16(v376)); + svst1w_u64(pred_full, (unsigned *)(v654), svreinterpret_u64_s16(v384)); + svst1w_u64(pred_full, (unsigned *)(v663), svreinterpret_u64_s16(v392)); + svst1w_u64(pred_full, (unsigned *)(v672), svreinterpret_u64_s16(v400)); + svst1w_u64(pred_full, (unsigned *)(v681), svreinterpret_u64_s16(v408)); + svst1w_u64(pred_full, (unsigned *)(v690), svreinterpret_u64_s16(v416)); + svst1w_u64(pred_full, (unsigned *)(v699), 
svreinterpret_u64_s16(v424)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun15( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) {/* Neon variant (built only when ARMRAL_ARCH_SVE is not defined) of the length-15 kernel named armral_fft_cs16_cf32_cs16_ac_n_uun15. It reads the 15 elements x[k * istride] and writes the 15 elements y[k * ostride] for k = 0..14, addressing both arrays as 32-bit words (assumes one armral_cmplx_int16_t is an int16 re/im pair packed in 32 bits - TODO confirm against the type definition). Every input is fetched with vld1s_s16 (a helper defined outside this view), widened to int32 and converted to float as a fixed-point value with 15 fractional bits; all butterfly arithmetic is done on float32x2_t values. Every result is converted back with vcvt_n_s32_f32(.., 15), narrowed with the saturating vqmovn_s32, and lane 0 of the narrowed vector viewed as int32 is stored. dir is broadcast to both lanes and multiplied into the {c, -c} coefficient pairs, which are then applied to operands whose lanes were swapped by vrev64_f32 - presumably dir is +1 or -1 to select the transform direction, verify with the callers. howmany is not referenced anywhere in this body. */ + float v4 = dir; + const int32_t *v5 = (const int32_t *)x;/* x and y are indexed in units of 32-bit words from here on */ + int32_t *v6 = (int32_t *)y; + float v127 = -1.2500000000000000e+00F;/* coefficient table: single values (v127, v131, v173, v177, v181, v244, v248, v252) are broadcast to both lanes; values listed as a +c/-c couple are packed into one two-lane vector and scaled by dir later */ + float v131 = 5.5901699437494745e-01F; + float v134 = 1.5388417685876268e+00F; + float v135 = -1.5388417685876268e+00F; + float v141 = 5.8778525229247325e-01F; + float v142 = -5.8778525229247325e-01F; + float v148 = 3.6327126400268028e-01F; + float v149 = -3.6327126400268028e-01F; + float v173 = -1.4999999999999998e+00F; + float v177 = 1.8749999999999998e+00F; + float v181 = -8.3852549156242107e-01F; + float v184 = -2.3082626528814396e+00F; + float v185 = 2.3082626528814396e+00F; + float v191 = -8.8167787843870971e-01F; + float v192 = 8.8167787843870971e-01F; + float v198 = -5.4490689600402031e-01F; + float v199 = 5.4490689600402031e-01F; + float v222 = 8.6602540378443871e-01F; + float v223 = -8.6602540378443871e-01F; + float v229 = -1.0825317547305484e+00F; + float v230 = 1.0825317547305484e+00F; + float v236 = 4.8412291827592718e-01F; + float v237 = -4.8412291827592718e-01F; + float v244 = -1.3326760640014592e+00F; + float v248 = -5.0903696045512736e-01F; + float v252 = -3.1460214309120460e-01F; + int16x4_t v27 = vld1s_s16(&v5[0]);/* loads of x[0] and x[istride]; the other 13 loads follow further down */ + int16x4_t v61 = vld1s_s16(&v5[istride]); + float32x2_t v128 = (float32x2_t){v127, v127};/* two-lane coefficient vectors: {c, c} for the broadcast values, {c, -c} for the sign-paired values */ + float32x2_t v132 = (float32x2_t){v131, v131}; + float32x2_t v136 = (float32x2_t){v134, v135}; + float32x2_t v143 = (float32x2_t){v141, v142}; + float32x2_t v150 = (float32x2_t){v148, v149}; + float32x2_t v174 = (float32x2_t){v173, v173}; + float32x2_t v178 = (float32x2_t){v177, v177}; + float32x2_t v182 = (float32x2_t){v181, v181}; + float32x2_t v186 = (float32x2_t){v184, v185}; + float32x2_t v193 = (float32x2_t){v191, v192}; + float32x2_t v200 = (float32x2_t){v198, v199}; + 
float32x2_t v224 = (float32x2_t){v222, v223}; + float32x2_t v231 = (float32x2_t){v229, v230}; + float32x2_t v238 = (float32x2_t){v236, v237}; + float32x2_t v239 = (float32x2_t){v4, v4}; + float32x2_t v245 = (float32x2_t){v244, v244}; + float32x2_t v249 = (float32x2_t){v248, v248}; + float32x2_t v253 = (float32x2_t){v252, v252}; + int16x4_t v13 = vld1s_s16(&v5[istride * 5]); + int16x4_t v19 = vld1s_s16(&v5[istride * 10]); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + int16x4_t v34 = vld1s_s16(&v5[istride * 8]); + int16x4_t v40 = vld1s_s16(&v5[istride * 13]); + int16x4_t v48 = vld1s_s16(&v5[istride * 3]); + int16x4_t v55 = vld1s_s16(&v5[istride * 11]); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + int16x4_t v69 = vld1s_s16(&v5[istride * 6]); + int16x4_t v76 = vld1s_s16(&v5[istride * 14]); + int16x4_t v82 = vld1s_s16(&v5[istride * 4]); + int16x4_t v90 = vld1s_s16(&v5[istride * 9]); + int16x4_t v97 = vld1s_s16(&v5[istride * 2]); + int16x4_t v103 = vld1s_s16(&v5[istride * 7]); + int16x4_t v111 = vld1s_s16(&v5[istride * 12]); + float32x2_t v138 = vmul_f32(v239, v136); + float32x2_t v145 = vmul_f32(v239, v143); + float32x2_t v152 = vmul_f32(v239, v150); + float32x2_t v188 = vmul_f32(v239, v186); + float32x2_t v195 = vmul_f32(v239, v193); + float32x2_t v202 = vmul_f32(v239, v200); + float32x2_t v226 = vmul_f32(v239, v224); + float32x2_t v233 = vmul_f32(v239, v231); + float32x2_t v240 = vmul_f32(v239, v238); + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v77 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v29 = vadd_f32(v21, v28); + float32x2_t v50 = vadd_f32(v42, v49); + float32x2_t v71 = vadd_f32(v63, v70); + float32x2_t v92 = vadd_f32(v84, v91); + float32x2_t v113 = vadd_f32(v105, v112); + float32x2_t v164 = vadd_f32(v42, v105); + float32x2_t v165 = vsub_f32(v42, v105); + float32x2_t v166 = vadd_f32(v84, v63); + float32x2_t v167 = vsub_f32(v84, v63); + float32x2_t v214 = vadd_f32(v43, v106); + float32x2_t v215 = vsub_f32(v43, v106); + float32x2_t v216 = vadd_f32(v85, v64); + float32x2_t v217 = vsub_f32(v85, v64); + float32x2_t v114 = vadd_f32(v50, v113); + float32x2_t v115 = vsub_f32(v50, v113); + float32x2_t v116 = vadd_f32(v92, v71); + float32x2_t v117 = vsub_f32(v92, v71); + float32x2_t v168 = vadd_f32(v164, v166); + float32x2_t v169 = vsub_f32(v164, v166); + float32x2_t v170 = vadd_f32(v165, v167); + float32x2_t v189 = vrev64_f32(v165); + float32x2_t v203 = vrev64_f32(v167); + float32x2_t v218 = vadd_f32(v214, v216); + float32x2_t v219 = vsub_f32(v214, v216); + float32x2_t v220 = vadd_f32(v215, v217); + float32x2_t v246 = vmul_f32(v215, v245); + float32x2_t v254 = vmul_f32(v217, v253); + float32x2_t v118 = vadd_f32(v114, v116); + float32x2_t v119 = 
vsub_f32(v114, v116); + float32x2_t v120 = vadd_f32(v115, v117); + float32x2_t v139 = vrev64_f32(v115); + float32x2_t v153 = vrev64_f32(v117); + float32x2_t v171 = vadd_f32(v168, v21); + float32x2_t v179 = vmul_f32(v168, v178); + float32x2_t v183 = vmul_f32(v169, v182); + float32x2_t v190 = vmul_f32(v189, v188); + float32x2_t v196 = vrev64_f32(v170); + float32x2_t v204 = vmul_f32(v203, v202); + float32x2_t v221 = vadd_f32(v218, v22); + float32x2_t v234 = vrev64_f32(v218); + float32x2_t v241 = vrev64_f32(v219); + float32x2_t v250 = vmul_f32(v220, v249); + float32x2_t v121 = vadd_f32(v118, v29); + float32x2_t v129 = vmul_f32(v118, v128); + float32x2_t v133 = vmul_f32(v119, v132); + float32x2_t v140 = vmul_f32(v139, v138); + float32x2_t v146 = vrev64_f32(v120); + float32x2_t v154 = vmul_f32(v153, v152); + float32x2_t v175 = vmul_f32(v171, v174); + float32x2_t v197 = vmul_f32(v196, v195); + float32x2_t v227 = vrev64_f32(v221); + float32x2_t v235 = vmul_f32(v234, v233); + float32x2_t v242 = vmul_f32(v241, v240); + float32x2_t v258 = vsub_f32(v246, v250); + float32x2_t v259 = vadd_f32(v250, v254); + float32x2_t v147 = vmul_f32(v146, v145); + float32x2_t v155 = vadd_f32(v121, v129); + float32x2_t v205 = vadd_f32(v175, v179); + float32x2_t v208 = vsub_f32(v190, v197); + float32x2_t v209 = vadd_f32(v197, v204); + float32x2_t v228 = vmul_f32(v227, v226); + float32x2_t v264 = vadd_f32(v121, v175); + int16x4_t v269 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v121, 15), (int32x2_t){0, 0})); + float32x2_t v156 = vadd_f32(v155, v133); + float32x2_t v157 = vsub_f32(v155, v133); + float32x2_t v158 = vsub_f32(v140, v147); + float32x2_t v159 = vadd_f32(v147, v154); + float32x2_t v206 = vadd_f32(v205, v183); + float32x2_t v207 = vsub_f32(v205, v183); + float32x2_t v255 = vadd_f32(v228, v235); + float32x2_t v265 = vadd_f32(v264, v228); + float32x2_t v266 = vsub_f32(v264, v228); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v269), 0); + float32x2_t v160 = vadd_f32(v156, v158); + 
float32x2_t v161 = vsub_f32(v156, v158); + float32x2_t v162 = vadd_f32(v157, v159); + float32x2_t v163 = vsub_f32(v157, v159); + float32x2_t v210 = vadd_f32(v206, v208); + float32x2_t v211 = vsub_f32(v206, v208); + float32x2_t v212 = vadd_f32(v207, v209); + float32x2_t v213 = vsub_f32(v207, v209); + float32x2_t v256 = vadd_f32(v255, v242); + float32x2_t v257 = vsub_f32(v255, v242); + int16x4_t v275 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v266, 15), (int32x2_t){0, 0})); + int16x4_t v281 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v265, 15), (int32x2_t){0, 0})); + float32x2_t v260 = vadd_f32(v256, v258); + float32x2_t v261 = vsub_f32(v256, v258); + float32x2_t v262 = vadd_f32(v257, v259); + float32x2_t v263 = vsub_f32(v257, v259); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v275), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v281), 0); + float32x2_t v285 = vadd_f32(v161, v211); + int16x4_t v290 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v161, 15), (int32x2_t){0, 0})); + float32x2_t v306 = vadd_f32(v163, v213); + int16x4_t v311 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v163, 15), (int32x2_t){0, 0})); + float32x2_t v327 = vadd_f32(v162, v212); + int16x4_t v332 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v162, 15), (int32x2_t){0, 0})); + float32x2_t v348 = vadd_f32(v160, v210); + int16x4_t v353 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v160, 15), (int32x2_t){0, 0})); + float32x2_t v286 = vadd_f32(v285, v261); + float32x2_t v287 = vsub_f32(v285, v261); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v290), 0); + float32x2_t v307 = vadd_f32(v306, v263); + float32x2_t v308 = vsub_f32(v306, v263); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v311), 0); + float32x2_t v328 = vadd_f32(v327, v262); + float32x2_t v329 = vsub_f32(v327, v262); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v332), 0); + float32x2_t v349 = vadd_f32(v348, v260); + float32x2_t v350 = vsub_f32(v348, v260); + v6[ostride * 9] = 
vget_lane_s32(vreinterpret_s32_s16(v353), 0); + int16x4_t v296 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v287, 15), (int32x2_t){0, 0})); + int16x4_t v302 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v286, 15), (int32x2_t){0, 0})); + int16x4_t v317 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v308, 15), (int32x2_t){0, 0})); + int16x4_t v323 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v307, 15), (int32x2_t){0, 0})); + int16x4_t v338 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v329, 15), (int32x2_t){0, 0})); + int16x4_t v344 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v328, 15), (int32x2_t){0, 0})); + int16x4_t v359 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v350, 15), (int32x2_t){0, 0})); + int16x4_t v365 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v349, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v296), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v302), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v317), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v323), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v338), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v344), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v359), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v365), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun15( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v163 = -1.2500000000000000e+00F; + float v168 = 5.5901699437494745e-01F; + float v173 = -1.5388417685876268e+00F; + float v180 = -5.8778525229247325e-01F; + float v187 = -3.6327126400268028e-01F; + float v211 = -1.4999999999999998e+00F; + float v216 = 1.8749999999999998e+00F; + float 
v221 = -8.3852549156242107e-01F; + float v226 = 2.3082626528814396e+00F; + float v233 = 8.8167787843870971e-01F; + float v240 = 5.4490689600402031e-01F; + float v264 = -8.6602540378443871e-01F; + float v271 = 1.0825317547305484e+00F; + float v278 = -4.8412291827592718e-01F; + float v285 = -1.3326760640014592e+00F; + float v290 = -5.0903696045512736e-01F; + float v295 = -3.1460214309120460e-01F; + const int32_t *v513 = &v5[v0]; + int32_t *v640 = &v6[v2]; + int64_t v15 = v0 * 5; + int64_t v23 = v0 * 10; + int64_t v42 = v0 * 8; + int64_t v50 = v0 * 13; + int64_t v60 = v0 * 3; + int64_t v69 = v0 * 11; + int64_t v87 = v0 * 6; + int64_t v96 = v0 * 14; + int64_t v104 = v0 * 4; + int64_t v114 = v0 * 9; + int64_t v123 = v0 * 2; + int64_t v131 = v0 * 7; + int64_t v141 = v0 * 12; + float v176 = v4 * v173; + float v183 = v4 * v180; + float v190 = v4 * v187; + float v229 = v4 * v226; + float v236 = v4 * v233; + float v243 = v4 * v240; + float v267 = v4 * v264; + float v274 = v4 * v271; + float v281 = v4 * v278; + int64_t v320 = v2 * 10; + int64_t v328 = v2 * 5; + int64_t v339 = v2 * 6; + int64_t v355 = v2 * 11; + int64_t v366 = v2 * 12; + int64_t v374 = v2 * 7; + int64_t v382 = v2 * 2; + int64_t v393 = v2 * 3; + int64_t v401 = v2 * 13; + int64_t v409 = v2 * 8; + int64_t v420 = v2 * 9; + int64_t v428 = v2 * 4; + int64_t v436 = v2 * 14; + const int32_t *v468 = &v5[0]; + svfloat32_t v580 = svdup_n_f32(v163); + svfloat32_t v581 = svdup_n_f32(v168); + svfloat32_t v585 = svdup_n_f32(v211); + svfloat32_t v586 = svdup_n_f32(v216); + svfloat32_t v587 = svdup_n_f32(v221); + svfloat32_t v594 = svdup_n_f32(v285); + svfloat32_t v595 = svdup_n_f32(v290); + svfloat32_t v596 = svdup_n_f32(v295); + int32_t *v604 = &v6[0]; + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v513[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v449 = &v5[v15]; + const int32_t *v458 = &v5[v23]; + const int32_t *v477 = &v5[v42]; + const int32_t 
*v486 = &v5[v50]; + const int32_t *v495 = &v5[v60]; + const int32_t *v504 = &v5[v69]; + const int32_t *v522 = &v5[v87]; + const int32_t *v531 = &v5[v96]; + const int32_t *v540 = &v5[v104]; + const int32_t *v549 = &v5[v114]; + const int32_t *v558 = &v5[v123]; + const int32_t *v567 = &v5[v131]; + const int32_t *v576 = &v5[v141]; + svfloat32_t v582 = svdup_n_f32(v176); + svfloat32_t v583 = svdup_n_f32(v183); + svfloat32_t v584 = svdup_n_f32(v190); + svfloat32_t v588 = svdup_n_f32(v229); + svfloat32_t v589 = svdup_n_f32(v236); + svfloat32_t v590 = svdup_n_f32(v243); + svfloat32_t v591 = svdup_n_f32(v267); + svfloat32_t v592 = svdup_n_f32(v274); + svfloat32_t v593 = svdup_n_f32(v281); + int32_t *v613 = &v6[v320]; + int32_t *v622 = &v6[v328]; + int32_t *v631 = &v6[v339]; + int32_t *v649 = &v6[v355]; + int32_t *v658 = &v6[v366]; + int32_t *v667 = &v6[v374]; + int32_t *v676 = &v6[v382]; + int32_t *v685 = &v6[v393]; + int32_t *v694 = &v6[v401]; + int32_t *v703 = &v6[v409]; + int32_t *v712 = &v6[v420]; + int32_t *v721 = &v6[v428]; + int32_t *v730 = &v6[v436]; + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v468[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v449[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v458[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v48 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v477[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v56 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v486[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v66 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v495[0])), + 1.F / (1ULL << 
15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v504[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v522[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v102 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v531[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v110 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v540[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v120 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v549[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v558[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v567[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v576[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v48, v56); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), v48, v56); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v40 = svadd_f32_x(svptrue_b32(), v30, v39); + svfloat32_t v67 = svadd_f32_x(svptrue_b32(), v57, v66); + svfloat32_t v94 = 
svadd_f32_x(svptrue_b32(), v84, v93); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v111, v120); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v138, v147); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v57, v138); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v57, v138); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v111, v84); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v111, v84); + svfloat32_t v255 = svadd_f32_x(svptrue_b32(), v58, v139); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v58, v139); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v112, v85); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v112, v85); + svfloat32_t v149 = svadd_f32_x(svptrue_b32(), v67, v148); + svfloat32_t v150 = svsub_f32_x(svptrue_b32(), v67, v148); + svfloat32_t v151 = svadd_f32_x(svptrue_b32(), v121, v94); + svfloat32_t v152 = svsub_f32_x(svptrue_b32(), v121, v94); + svfloat32_t v206 = svadd_f32_x(svptrue_b32(), v202, v204); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v202, v204); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v203, v205); + svfloat32_t zero231 = svdup_n_f32(0); + svfloat32_t v231 = svcmla_f32_x(pred_full, zero231, v588, v203, 90); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v255, v257); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v255, v257); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v256, v258); + svfloat32_t v298 = svmul_f32_x(svptrue_b32(), v258, v596); + svfloat32_t v153 = svadd_f32_x(svptrue_b32(), v149, v151); + svfloat32_t v154 = svsub_f32_x(svptrue_b32(), v149, v151); + svfloat32_t v155 = svadd_f32_x(svptrue_b32(), v150, v152); + svfloat32_t zero178 = svdup_n_f32(0); + svfloat32_t v178 = svcmla_f32_x(pred_full, zero178, v582, v150, 90); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v206, v30); + svfloat32_t v219 = svmul_f32_x(svptrue_b32(), v206, v586); + svfloat32_t zero238 = svdup_n_f32(0); + svfloat32_t v238 = svcmla_f32_x(pred_full, zero238, v589, v208, 90); + svfloat32_t v262 = svadd_f32_x(svptrue_b32(), v259, 
v31); + svfloat32_t zero283 = svdup_n_f32(0); + svfloat32_t v283 = svcmla_f32_x(pred_full, zero283, v593, v260, 90); + svfloat32_t v293 = svmul_f32_x(svptrue_b32(), v261, v595); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v153, v40); + svfloat32_t zero185 = svdup_n_f32(0); + svfloat32_t v185 = svcmla_f32_x(pred_full, zero185, v583, v155, 90); + svfloat32_t v249 = svsub_f32_x(svptrue_b32(), v231, v238); + svfloat32_t v250 = svcmla_f32_x(pred_full, v238, v590, v205, 90); + svfloat32_t zero269 = svdup_n_f32(0); + svfloat32_t v269 = svcmla_f32_x(pred_full, zero269, v591, v262, 90); + svfloat32_t v302 = svnmls_f32_x(pred_full, v293, v256, v594); + svfloat32_t v303 = svmla_f32_x(pred_full, v298, v261, v595); + svfloat32_t v193 = svmla_f32_x(pred_full, v156, v153, v580); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v178, v185); + svfloat32_t v197 = svcmla_f32_x(pred_full, v185, v584, v152, 90); + svfloat32_t v246 = svmla_f32_x(pred_full, v219, v209, v585); + svfloat32_t v299 = svcmla_f32_x(pred_full, v269, v592, v259, 90); + svfloat32_t v308 = svmla_f32_x(pred_full, v156, v209, v585); + svint16_t v313 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v156, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v194 = svmla_f32_x(pred_full, v193, v154, v581); + svfloat32_t v195 = svmls_f32_x(pred_full, v193, v154, v581); + svfloat32_t v247 = svmla_f32_x(pred_full, v246, v207, v587); + svfloat32_t v248 = svmls_f32_x(pred_full, v246, v207, v587); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v299, v283); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v299, v283); + svfloat32_t v309 = svadd_f32_x(svptrue_b32(), v308, v269); + svfloat32_t v310 = svsub_f32_x(svptrue_b32(), v308, v269); + svst1w_u64(pred_full, (unsigned *)(v604), svreinterpret_u64_s16(v313)); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v194, v196); + svfloat32_t v199 = 
svsub_f32_x(svptrue_b32(), v194, v196); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v195, v197); + svfloat32_t v251 = svadd_f32_x(svptrue_b32(), v247, v249); + svfloat32_t v252 = svsub_f32_x(svptrue_b32(), v247, v249); + svfloat32_t v253 = svadd_f32_x(svptrue_b32(), v248, v250); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v248, v250); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v300, v302); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v300, v302); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v301, v303); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v301, v303); + svint16_t v321 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v310, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v329 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v309, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v335 = svadd_f32_x(svptrue_b32(), v199, v252); + svint16_t v340 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v199, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v362 = svadd_f32_x(svptrue_b32(), v201, v254); + svint16_t v367 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v201, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v200, v253); + svint16_t v394 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v200, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t 
v416 = svadd_f32_x(svptrue_b32(), v198, v251); + svint16_t v421 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v198, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v613), svreinterpret_u64_s16(v321)); + svst1w_u64(pred_full, (unsigned *)(v622), svreinterpret_u64_s16(v329)); + svfloat32_t v336 = svadd_f32_x(svptrue_b32(), v335, v305); + svfloat32_t v337 = svsub_f32_x(svptrue_b32(), v335, v305); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v362, v307); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v362, v307); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v306); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v389, v306); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v416, v304); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v416, v304); + svst1w_u64(pred_full, (unsigned *)(v631), svreinterpret_u64_s16(v340)); + svst1w_u64(pred_full, (unsigned *)(v658), svreinterpret_u64_s16(v367)); + svst1w_u64(pred_full, (unsigned *)(v685), svreinterpret_u64_s16(v394)); + svst1w_u64(pred_full, (unsigned *)(v712), svreinterpret_u64_s16(v421)); + svint16_t v348 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v337, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v356 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v336, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v375 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v364, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v383 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, 
svmul_n_f32_x(pred_full, v363, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v402 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v391, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v410 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v390, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v429 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v418, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v437 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v417, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v640), svreinterpret_u64_s16(v348)); + svst1w_u64(pred_full, (unsigned *)(v649), svreinterpret_u64_s16(v356)); + svst1w_u64(pred_full, (unsigned *)(v667), svreinterpret_u64_s16(v375)); + svst1w_u64(pred_full, (unsigned *)(v676), svreinterpret_u64_s16(v383)); + svst1w_u64(pred_full, (unsigned *)(v694), svreinterpret_u64_s16(v402)); + svst1w_u64(pred_full, (unsigned *)(v703), svreinterpret_u64_s16(v410)); + svst1w_u64(pred_full, (unsigned *)(v721), svreinterpret_u64_s16(v429)); + svst1w_u64(pred_full, (unsigned *)(v730), svreinterpret_u64_s16(v437)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Non-SVE build of armral_fft_cs16_cf32_cs16_ac_n_uun16 (Neon intrinsics). Reads 16 complex int16 samples from x at element offsets k times istride for k = 0..15 (each sample addressed as one 32-bit word), converts them to float with 15 fractional bits, runs them through a fixed add/subtract/multiply network, converts the 16 results back to int16 with saturation and writes them to y at element offsets k times ostride. dir scales the constants that multiply lane-swapped (vrev64) values. howmany is not referenced in this body. NOTE(review): presumably one 16-point FFT with dir = 1 or -1 selecting the direction; confirm against the caller. */ +void armral_fft_cs16_cf32_cs16_ac_n_uun16( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; /* Scalar constants used below: 1, sqrt(1/2), cos(pi/8), sin(pi/8), cos(pi/8)-sin(pi/8) and cos(pi/8)+sin(pi/8), in the signed forms the network needs. */ + float v190 = 1.0000000000000000e+00F;
+ float v191 = -1.0000000000000000e+00F; + float v198 = -7.0710678118654746e-01F; + float v205 = 7.0710678118654757e-01F; + float v208 = 9.2387953251128674e-01F; + float v209 = -9.2387953251128674e-01F; + float v216 = 5.4119610014619690e-01F; + float v223 = -1.3065629648763766e+00F; + float v230 = 3.8268343236508984e-01F; + float v234 = 1.3065629648763766e+00F; + float v238 = -5.4119610014619690e-01F; /* Input loads start here. vld1s_s16 is declared outside this block; presumably it loads one complex int16 sample into the low lanes (TODO confirm). Only the low two lanes are used after widening. */ + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v69 = vld1s_s16(&v5[istride]); + float32x2_t v192 = (float32x2_t){v190, v191}; + float32x2_t v199 = (float32x2_t){v205, v198}; + float32x2_t v206 = (float32x2_t){v205, v205}; + float32x2_t v210 = (float32x2_t){v208, v209}; + float32x2_t v217 = (float32x2_t){v238, v216}; + float32x2_t v224 = (float32x2_t){v234, v223}; + float32x2_t v225 = (float32x2_t){v4, v4}; + float32x2_t v231 = (float32x2_t){v230, v230}; + float32x2_t v235 = (float32x2_t){v234, v234}; + float32x2_t v239 = (float32x2_t){v238, v238}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 8]); + int16x4_t v27 = vld1s_s16(&v5[istride * 4]); + int16x4_t v33 = vld1s_s16(&v5[istride * 12]); + int16x4_t v41 = vld1s_s16(&v5[istride * 2]); + int16x4_t v47 = vld1s_s16(&v5[istride * 10]); + int16x4_t v55 = vld1s_s16(&v5[istride * 6]); + int16x4_t v61 = vld1s_s16(&v5[istride * 14]); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + int16x4_t v75 = vld1s_s16(&v5[istride * 9]); + int16x4_t v83 = vld1s_s16(&v5[istride * 5]); + int16x4_t v89 = vld1s_s16(&v5[istride * 13]); + int16x4_t v97 = vld1s_s16(&v5[istride * 3]); + int16x4_t v103 = vld1s_s16(&v5[istride * 11]); + int16x4_t v111 = vld1s_s16(&v5[istride * 7]); + int16x4_t v117 = vld1s_s16(&v5[istride * 15]); /* Fold dir into the constant pairs that later multiply lane-swapped (vrev64) values. */ + float32x2_t v194 = vmul_f32(v225, v192); + float32x2_t v201 = vmul_f32(v225, v199); + float32x2_t v212 = vmul_f32(v225, v210); + float32x2_t v219 = vmul_f32(v225, v217); + float32x2_t v226 = vmul_f32(v225, v224); + float32x2_t
v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v76 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v75)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v118 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v117)), 15); /* Add/subtract network over the 16 converted inputs begins here. */ + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v77 = vadd_f32(v70, v76); + float32x2_t v78 = vsub_f32(v70, v76); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v119 = vadd_f32(v112, v118); + float32x2_t v120 = vsub_f32(v112, v118); + float32x2_t v121 = vadd_f32(v21, v35); + float32x2_t v122 = vsub_f32(v21, v35); + float32x2_t v123 = vadd_f32(v49, v63); + float32x2_t v124 = vsub_f32(v49, v63); + float32x2_t v125 = vadd_f32(v77, v91); + float32x2_t v126 = vsub_f32(v77, v91); + float32x2_t v127 = vadd_f32(v105, v119); + float32x2_t v128 = vsub_f32(v105, v119); + float32x2_t v137 = vadd_f32(v50, v64); +
float32x2_t v138 = vsub_f32(v50, v64); + float32x2_t v139 = vadd_f32(v78, v120); + float32x2_t v140 = vsub_f32(v78, v120); + float32x2_t v141 = vadd_f32(v92, v106); + float32x2_t v142 = vsub_f32(v92, v106); + float32x2_t v195 = vrev64_f32(v36); + float32x2_t v129 = vadd_f32(v121, v123); + float32x2_t v130 = vsub_f32(v121, v123); + float32x2_t v131 = vadd_f32(v125, v127); + float32x2_t v132 = vsub_f32(v125, v127); + float32x2_t v135 = vadd_f32(v126, v128); + float32x2_t v136 = vsub_f32(v126, v128); + float32x2_t v143 = vadd_f32(v139, v141); + float32x2_t v144 = vadd_f32(v140, v142); + float32x2_t v173 = vrev64_f32(v124); + float32x2_t v196 = vmul_f32(v195, v194); + float32x2_t v202 = vrev64_f32(v137); + float32x2_t v207 = vmul_f32(v138, v206); + float32x2_t v220 = vrev64_f32(v139); + float32x2_t v227 = vrev64_f32(v141); + float32x2_t v236 = vmul_f32(v140, v235); + float32x2_t v240 = vmul_f32(v142, v239); + float32x2_t v133 = vadd_f32(v129, v131); + float32x2_t v134 = vsub_f32(v129, v131); + float32x2_t v162 = vrev64_f32(v132); + float32x2_t v174 = vmul_f32(v173, v194); + float32x2_t v180 = vrev64_f32(v135); + float32x2_t v185 = vmul_f32(v136, v206); + float32x2_t v203 = vmul_f32(v202, v201); + float32x2_t v213 = vrev64_f32(v143); + float32x2_t v221 = vmul_f32(v220, v219); + float32x2_t v228 = vmul_f32(v227, v226); + float32x2_t v232 = vmul_f32(v144, v231); + float32x2_t v251 = vadd_f32(v22, v207); + float32x2_t v252 = vsub_f32(v22, v207); + float32x2_t v163 = vmul_f32(v162, v194); + float32x2_t v181 = vmul_f32(v180, v201); + float32x2_t v214 = vmul_f32(v213, v212); + float32x2_t v243 = vadd_f32(v122, v185); + float32x2_t v245 = vsub_f32(v122, v185); + float32x2_t v253 = vadd_f32(v196, v203); + float32x2_t v254 = vsub_f32(v196, v203); + float32x2_t v257 = vsub_f32(v236, v232); + float32x2_t v258 = vsub_f32(v240, v232); + float32x2_t v259 = vsub_f32(v232, v236); + float32x2_t v260 = vsub_f32(v232, v240); /* From here on, each finished result is converted to a 15-fractional-bit integer, padded with a zero pair, saturating-narrowed to int16 (vqmovn_s32) and stored to y as a single 32-bit word taken from lane 0. */ + int16x4_t v287 = +
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v133, 15), (int32x2_t){0, 0})); + int16x4_t v335 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v134, 15), (int32x2_t){0, 0})); + float32x2_t v241 = vadd_f32(v130, v163); + float32x2_t v242 = vsub_f32(v130, v163); + float32x2_t v244 = vadd_f32(v174, v181); + float32x2_t v246 = vsub_f32(v181, v174); + float32x2_t v255 = vadd_f32(v214, v221); + float32x2_t v256 = vsub_f32(v214, v228); + float32x2_t v261 = vadd_f32(v251, v257); + float32x2_t v262 = vsub_f32(v251, v257); + float32x2_t v263 = vadd_f32(v251, v259); + float32x2_t v264 = vsub_f32(v251, v259); + float32x2_t v265 = vadd_f32(v252, v254); + float32x2_t v266 = vsub_f32(v252, v254); + float32x2_t v267 = vadd_f32(v252, v260); + float32x2_t v268 = vsub_f32(v252, v260); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v287), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v335), 0); + float32x2_t v247 = vadd_f32(v243, v244); + float32x2_t v248 = vadd_f32(v245, v246); + float32x2_t v249 = vsub_f32(v245, v246); + float32x2_t v250 = vsub_f32(v243, v244); + float32x2_t v271 = vadd_f32(v255, v253); + float32x2_t v272 = vsub_f32(v255, v253); + float32x2_t v273 = vadd_f32(v256, v258); + float32x2_t v274 = vsub_f32(v256, v258); + float32x2_t v275 = vadd_f32(v256, v254); + float32x2_t v276 = vsub_f32(v256, v254); + int16x4_t v311 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v242, 15), (int32x2_t){0, 0})); + int16x4_t v359 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); + float32x2_t v277 = vadd_f32(v261, v271); + float32x2_t v278 = vadd_f32(v262, v272); + float32x2_t v279 = vsub_f32(v263, v272); + float32x2_t v280 = vsub_f32(v264, v271); + float32x2_t v281 = vadd_f32(v265, v273); + float32x2_t v282 = vadd_f32(v266, v274); + float32x2_t v283 = vsub_f32(v267, v276); + float32x2_t v284 = vsub_f32(v268, v275); + int16x4_t v299 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v250, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v311),
0); + int16x4_t v323 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v249, 15), (int32x2_t){0, 0})); + int16x4_t v347 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v248, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v359), 0); + int16x4_t v371 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v247, 15), (int32x2_t){0, 0})); + int16x4_t v293 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v280, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v299), 0); + int16x4_t v305 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v283, 15), (int32x2_t){0, 0})); + int16x4_t v317 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v284, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v323), 0); + int16x4_t v329 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v279, 15), (int32x2_t){0, 0})); + int16x4_t v341 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v278, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v347), 0); + int16x4_t v353 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v281, 15), (int32x2_t){0, 0})); + int16x4_t v365 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v282, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v371), 0); + int16x4_t v377 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v277, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v293), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v305), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v317), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v329), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v341), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v353), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v365), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v377), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun16( + const armral_cmplx_int16_t *restrict x,
armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v234 = -1.0000000000000000e+00F; + float v241 = -7.0710678118654746e-01F; + float v248 = 7.0710678118654757e-01F; + float v253 = -9.2387953251128674e-01F; + float v260 = 5.4119610014619690e-01F; + float v267 = -1.3065629648763766e+00F; + float v274 = 3.8268343236508984e-01F; + float v279 = 1.3065629648763766e+00F; + float v284 = -5.4119610014619690e-01F; + const int32_t *v539 = &v5[v0]; + int32_t *v639 = &v6[v2]; + int64_t v23 = v0 * 8; + int64_t v33 = v0 * 4; + int64_t v41 = v0 * 12; + int64_t v51 = v0 * 2; + int64_t v59 = v0 * 10; + int64_t v69 = v0 * 6; + int64_t v77 = v0 * 14; + int64_t v95 = v0 * 9; + int64_t v105 = v0 * 5; + int64_t v113 = v0 * 13; + int64_t v123 = v0 * 3; + int64_t v131 = v0 * 11; + int64_t v141 = v0 * 7; + int64_t v149 = v0 * 15; + float v237 = v4 * v234; + float v244 = v4 * v241; + float v256 = v4 * v253; + float v263 = v4 * v260; + float v270 = v4 * v267; + int64_t v349 = v2 * 2; + int64_t v357 = v2 * 3; + int64_t v365 = v2 * 4; + int64_t v373 = v2 * 5; + int64_t v381 = v2 * 6; + int64_t v389 = v2 * 7; + int64_t v397 = v2 * 8; + int64_t v405 = v2 * 9; + int64_t v413 = v2 * 10; + int64_t v421 = v2 * 11; + int64_t v429 = v2 * 12; + int64_t v437 = v2 * 13; + int64_t v445 = v2 * 14; + int64_t v453 = v2 * 15; + const int32_t *v467 = &v5[0]; + svfloat32_t v616 = svdup_n_f32(v248); + svfloat32_t v620 = svdup_n_f32(v274); + svfloat32_t v621 = svdup_n_f32(v279); + svfloat32_t v622 = svdup_n_f32(v284); + int32_t *v630 = &v6[0]; + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v539[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v476 = &v5[v23]; + const int32_t *v485 = &v5[v33]; + const int32_t *v494 = 
&v5[v41]; + const int32_t *v503 = &v5[v51]; + const int32_t *v512 = &v5[v59]; + const int32_t *v521 = &v5[v69]; + const int32_t *v530 = &v5[v77]; + const int32_t *v548 = &v5[v95]; + const int32_t *v557 = &v5[v105]; + const int32_t *v566 = &v5[v113]; + const int32_t *v575 = &v5[v123]; + const int32_t *v584 = &v5[v131]; + const int32_t *v593 = &v5[v141]; + const int32_t *v602 = &v5[v149]; + svfloat32_t v614 = svdup_n_f32(v237); + svfloat32_t v615 = svdup_n_f32(v244); + svfloat32_t v617 = svdup_n_f32(v256); + svfloat32_t v618 = svdup_n_f32(v263); + svfloat32_t v619 = svdup_n_f32(v270); + int32_t *v648 = &v6[v349]; + int32_t *v657 = &v6[v357]; + int32_t *v666 = &v6[v365]; + int32_t *v675 = &v6[v373]; + int32_t *v684 = &v6[v381]; + int32_t *v693 = &v6[v389]; + int32_t *v702 = &v6[v397]; + int32_t *v711 = &v6[v405]; + int32_t *v720 = &v6[v413]; + int32_t *v729 = &v6[v421]; + int32_t *v738 = &v6[v429]; + int32_t *v747 = &v6[v437]; + int32_t *v756 = &v6[v445]; + int32_t *v765 = &v6[v453]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v467[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v476[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v485[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v494[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v503[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v512[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v521[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v530[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v548[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v111 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v557[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v566[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v575[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v584[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v593[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v155 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v602[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v120 = 
svadd_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v159 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v66, v84); + svfloat32_t v161 = svsub_f32_x(svptrue_b32(), v66, v84); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v102, v120); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v102, v120); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v138, v156); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v138, v156); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v67, v85); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v67, v85); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v103, v157); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v103, v157); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v121, v139); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v121, v139); + svfloat32_t zero239 = svdup_n_f32(0); + svfloat32_t v239 = svcmla_f32_x(pred_full, zero239, v614, v49, 90); + svfloat32_t v166 = svadd_f32_x(svptrue_b32(), v158, v160); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v158, v160); + svfloat32_t v168 = svadd_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v162, v164); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v173 = svsub_f32_x(svptrue_b32(), v163, v165); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v181 = svadd_f32_x(svptrue_b32(), v177, v179); + svfloat32_t zero215 = svdup_n_f32(0); + svfloat32_t v215 = svcmla_f32_x(pred_full, zero215, v614, v161, 90); + svfloat32_t zero246 = svdup_n_f32(0); + svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v615, 
v174, 90); + svfloat32_t zero272 = svdup_n_f32(0); + svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v619, v178, 90); + svfloat32_t v282 = svmul_f32_x(svptrue_b32(), v177, v621); + svfloat32_t v287 = svmul_f32_x(svptrue_b32(), v179, v622); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v166, v168); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v166, v168); + svfloat32_t zero203 = svdup_n_f32(0); + svfloat32_t v203 = svcmla_f32_x(pred_full, zero203, v614, v169, 90); + svfloat32_t zero222 = svdup_n_f32(0); + svfloat32_t v222 = svcmla_f32_x(pred_full, zero222, v615, v172, 90); + svfloat32_t zero258 = svdup_n_f32(0); + svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v617, v180, 90); + svfloat32_t v277 = svmul_f32_x(svptrue_b32(), v181, v620); + svfloat32_t v298 = svmla_f32_x(pred_full, v31, v175, v616); + svfloat32_t v299 = svmls_f32_x(pred_full, v31, v175, v616); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v239, v246); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v239, v246); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v167, v203); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v167, v203); + svfloat32_t v290 = svmla_f32_x(pred_full, v159, v173, v616); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v215, v222); + svfloat32_t v292 = svmls_f32_x(pred_full, v159, v173, v616); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v222, v215); + svfloat32_t v302 = svcmla_f32_x(pred_full, v258, v618, v176, 90); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v258, v272); + svfloat32_t v304 = svnmls_f32_x(pred_full, v277, v177, v621); + svfloat32_t v305 = svnmls_f32_x(pred_full, v277, v179, v622); + svfloat32_t v306 = svnmls_f32_x(pred_full, v282, v181, v620); + svfloat32_t v307 = svnmls_f32_x(pred_full, v287, v181, v620); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v299, v301); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v299, v301); + svint16_t v334 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v170, 
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v398 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v171, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v290, v291); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v292, v293); + svfloat32_t v296 = svsub_f32_x(svptrue_b32(), v292, v293); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v290, v291); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v298, v304); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v298, v304); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v298, v306); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v298, v306); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v299, v307); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v299, v307); + svfloat32_t v318 = svadd_f32_x(svptrue_b32(), v302, v300); + svfloat32_t v319 = svsub_f32_x(svptrue_b32(), v302, v300); + svfloat32_t v320 = svadd_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v321 = svsub_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v322 = svadd_f32_x(svptrue_b32(), v303, v301); + svfloat32_t v323 = svsub_f32_x(svptrue_b32(), v303, v301); + svint16_t v366 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v289, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v430 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v288, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v630), svreinterpret_u64_s16(v334)); + svst1w_u64(pred_full, (unsigned *)(v702), svreinterpret_u64_s16(v398)); + svfloat32_t v324 = svadd_f32_x(svptrue_b32(), v308, v318); + 
svfloat32_t v325 = svadd_f32_x(svptrue_b32(), v309, v319); + svfloat32_t v326 = svsub_f32_x(svptrue_b32(), v310, v319); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v311, v318); + svfloat32_t v328 = svadd_f32_x(svptrue_b32(), v312, v320); + svfloat32_t v329 = svadd_f32_x(svptrue_b32(), v313, v321); + svfloat32_t v330 = svsub_f32_x(svptrue_b32(), v314, v323); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v315, v322); + svint16_t v350 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v297, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v382 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v296, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v414 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v295, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v446 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v294, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v666), svreinterpret_u64_s16(v366)); + svst1w_u64(pred_full, (unsigned *)(v738), svreinterpret_u64_s16(v430)); + svint16_t v342 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v327, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v358 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v330, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v374 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v331, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v390 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v326, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v406 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v325, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v422 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v328, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v438 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v329, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v454 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v324, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v648), svreinterpret_u64_s16(v350)); + svst1w_u64(pred_full, (unsigned *)(v684), svreinterpret_u64_s16(v382)); + svst1w_u64(pred_full, (unsigned *)(v720), svreinterpret_u64_s16(v414)); + svst1w_u64(pred_full, (unsigned *)(v756), svreinterpret_u64_s16(v446)); + svst1w_u64(pred_full, (unsigned *)(v639), svreinterpret_u64_s16(v342)); + svst1w_u64(pred_full, (unsigned *)(v657), svreinterpret_u64_s16(v358)); + svst1w_u64(pred_full, (unsigned *)(v675), svreinterpret_u64_s16(v374)); + svst1w_u64(pred_full, (unsigned *)(v693), svreinterpret_u64_s16(v390)); + svst1w_u64(pred_full, (unsigned *)(v711), 
svreinterpret_u64_s16(v406)); + svst1w_u64(pred_full, (unsigned *)(v729), svreinterpret_u64_s16(v422)); + svst1w_u64(pred_full, (unsigned *)(v747), svreinterpret_u64_s16(v438)); + svst1w_u64(pred_full, (unsigned *)(v765), svreinterpret_u64_s16(v454)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Neon-intrinsic variant, compiled only when ARMRAL_ARCH_SVE is not defined. Reads 17 elements from x at element offsets k * istride (k = 0..16), converts them to float as fixed-point values with 15 fractional bits, combines them through a fixed network of adds, subtracts and constant multiplies, and writes 17 results to y at element offsets k * ostride (k = 0..16) after converting back to 15-fractional-bit fixed-point with saturating narrowing to int16. dir scales the sign-paired constants that multiply lane-swapped values; presumably it is +1/-1 selecting the transform direction - TODO confirm with callers. howmany is not referenced in this body and no loop is present, so one set of 17 outputs is produced per call. NOTE(review): from the name and the 17 inputs/outputs this looks like a 17-point DFT kernel - confirm against the kernel generator. */ +void armral_fft_cs16_cf32_cs16_ac_n_uun17( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; /* x and y are indexed through int32_t pointers, so istride and ostride count 32-bit elements (presumably one int16 real/imag pair each - verify against armral_cmplx_int16_t). */ + float v183 = -4.2602849117736000e-02F; + float v187 = 2.0497965023262180e-01F; + float v191 = 1.0451835201736759e+00F; + float v195 = 1.7645848660222969e+00F; + float v199 = -7.2340797728605655e-01F; + float v203 = -8.9055591620606403e-02F; + float v207 = -1.0625000000000000e+00F; + float v211 = 2.5769410160110379e-01F; + float v215 = 7.7980260789483757e-01F; + float v219 = 5.4389318464570580e-01F; + float v223 = 4.2010193497052700e-01F; + float v227 = 1.2810929434228073e+00F; + float v231 = 4.4088907348175338e-01F; + float v235 = 3.1717619283272508e-01F; + float v238 = -9.0138318648016680e-01F; + float v239 = 9.0138318648016680e-01F; + float v245 = -4.3248756360072310e-01F; + float v246 = 4.3248756360072310e-01F; + float v252 = 6.6693537504044498e-01F; + float v253 = -6.6693537504044498e-01F; + float v259 = -6.0389004312516970e-01F; + float v260 = 6.0389004312516970e-01F; + float v266 = -3.6924873198582547e-01F; + float v267 = 3.6924873198582547e-01F; + float v273 = 4.8656938755549761e-01F; + float v274 = -4.8656938755549761e-01F; + float v280 = 2.3813712136760609e-01F; + float v281 = -2.3813712136760609e-01F; + float v287 = -1.5573820617422458e+00F; + float v288 = 1.5573820617422458e+00F; + float v294 = 6.5962247018731990e-01F; + float v295 = -6.5962247018731990e-01F; + float v301 = -1.4316961569866241e-01F; + float v302 = 1.4316961569866241e-01F; + float v308 = 2.3903469959860771e-01F; + float v309 =
-2.3903469959860771e-01F; + float v315 = -4.7932541949972603e-02F; + float v316 = 4.7932541949972603e-02F; + float v322 = -2.3188014856550065e+00F; + float v323 = 2.3188014856550065e+00F; + float v329 = 7.8914568419206255e-01F; + float v330 = -7.8914568419206255e-01F; + float v336 = 3.8484572871179505e+00F; + float v337 = -3.8484572871179505e+00F; + float v343 = -1.3003804568801376e+00F; + float v344 = 1.3003804568801376e+00F; + float v350 = 4.0814769046889037e+00F; + float v351 = -4.0814769046889037e+00F; + float v357 = -1.4807159909286283e+00F; + float v358 = 1.4807159909286283e+00F; + float v364 = -1.3332470363551400e-02F; + float v365 = 1.3332470363551400e-02F; + float v371 = -3.7139778690557629e-01F; + float v372 = 3.7139778690557629e-01F; + float v378 = 1.9236512863456379e-01F; + float v379 = -1.9236512863456379e-01F; /* Inputs are fetched with vld1s_s16, a helper defined outside this block; presumably it loads one 32-bit element into the low lanes - verify. */ + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v175 = vld1s_s16(&v5[0]); /* Place the constants in two-lane vectors: single constants are duplicated into both lanes, paired constants keep their opposite signs (one per lane). */ + float32x2_t v184 = (float32x2_t){v183, v183}; + float32x2_t v188 = (float32x2_t){v187, v187}; + float32x2_t v192 = (float32x2_t){v191, v191}; + float32x2_t v196 = (float32x2_t){v195, v195}; + float32x2_t v200 = (float32x2_t){v199, v199}; + float32x2_t v204 = (float32x2_t){v203, v203}; + float32x2_t v208 = (float32x2_t){v207, v207}; + float32x2_t v212 = (float32x2_t){v211, v211}; + float32x2_t v216 = (float32x2_t){v215, v215}; + float32x2_t v220 = (float32x2_t){v219, v219}; + float32x2_t v224 = (float32x2_t){v223, v223}; + float32x2_t v228 = (float32x2_t){v227, v227}; + float32x2_t v232 = (float32x2_t){v231, v231}; + float32x2_t v236 = (float32x2_t){v235, v235}; + float32x2_t v240 = (float32x2_t){v238, v239}; + float32x2_t v247 = (float32x2_t){v245, v246}; + float32x2_t v254 = (float32x2_t){v252, v253}; + float32x2_t v261 = (float32x2_t){v259, v260}; + float32x2_t v268 = (float32x2_t){v266, v267}; + float32x2_t v275 = (float32x2_t){v273, v274}; + float32x2_t v282 = (float32x2_t){v280, v281}; + float32x2_t v289 = (float32x2_t){v287, v288}; + float32x2_t v296 =
(float32x2_t){v294, v295}; + float32x2_t v303 = (float32x2_t){v301, v302}; + float32x2_t v310 = (float32x2_t){v308, v309}; + float32x2_t v317 = (float32x2_t){v315, v316}; + float32x2_t v324 = (float32x2_t){v322, v323}; + float32x2_t v331 = (float32x2_t){v329, v330}; + float32x2_t v338 = (float32x2_t){v336, v337}; + float32x2_t v345 = (float32x2_t){v343, v344}; + float32x2_t v352 = (float32x2_t){v350, v351}; + float32x2_t v359 = (float32x2_t){v357, v358}; + float32x2_t v366 = (float32x2_t){v364, v365}; + float32x2_t v373 = (float32x2_t){v371, v372}; + float32x2_t v380 = (float32x2_t){v378, v379}; + float32x2_t v381 = (float32x2_t){v4, v4}; /* Widen the loaded int16 lanes to int32 and convert to float, interpreting the integers as fixed-point with 15 fractional bits. */ + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 16]); + int16x4_t v27 = vld1s_s16(&v5[istride * 3]); + int16x4_t v33 = vld1s_s16(&v5[istride * 14]); + int16x4_t v41 = vld1s_s16(&v5[istride * 9]); + int16x4_t v47 = vld1s_s16(&v5[istride * 8]); + int16x4_t v55 = vld1s_s16(&v5[istride * 10]); + int16x4_t v61 = vld1s_s16(&v5[istride * 7]); + int16x4_t v69 = vld1s_s16(&v5[istride * 13]); + int16x4_t v75 = vld1s_s16(&v5[istride * 4]); + int16x4_t v83 = vld1s_s16(&v5[istride * 5]); + int16x4_t v89 = vld1s_s16(&v5[istride * 12]); + int16x4_t v97 = vld1s_s16(&v5[istride * 15]); + int16x4_t v103 = vld1s_s16(&v5[istride * 2]); + int16x4_t v111 = vld1s_s16(&v5[istride * 11]); + int16x4_t v117 = vld1s_s16(&v5[istride * 6]); + float32x2_t v176 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v175)), 15); /* Scale every sign-paired constant vector by dir (v381 holds dir in both lanes). */ + float32x2_t v242 = vmul_f32(v381, v240); + float32x2_t v249 = vmul_f32(v381, v247); + float32x2_t v256 = vmul_f32(v381, v254); + float32x2_t v263 = vmul_f32(v381, v261); + float32x2_t v270 = vmul_f32(v381, v268); + float32x2_t v277 = vmul_f32(v381, v275); + float32x2_t v284 = vmul_f32(v381, v282); + float32x2_t v291 = vmul_f32(v381, v289); + float32x2_t v298 = vmul_f32(v381, v296); + float32x2_t v305 = vmul_f32(v381, v303); + float32x2_t v312 = vmul_f32(v381, v310); + float32x2_t v319 =
vmul_f32(v381, v317); + float32x2_t v326 = vmul_f32(v381, v324); + float32x2_t v333 = vmul_f32(v381, v331); + float32x2_t v340 = vmul_f32(v381, v338); + float32x2_t v347 = vmul_f32(v381, v345); + float32x2_t v354 = vmul_f32(v381, v352); + float32x2_t v361 = vmul_f32(v381, v359); + float32x2_t v368 = vmul_f32(v381, v366); + float32x2_t v375 = vmul_f32(v381, v373); + float32x2_t v382 = vmul_f32(v381, v380); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v76 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v75)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v118 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v117)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v77 = vadd_f32(v70, v76); + float32x2_t v78 = vsub_f32(v70, v76); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v105 = vadd_f32(v98, v104); +
float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v119 = vadd_f32(v112, v118); + float32x2_t v120 = vsub_f32(v112, v118); + float32x2_t v121 = vadd_f32(v21, v77); + float32x2_t v122 = vadd_f32(v35, v91); + float32x2_t v123 = vadd_f32(v49, v105); + float32x2_t v124 = vadd_f32(v63, v119); + float32x2_t v127 = vsub_f32(v21, v77); + float32x2_t v128 = vsub_f32(v35, v91); + float32x2_t v129 = vsub_f32(v49, v105); + float32x2_t v130 = vsub_f32(v63, v119); + float32x2_t v141 = vadd_f32(v22, v50); + float32x2_t v142 = vadd_f32(v36, v64); + float32x2_t v143 = vsub_f32(v22, v50); + float32x2_t v144 = vsub_f32(v120, v92); + float32x2_t v145 = vadd_f32(v78, v106); + float32x2_t v146 = vadd_f32(v92, v120); + float32x2_t v147 = vsub_f32(v78, v106); + float32x2_t v148 = vsub_f32(v36, v64); + float32x2_t v161 = vadd_f32(v22, v78); + float32x2_t v162 = vadd_f32(v64, v120); /* vrev64_f32 swaps the two lanes of a value; each swapped value is then multiplied lane-wise by one of the dir-scaled sign-paired constant vectors. */ + float32x2_t v334 = vrev64_f32(v22); + float32x2_t v341 = vrev64_f32(v78); + float32x2_t v355 = vrev64_f32(v64); + float32x2_t v362 = vrev64_f32(v120); + float32x2_t v125 = vadd_f32(v121, v123); + float32x2_t v126 = vadd_f32(v122, v124); + float32x2_t v131 = vsub_f32(v121, v123); + float32x2_t v132 = vsub_f32(v122, v124); + float32x2_t v135 = vadd_f32(v128, v130); + float32x2_t v136 = vadd_f32(v127, v129); + float32x2_t v138 = vsub_f32(v129, v130); + float32x2_t v139 = vsub_f32(v127, v128); + float32x2_t v149 = vadd_f32(v141, v142); + float32x2_t v150 = vadd_f32(v145, v146); + float32x2_t v152 = vsub_f32(v141, v142); + float32x2_t v153 = vsub_f32(v145, v146); + float32x2_t v155 = vadd_f32(v143, v144); + float32x2_t v156 = vadd_f32(v147, v148); + float32x2_t v158 = vsub_f32(v143, v144); + float32x2_t v159 = vsub_f32(v147, v148); + float32x2_t v185 = vmul_f32(v127, v184); + float32x2_t v189 = vmul_f32(v128, v188); + float32x2_t v193 = vmul_f32(v129, v192); + float32x2_t v197 = vmul_f32(v130, v196); + float32x2_t v327 = vrev64_f32(v161); + float32x2_t v335 = vmul_f32(v334, v333); + float32x2_t v342 =
vmul_f32(v341, v340); + float32x2_t v348 = vrev64_f32(v162); + float32x2_t v356 = vmul_f32(v355, v354); + float32x2_t v363 = vmul_f32(v362, v361); + float32x2_t v133 = vadd_f32(v125, v126); + float32x2_t v134 = vsub_f32(v125, v126); + float32x2_t v137 = vsub_f32(v136, v135); + float32x2_t v140 = vadd_f32(v131, v132); + float32x2_t v151 = vadd_f32(v149, v150); + float32x2_t v154 = vadd_f32(v152, v153); + float32x2_t v157 = vadd_f32(v155, v156); + float32x2_t v160 = vadd_f32(v158, v159); + float32x2_t v163 = vsub_f32(v156, v150); + float32x2_t v166 = vsub_f32(v149, v155); + float32x2_t v201 = vmul_f32(v131, v200); + float32x2_t v205 = vmul_f32(v132, v204); + float32x2_t v217 = vmul_f32(v135, v216); + float32x2_t v221 = vmul_f32(v136, v220); + float32x2_t v229 = vmul_f32(v138, v228); + float32x2_t v233 = vmul_f32(v139, v232); + float32x2_t v243 = vrev64_f32(v149); + float32x2_t v250 = vrev64_f32(v150); + float32x2_t v264 = vrev64_f32(v152); + float32x2_t v271 = vrev64_f32(v153); + float32x2_t v285 = vrev64_f32(v155); + float32x2_t v292 = vrev64_f32(v156); + float32x2_t v306 = vrev64_f32(v158); + float32x2_t v313 = vrev64_f32(v159); + float32x2_t v328 = vmul_f32(v327, v326); + float32x2_t v349 = vmul_f32(v348, v347); + float32x2_t v164 = vadd_f32(v163, v22); + float32x2_t v167 = vadd_f32(v166, v64); + float32x2_t v177 = vadd_f32(v176, v133); + float32x2_t v209 = vmul_f32(v133, v208); + float32x2_t v213 = vmul_f32(v134, v212); + float32x2_t v225 = vmul_f32(v137, v224); + float32x2_t v237 = vmul_f32(v140, v236); + float32x2_t v244 = vmul_f32(v243, v242); + float32x2_t v251 = vmul_f32(v250, v249); + float32x2_t v257 = vrev64_f32(v151); + float32x2_t v265 = vmul_f32(v264, v263); + float32x2_t v272 = vmul_f32(v271, v270); + float32x2_t v278 = vrev64_f32(v154); + float32x2_t v286 = vmul_f32(v285, v284); + float32x2_t v293 = vmul_f32(v292, v291); + float32x2_t v299 = vrev64_f32(v157); + float32x2_t v307 = vmul_f32(v306, v305); + float32x2_t v314 = vmul_f32(v313, v312); +
float32x2_t v320 = vrev64_f32(v160); + float32x2_t v387 = vadd_f32(v197, v229); + float32x2_t v388 = vsub_f32(v229, v193); + float32x2_t v389 = vadd_f32(v189, v233); + float32x2_t v390 = vsub_f32(v185, v233); + float32x2_t v165 = vsub_f32(v164, v162); + float32x2_t v168 = vadd_f32(v167, v78); + float32x2_t v258 = vmul_f32(v257, v256); + float32x2_t v279 = vmul_f32(v278, v277); + float32x2_t v300 = vmul_f32(v299, v298); + float32x2_t v321 = vmul_f32(v320, v319); + float32x2_t v385 = vadd_f32(v217, v225); + float32x2_t v386 = vsub_f32(v221, v225); + float32x2_t v391 = vsub_f32(v237, v205); + float32x2_t v392 = vadd_f32(v237, v201); + float32x2_t v393 = vadd_f32(v209, v177); /* Output 0 is v177: input 0 plus v133, the sum of inputs 1..16. Like every output it is converted back to fixed-point with 15 fractional bits, narrowed to int16 with saturation (vqmovn_s32), and stored as one 32-bit lane. */ + int16x4_t v461 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v177, 15), (int32x2_t){0, 0})); + float32x2_t v169 = vsub_f32(v168, v120); + float32x2_t v369 = vrev64_f32(v165); + float32x2_t v394 = vadd_f32(v213, v393); + float32x2_t v395 = vsub_f32(v393, v213); + float32x2_t v396 = vsub_f32(v385, v387); + float32x2_t v398 = vadd_f32(v386, v388); + float32x2_t v400 = vadd_f32(v385, v389); + float32x2_t v402 = vadd_f32(v386, v390); + float32x2_t v412 = vadd_f32(v244, v258); + float32x2_t v413 = vadd_f32(v251, v258); + float32x2_t v414 = vadd_f32(v265, v279); + float32x2_t v415 = vadd_f32(v272, v279); + float32x2_t v416 = vadd_f32(v286, v300); + float32x2_t v417 = vadd_f32(v293, v300); + float32x2_t v418 = vadd_f32(v307, v321); + float32x2_t v419 = vadd_f32(v314, v321); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v461), 0); + float32x2_t v170 = vadd_f32(v165, v169); + float32x2_t v370 = vmul_f32(v369, v368); + float32x2_t v376 = vrev64_f32(v169); + float32x2_t v397 = vadd_f32(v391, v394); + float32x2_t v399 = vadd_f32(v392, v395); + float32x2_t v401 = vsub_f32(v394, v391); + float32x2_t v403 = vsub_f32(v395, v392); + float32x2_t v423 = vadd_f32(v412, v414); + float32x2_t v424 = vsub_f32(v412, v414); + float32x2_t v425 = vadd_f32(v413, v415); + float32x2_t v426 = vsub_f32(v413, v415); + float32x2_t
v427 = vadd_f32(v416, v418); + float32x2_t v428 = vsub_f32(v418, v416); + float32x2_t v429 = vadd_f32(v417, v419); + float32x2_t v430 = vsub_f32(v419, v417); + float32x2_t v377 = vmul_f32(v376, v375); + float32x2_t v383 = vrev64_f32(v170); + float32x2_t v404 = vadd_f32(v396, v397); + float32x2_t v405 = vadd_f32(v398, v399); + float32x2_t v406 = vadd_f32(v400, v401); + float32x2_t v407 = vadd_f32(v402, v403); + float32x2_t v408 = vsub_f32(v397, v396); + float32x2_t v409 = vsub_f32(v399, v398); + float32x2_t v410 = vsub_f32(v401, v400); + float32x2_t v411 = vsub_f32(v403, v402); + float32x2_t v440 = vadd_f32(v425, v429); + float32x2_t v442 = vadd_f32(v424, v430); + float32x2_t v444 = vsub_f32(v423, v427); + float32x2_t v446 = vsub_f32(v430, v424); + float32x2_t v448 = vadd_f32(v423, v427); + float32x2_t v451 = vsub_f32(v428, v426); + float32x2_t v454 = vsub_f32(v429, v425); + float32x2_t v457 = vadd_f32(v426, v428); + float32x2_t v384 = vmul_f32(v383, v382); + float32x2_t v431 = vsub_f32(v370, v377); + float32x2_t v420 = vadd_f32(v384, v377); + float32x2_t v433 = vadd_f32(v431, v431); + float32x2_t v458 = vsub_f32(v457, v431); + float32x2_t v421 = vadd_f32(v328, v420); + float32x2_t v434 = vsub_f32(v349, v433); + float32x2_t v437 = vadd_f32(v420, v420); + float32x2_t v455 = vadd_f32(v454, v433); + float32x2_t v493 = vadd_f32(v411, v458); + float32x2_t v500 = vsub_f32(v411, v458); + float32x2_t v422 = vadd_f32(v421, v335); + float32x2_t v432 = vadd_f32(v421, v342); + float32x2_t v435 = vadd_f32(v434, v356); + float32x2_t v436 = vadd_f32(v434, v363); + float32x2_t v438 = vadd_f32(v437, v437); + float32x2_t v439 = vadd_f32(v431, v437); + float32x2_t v445 = vadd_f32(v444, v437); + float32x2_t v456 = vadd_f32(v455, v437); + int16x4_t v496 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v493, 15), (int32x2_t){0, 0})); + int16x4_t v503 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v500, 15), (int32x2_t){0, 0})); + float32x2_t v441 = vadd_f32(v440, v432); + float32x2_t v443 =
vadd_f32(v442, v435); + float32x2_t v447 = vsub_f32(v446, v439); + float32x2_t v449 = vadd_f32(v448, v422); + float32x2_t v452 = vsub_f32(v451, v436); + float32x2_t v479 = vadd_f32(v406, v445); + float32x2_t v486 = vsub_f32(v406, v445); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v496), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v503), 0); + float32x2_t v563 = vadd_f32(v410, v456); + float32x2_t v570 = vsub_f32(v410, v456); + float32x2_t v450 = vadd_f32(v449, v431); + float32x2_t v453 = vadd_f32(v452, v438); + float32x2_t v465 = vadd_f32(v404, v441); + float32x2_t v472 = vsub_f32(v404, v441); + int16x4_t v482 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v479, 15), (int32x2_t){0, 0})); + int16x4_t v489 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v486, 15), (int32x2_t){0, 0})); + float32x2_t v521 = vadd_f32(v407, v447); + float32x2_t v528 = vsub_f32(v407, v447); + float32x2_t v535 = vadd_f32(v405, v443); + float32x2_t v542 = vsub_f32(v405, v443); + int16x4_t v566 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v563, 15), (int32x2_t){0, 0})); + int16x4_t v573 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v570, 15), (int32x2_t){0, 0})); + int16x4_t v468 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v465, 15), (int32x2_t){0, 0})); + int16x4_t v475 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v472, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v482), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v489), 0); + float32x2_t v507 = vadd_f32(v408, v450); + float32x2_t v514 = vsub_f32(v408, v450); + int16x4_t v524 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v521, 15), (int32x2_t){0, 0})); + int16x4_t v531 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v528, 15), (int32x2_t){0, 0})); + int16x4_t v538 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v535, 15), (int32x2_t){0, 0})); + int16x4_t v545 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v542, 15), (int32x2_t){0, 0})); + float32x2_t v549 = vadd_f32(v409, v453); + float32x2_t
v556 = vsub_f32(v409, v453); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v566), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v573), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v468), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v475), 0); + int16x4_t v510 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v507, 15), (int32x2_t){0, 0})); + int16x4_t v517 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v514, 15), (int32x2_t){0, 0})); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v524), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v531), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v538), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v545), 0); + int16x4_t v552 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v549, 15), (int32x2_t){0, 0})); + int16x4_t v559 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v556, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v510), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v517), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v552), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v559), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun17( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v223 = -4.2602849117736000e-02F; + float v228 = 2.0497965023262180e-01F; + float v233 = 1.0451835201736759e+00F; + float v238 = 1.7645848660222969e+00F; + float v243 = -7.2340797728605655e-01F; + float v248 = -8.9055591620606403e-02F; + float v253 = -1.0625000000000000e+00F; + float v258 = 2.5769410160110379e-01F; + float v263 = 7.7980260789483757e-01F; + float v268 = 5.4389318464570580e-01F; + float
v273 = 4.2010193497052700e-01F; + float v278 = 1.2810929434228073e+00F; + float v283 = 4.4088907348175338e-01F; + float v288 = 3.1717619283272508e-01F; + float v293 = 9.0138318648016680e-01F; + float v300 = 4.3248756360072310e-01F; + float v307 = -6.6693537504044498e-01F; + float v314 = 6.0389004312516970e-01F; + float v321 = 3.6924873198582547e-01F; + float v328 = -4.8656938755549761e-01F; + float v335 = -2.3813712136760609e-01F; + float v342 = 1.5573820617422458e+00F; + float v349 = -6.5962247018731990e-01F; + float v356 = 1.4316961569866241e-01F; + float v363 = -2.3903469959860771e-01F; + float v370 = 4.7932541949972603e-02F; + float v377 = 2.3188014856550065e+00F; + float v384 = -7.8914568419206255e-01F; + float v391 = -3.8484572871179505e+00F; + float v398 = 1.3003804568801376e+00F; + float v405 = -4.0814769046889037e+00F; + float v412 = 1.4807159909286283e+00F; + float v419 = 1.3332470363551400e-02F; + float v426 = 3.7139778690557629e-01F; + float v433 = -1.9236512863456379e-01F; + const int32_t *v671 = &v5[v0]; + int32_t *v871 = &v6[v2]; + int64_t v23 = v0 * 16; + int64_t v33 = v0 * 3; + int64_t v41 = v0 * 14; + int64_t v51 = v0 * 9; + int64_t v59 = v0 * 8; + int64_t v69 = v0 * 10; + int64_t v77 = v0 * 7; + int64_t v87 = v0 * 13; + int64_t v95 = v0 * 4; + int64_t v105 = v0 * 5; + int64_t v113 = v0 * 12; + int64_t v123 = v0 * 15; + int64_t v131 = v0 * 2; + int64_t v141 = v0 * 11; + int64_t v149 = v0 * 6; + float v296 = v4 * v293; + float v303 = v4 * v300; + float v310 = v4 * v307; + float v317 = v4 * v314; + float v324 = v4 * v321; + float v331 = v4 * v328; + float v338 = v4 * v335; + float v345 = v4 * v342; + float v352 = v4 * v349; + float v359 = v4 * v356; + float v366 = v4 * v363; + float v373 = v4 * v370; + float v380 = v4 * v377; + float v387 = v4 * v384; + float v394 = v4 * v391; + float v401 = v4 * v398; + float v408 = v4 * v405; + float v415 = v4 * v412; + float v422 = v4 * v419; + float v429 = v4 * v426; + float v436 = v4 * v433; + int64_t v532 = v2 
* 16; + int64_t v541 = v2 * 2; + int64_t v550 = v2 * 15; + int64_t v559 = v2 * 3; + int64_t v568 = v2 * 14; + int64_t v577 = v2 * 4; + int64_t v586 = v2 * 13; + int64_t v595 = v2 * 5; + int64_t v604 = v2 * 12; + int64_t v613 = v2 * 6; + int64_t v622 = v2 * 11; + int64_t v631 = v2 * 7; + int64_t v640 = v2 * 10; + int64_t v649 = v2 * 8; + int64_t v658 = v2 * 9; + const int32_t *v816 = &v5[0]; + svfloat32_t v820 = svdup_n_f32(v223); + svfloat32_t v821 = svdup_n_f32(v228); + svfloat32_t v822 = svdup_n_f32(v233); + svfloat32_t v823 = svdup_n_f32(v238); + svfloat32_t v824 = svdup_n_f32(v243); + svfloat32_t v825 = svdup_n_f32(v248); + svfloat32_t v826 = svdup_n_f32(v253); + svfloat32_t v827 = svdup_n_f32(v258); + svfloat32_t v828 = svdup_n_f32(v263); + svfloat32_t v829 = svdup_n_f32(v268); + svfloat32_t v830 = svdup_n_f32(v273); + svfloat32_t v831 = svdup_n_f32(v278); + svfloat32_t v832 = svdup_n_f32(v283); + svfloat32_t v833 = svdup_n_f32(v288); + int32_t *v862 = &v6[0]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v671[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v680 = &v5[v23]; + const int32_t *v689 = &v5[v33]; + const int32_t *v698 = &v5[v41]; + const int32_t *v707 = &v5[v51]; + const int32_t *v716 = &v5[v59]; + const int32_t *v725 = &v5[v69]; + const int32_t *v734 = &v5[v77]; + const int32_t *v743 = &v5[v87]; + const int32_t *v752 = &v5[v95]; + const int32_t *v761 = &v5[v105]; + const int32_t *v770 = &v5[v113]; + const int32_t *v779 = &v5[v123]; + const int32_t *v788 = &v5[v131]; + const int32_t *v797 = &v5[v141]; + const int32_t *v806 = &v5[v149]; + svfloat32_t v834 = svdup_n_f32(v296); + svfloat32_t v835 = svdup_n_f32(v303); + svfloat32_t v836 = svdup_n_f32(v310); + svfloat32_t v837 = svdup_n_f32(v317); + svfloat32_t v838 = svdup_n_f32(v324); + svfloat32_t v839 = svdup_n_f32(v331); + svfloat32_t v840 = svdup_n_f32(v338); + svfloat32_t v841 = svdup_n_f32(v345); + svfloat32_t v842 = 
svdup_n_f32(v352); + svfloat32_t v843 = svdup_n_f32(v359); + svfloat32_t v844 = svdup_n_f32(v366); + svfloat32_t v845 = svdup_n_f32(v373); + svfloat32_t v846 = svdup_n_f32(v380); + svfloat32_t v847 = svdup_n_f32(v387); + svfloat32_t v848 = svdup_n_f32(v394); + svfloat32_t v849 = svdup_n_f32(v401); + svfloat32_t v850 = svdup_n_f32(v408); + svfloat32_t v851 = svdup_n_f32(v415); + svfloat32_t v852 = svdup_n_f32(v422); + svfloat32_t v853 = svdup_n_f32(v429); + svfloat32_t v854 = svdup_n_f32(v436); + int32_t *v880 = &v6[v532]; + int32_t *v889 = &v6[v541]; + int32_t *v898 = &v6[v550]; + int32_t *v907 = &v6[v559]; + int32_t *v916 = &v6[v568]; + int32_t *v925 = &v6[v577]; + int32_t *v934 = &v6[v586]; + int32_t *v943 = &v6[v595]; + int32_t *v952 = &v6[v604]; + int32_t *v961 = &v6[v613]; + int32_t *v970 = &v6[v622]; + int32_t *v979 = &v6[v631]; + int32_t *v988 = &v6[v640]; + int32_t *v997 = &v6[v649]; + int32_t *v1006 = &v6[v658]; + svfloat32_t v215 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v816[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v680[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v689[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v698[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v707[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v716[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t 
*)&v725[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v734[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v743[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v752[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v111 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v761[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v770[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v779[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v788[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v797[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v155 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v806[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t 
v103 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v158 = svadd_f32_x(svptrue_b32(), v30, v102); + svfloat32_t v159 = svadd_f32_x(svptrue_b32(), v48, v120); + svfloat32_t v160 = svadd_f32_x(svptrue_b32(), v66, v138); + svfloat32_t v161 = svadd_f32_x(svptrue_b32(), v84, v156); + svfloat32_t v164 = svsub_f32_x(svptrue_b32(), v30, v102); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v48, v120); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v66, v138); + svfloat32_t v167 = svsub_f32_x(svptrue_b32(), v84, v156); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v31, v67); + svfloat32_t v179 = svadd_f32_x(svptrue_b32(), v49, v85); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v31, v67); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v157, v121); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v103, v139); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v121, v157); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v103, v139); + svfloat32_t v185 = svsub_f32_x(svptrue_b32(), v49, v85); + svfloat32_t v198 = svadd_f32_x(svptrue_b32(), v31, v103); + svfloat32_t v199 = svadd_f32_x(svptrue_b32(), v85, v157); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v158, v160); + svfloat32_t v163 = svadd_f32_x(svptrue_b32(), v159, v161); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v158, v160); + svfloat32_t v169 = svsub_f32_x(svptrue_b32(), v159, v161); + svfloat32_t v172 = svadd_f32_x(svptrue_b32(), v165, v167); + svfloat32_t v173 = svadd_f32_x(svptrue_b32(), v164, v166); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v166, v167); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v164, v165); + svfloat32_t v186 
= svadd_f32_x(svptrue_b32(), v178, v179); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v189 = svsub_f32_x(svptrue_b32(), v178, v179); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v182, v183); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v180, v181); + svfloat32_t v193 = svadd_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v180, v181); + svfloat32_t v196 = svsub_f32_x(svptrue_b32(), v184, v185); + svfloat32_t v236 = svmul_f32_x(svptrue_b32(), v166, v822); + svfloat32_t zero403 = svdup_n_f32(0); + svfloat32_t v403 = svcmla_f32_x(pred_full, zero403, v849, v199, 90); + svfloat32_t v170 = svadd_f32_x(svptrue_b32(), v162, v163); + svfloat32_t v171 = svsub_f32_x(svptrue_b32(), v162, v163); + svfloat32_t v174 = svsub_f32_x(svptrue_b32(), v173, v172); + svfloat32_t v177 = svadd_f32_x(svptrue_b32(), v168, v169); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v186, v187); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v189, v190); + svfloat32_t v194 = svadd_f32_x(svptrue_b32(), v192, v193); + svfloat32_t v197 = svadd_f32_x(svptrue_b32(), v195, v196); + svfloat32_t v200 = svsub_f32_x(svptrue_b32(), v193, v187); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v186, v192); + svfloat32_t v246 = svmul_f32_x(svptrue_b32(), v168, v824); + svfloat32_t v251 = svmul_f32_x(svptrue_b32(), v169, v825); + svfloat32_t v281 = svmul_f32_x(svptrue_b32(), v175, v831); + svfloat32_t v286 = svmul_f32_x(svptrue_b32(), v176, v832); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v200, v31); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v203, v85); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v215, v170); + svfloat32_t v276 = svmul_f32_x(svptrue_b32(), v174, v830); + svfloat32_t zero312 = svdup_n_f32(0); + svfloat32_t v312 = svcmla_f32_x(pred_full, zero312, v836, v188, 90); + svfloat32_t zero333 = svdup_n_f32(0); + svfloat32_t v333 = svcmla_f32_x(pred_full, zero333, v839, v191, 90); + svfloat32_t zero354 = svdup_n_f32(0); 
+ svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v842, v194, 90); + svfloat32_t zero375 = svdup_n_f32(0); + svfloat32_t v375 = svcmla_f32_x(pred_full, zero375, v845, v197, 90); + svfloat32_t v441 = svmla_f32_x(pred_full, v281, v167, v823); + svfloat32_t v442 = svnmls_f32_x(pred_full, v236, v175, v831); + svfloat32_t v443 = svmla_f32_x(pred_full, v286, v165, v821); + svfloat32_t v444 = svnmls_f32_x(pred_full, v286, v164, v820); + svfloat32_t v202 = svsub_f32_x(svptrue_b32(), v201, v199); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v204, v103); + svfloat32_t v439 = svmla_f32_x(pred_full, v276, v172, v828); + svfloat32_t v440 = svnmls_f32_x(pred_full, v276, v173, v829); + svfloat32_t v445 = svnmls_f32_x(pred_full, v251, v177, v833); + svfloat32_t v446 = svmla_f32_x(pred_full, v246, v177, v833); + svfloat32_t v447 = svmla_f32_x(pred_full, v216, v170, v826); + svfloat32_t v466 = svcmla_f32_x(pred_full, v312, v834, v186, 90); + svfloat32_t v467 = svcmla_f32_x(pred_full, v312, v835, v187, 90); + svfloat32_t v468 = svcmla_f32_x(pred_full, v333, v837, v189, 90); + svfloat32_t v469 = svcmla_f32_x(pred_full, v333, v838, v190, 90); + svfloat32_t v470 = svcmla_f32_x(pred_full, v354, v840, v192, 90); + svfloat32_t v471 = svcmla_f32_x(pred_full, v354, v841, v193, 90); + svfloat32_t v472 = svcmla_f32_x(pred_full, v375, v843, v195, 90); + svfloat32_t v473 = svcmla_f32_x(pred_full, v375, v844, v196, 90); + svint16_t v515 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v216, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v205, v157); + svfloat32_t zero424 = svdup_n_f32(0); + svfloat32_t v424 = svcmla_f32_x(pred_full, zero424, v852, v202, 90); + svfloat32_t v448 = svmla_f32_x(pred_full, v447, v171, v827); + svfloat32_t v449 = svmls_f32_x(pred_full, v447, v171, v827); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), 
v439, v441); + svfloat32_t v452 = svadd_f32_x(svptrue_b32(), v440, v442); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v439, v443); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v440, v444); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v466, v468); + svfloat32_t v478 = svsub_f32_x(svptrue_b32(), v466, v468); + svfloat32_t v479 = svadd_f32_x(svptrue_b32(), v467, v469); + svfloat32_t v480 = svsub_f32_x(svptrue_b32(), v467, v469); + svfloat32_t v481 = svadd_f32_x(svptrue_b32(), v470, v472); + svfloat32_t v482 = svsub_f32_x(svptrue_b32(), v472, v470); + svfloat32_t v483 = svadd_f32_x(svptrue_b32(), v471, v473); + svfloat32_t v484 = svsub_f32_x(svptrue_b32(), v473, v471); + svst1w_u64(pred_full, (unsigned *)(v862), svreinterpret_u64_s16(v515)); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v202, v206); + svfloat32_t zero431 = svdup_n_f32(0); + svfloat32_t v431 = svcmla_f32_x(pred_full, zero431, v853, v206, 90); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v445, v448); + svfloat32_t v453 = svadd_f32_x(svptrue_b32(), v446, v449); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v448, v445); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v449, v446); + svfloat32_t v494 = svadd_f32_x(svptrue_b32(), v479, v483); + svfloat32_t v496 = svadd_f32_x(svptrue_b32(), v478, v484); + svfloat32_t v498 = svsub_f32_x(svptrue_b32(), v477, v481); + svfloat32_t v500 = svsub_f32_x(svptrue_b32(), v484, v478); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v477, v481); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v482, v480); + svfloat32_t v508 = svsub_f32_x(svptrue_b32(), v483, v479); + svfloat32_t v511 = svadd_f32_x(svptrue_b32(), v480, v482); + svfloat32_t v458 = svadd_f32_x(svptrue_b32(), v450, v451); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v452, v453); + svfloat32_t v460 = svadd_f32_x(svptrue_b32(), v454, v455); + svfloat32_t v461 = svadd_f32_x(svptrue_b32(), v456, v457); + svfloat32_t v462 = svsub_f32_x(svptrue_b32(), v451, v450); + svfloat32_t v463 = 
svsub_f32_x(svptrue_b32(), v453, v452); + svfloat32_t v464 = svsub_f32_x(svptrue_b32(), v455, v454); + svfloat32_t v465 = svsub_f32_x(svptrue_b32(), v457, v456); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v424, v431); + svfloat32_t v474 = svcmla_f32_x(pred_full, v431, v854, v207, 90); + svfloat32_t v487 = svadd_f32_x(svptrue_b32(), v485, v485); + svfloat32_t v512 = svsub_f32_x(svptrue_b32(), v511, v485); + svfloat32_t v475 = svcmla_f32_x(pred_full, v474, v846, v198, 90); + svfloat32_t v488 = svsub_f32_x(svptrue_b32(), v403, v487); + svfloat32_t v491 = svadd_f32_x(svptrue_b32(), v474, v474); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v487); + svfloat32_t v557 = svadd_f32_x(svptrue_b32(), v465, v512); + svfloat32_t v566 = svsub_f32_x(svptrue_b32(), v465, v512); + svfloat32_t v476 = svcmla_f32_x(pred_full, v475, v847, v31, 90); + svfloat32_t v486 = svcmla_f32_x(pred_full, v475, v848, v103, 90); + svfloat32_t v489 = svcmla_f32_x(pred_full, v488, v850, v85, 90); + svfloat32_t v490 = svcmla_f32_x(pred_full, v488, v851, v157, 90); + svfloat32_t v492 = svadd_f32_x(svptrue_b32(), v491, v491); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v485, v491); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v498, v491); + svfloat32_t v510 = svadd_f32_x(svptrue_b32(), v509, v491); + svint16_t v560 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v557, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v569 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v566, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v494, v486); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v489); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v500, v493); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, 
v476); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v505, v490); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v460, v499); + svfloat32_t v548 = svsub_f32_x(svptrue_b32(), v460, v499); + svfloat32_t v647 = svadd_f32_x(svptrue_b32(), v464, v510); + svfloat32_t v656 = svsub_f32_x(svptrue_b32(), v464, v510); + svst1w_u64(pred_full, (unsigned *)(v907), svreinterpret_u64_s16(v560)); + svst1w_u64(pred_full, (unsigned *)(v916), svreinterpret_u64_s16(v569)); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v503, v485); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v506, v492); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v458, v495); + svfloat32_t v530 = svsub_f32_x(svptrue_b32(), v458, v495); + svint16_t v542 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v539, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v551 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v548, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v593 = svadd_f32_x(svptrue_b32(), v461, v501); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v461, v501); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v459, v497); + svfloat32_t v620 = svsub_f32_x(svptrue_b32(), v459, v497); + svint16_t v650 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v647, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v659 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v656, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v524 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v521, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v533 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v530, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v462, v504); + svfloat32_t v584 = svsub_f32_x(svptrue_b32(), v462, v504); + svint16_t v596 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v593, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v605 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v602, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v614 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v611, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v623 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v620, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v463, v507); + svfloat32_t v638 = svsub_f32_x(svptrue_b32(), v463, v507); + svst1w_u64(pred_full, (unsigned *)(v889), svreinterpret_u64_s16(v542)); + svst1w_u64(pred_full, (unsigned *)(v898), svreinterpret_u64_s16(v551)); + svst1w_u64(pred_full, (unsigned *)(v997), svreinterpret_u64_s16(v650)); + svst1w_u64(pred_full, (unsigned *)(v1006), svreinterpret_u64_s16(v659)); + svint16_t v578 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v575, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v587 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v584, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v632 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v629, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v641 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v638, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v871), svreinterpret_u64_s16(v524)); + svst1w_u64(pred_full, (unsigned *)(v880), svreinterpret_u64_s16(v533)); + svst1w_u64(pred_full, (unsigned *)(v943), svreinterpret_u64_s16(v596)); + svst1w_u64(pred_full, (unsigned *)(v952), svreinterpret_u64_s16(v605)); + svst1w_u64(pred_full, (unsigned *)(v961), svreinterpret_u64_s16(v614)); + svst1w_u64(pred_full, (unsigned *)(v970), svreinterpret_u64_s16(v623)); + svst1w_u64(pred_full, (unsigned *)(v925), svreinterpret_u64_s16(v578)); + svst1w_u64(pred_full, (unsigned *)(v934), svreinterpret_u64_s16(v587)); + svst1w_u64(pred_full, (unsigned *)(v979), svreinterpret_u64_s16(v632)); + svst1w_u64(pred_full, (unsigned *)(v988), svreinterpret_u64_s16(v641)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Neon variant (compiled when ARMRAL_ARCH_SVE is not defined). Reads 18 samples from x at element offsets k * istride (k = 0..17), converts each from int16 fixed point to float32, combines them through the add, subtract and multiply stages below, and writes 18 converted results to y at element offsets k * ostride. dir scales the sign-alternating multipliers; presumably it selects the transform direction - confirm against callers. howmany is not referenced in this body. NOTE(review): the name and the 18 loads and stores suggest a single 18-point FFT butterfly - confirm. */ +void armral_fft_cs16_cf32_cs16_ac_n_uun18( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { /* x and y are accessed through 32-bit pointers, so each indexed element covers one armral_cmplx_int16_t (assumed to be a pair of int16 values - TODO confirm). */ + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; /* Scalar multipliers; the values that appear in both signs are paired into {c, -c} vectors further down. */ + float v264 = -5.0000000000000000e-01F; + float v275 = -1.4999999999999998e+00F; + float v278 = 8.6602540378443871e-01F; + float v279 = -8.6602540378443871e-01F; + float v286 = 7.6604444311897801e-01F; + float v290 =
9.3969262078590832e-01F; + float v294 = -1.7364817766693039e-01F; + float v297 = 6.4278760968653925e-01F; + float v298 = -6.4278760968653925e-01F; + float v304 = -3.4202014332566888e-01F; + float v305 = 3.4202014332566888e-01F; + float v311 = 9.8480775301220802e-01F; + float v312 = -9.8480775301220802e-01F; /* Input loads begin here and continue below; vld1s_s16 is a helper defined outside this chunk. */ + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v89 = vld1s_s16(&v5[istride]); /* Broadcast or pair the scalars into two-lane vectors; v314 holds dir in both lanes. */ + float32x2_t v265 = (float32x2_t){v264, v264}; + float32x2_t v276 = (float32x2_t){v275, v275}; + float32x2_t v280 = (float32x2_t){v278, v279}; + float32x2_t v287 = (float32x2_t){v286, v286}; + float32x2_t v291 = (float32x2_t){v290, v290}; + float32x2_t v295 = (float32x2_t){v294, v294}; + float32x2_t v299 = (float32x2_t){v297, v298}; + float32x2_t v306 = (float32x2_t){v304, v305}; + float32x2_t v313 = (float32x2_t){v311, v312}; + float32x2_t v314 = (float32x2_t){v4, v4}; /* Widen the low int16 lanes to int32, then convert from fixed point with 15 fractional bits to float32. */ + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 9]); + int16x4_t v27 = vld1s_s16(&v5[istride * 2]); + int16x4_t v33 = vld1s_s16(&v5[istride * 11]); + int16x4_t v41 = vld1s_s16(&v5[istride * 4]); + int16x4_t v47 = vld1s_s16(&v5[istride * 13]); + int16x4_t v55 = vld1s_s16(&v5[istride * 6]); + int16x4_t v61 = vld1s_s16(&v5[istride * 15]); + int16x4_t v69 = vld1s_s16(&v5[istride * 8]); + int16x4_t v75 = vld1s_s16(&v5[istride * 17]); + int16x4_t v83 = vld1s_s16(&v5[istride * 10]); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + int16x4_t v97 = vld1s_s16(&v5[istride * 12]); + int16x4_t v103 = vld1s_s16(&v5[istride * 3]); + int16x4_t v111 = vld1s_s16(&v5[istride * 14]); + int16x4_t v117 = vld1s_s16(&v5[istride * 5]); + int16x4_t v125 = vld1s_s16(&v5[istride * 16]); + int16x4_t v131 = vld1s_s16(&v5[istride * 7]); /* Apply dir to the {c, -c} multipliers. */ + float32x2_t v282 = vmul_f32(v314, v280); + float32x2_t v301 = vmul_f32(v314, v299); + float32x2_t v308 = vmul_f32(v314, v306); + float32x2_t v315 = vmul_f32(v314, v313); + float32x2_t v20 =
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v76 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v75)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v118 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v117)), 15); + float32x2_t v126 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v125)), 15); + float32x2_t v132 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v131)), 15); /* Sum and difference of each pair of inputs whose indices differ by 9 (0 and 9, 2 and 11, ..., 16 and 7). */ + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v77 = vadd_f32(v70, v76); + float32x2_t v78 = vsub_f32(v70, v76); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v119 = vadd_f32(v112, v118); + float32x2_t v120 = vsub_f32(v112, v118); + float32x2_t v133 = vadd_f32(v126, v132); + float32x2_t v134 = vsub_f32(v126, v132); + float32x2_t v135 = vadd_f32(v35, v133); + float32x2_t v136 = vsub_f32(v35, v133); + float32x2_t v137 = vadd_f32(v119, v49); + float32x2_t v138 =
vsub_f32(v119, v49); + float32x2_t v139 = vadd_f32(v63, v105); + float32x2_t v140 = vsub_f32(v63, v105); + float32x2_t v141 = vadd_f32(v77, v91); + float32x2_t v142 = vsub_f32(v77, v91); + float32x2_t v239 = vadd_f32(v36, v134); + float32x2_t v240 = vsub_f32(v36, v134); + float32x2_t v241 = vadd_f32(v120, v50); + float32x2_t v242 = vsub_f32(v120, v50); + float32x2_t v243 = vadd_f32(v64, v106); + float32x2_t v244 = vsub_f32(v64, v106); + float32x2_t v245 = vadd_f32(v78, v92); + float32x2_t v246 = vsub_f32(v78, v92); + float32x2_t v143 = vadd_f32(v135, v137); + float32x2_t v147 = vadd_f32(v136, v138); + float32x2_t v149 = vsub_f32(v135, v137); + float32x2_t v150 = vsub_f32(v137, v141); + float32x2_t v151 = vsub_f32(v141, v135); + float32x2_t v152 = vsub_f32(v136, v138); + float32x2_t v153 = vsub_f32(v138, v142); + float32x2_t v154 = vsub_f32(v142, v136); + float32x2_t v173 = vmul_f32(v139, v276); /* vrev64_f32 swaps the two lanes; the swapped vectors are later multiplied by the dir-scaled {c, -c} pairs. */ + float32x2_t v179 = vrev64_f32(v140); + float32x2_t v247 = vadd_f32(v239, v241); + float32x2_t v251 = vadd_f32(v240, v242); + float32x2_t v253 = vsub_f32(v239, v241); + float32x2_t v254 = vsub_f32(v241, v245); + float32x2_t v255 = vsub_f32(v245, v239); + float32x2_t v256 = vsub_f32(v240, v242); + float32x2_t v257 = vsub_f32(v242, v246); + float32x2_t v258 = vsub_f32(v246, v240); + float32x2_t v277 = vmul_f32(v243, v276); + float32x2_t v283 = vrev64_f32(v244); + float32x2_t v144 = vadd_f32(v143, v141); + float32x2_t v148 = vadd_f32(v147, v142); + float32x2_t v180 = vmul_f32(v179, v282); + float32x2_t v184 = vmul_f32(v149, v287); + float32x2_t v188 = vmul_f32(v150, v291); + float32x2_t v192 = vmul_f32(v151, v295); + float32x2_t v198 = vrev64_f32(v152); + float32x2_t v205 = vrev64_f32(v153); + float32x2_t v212 = vrev64_f32(v154); + float32x2_t v248 = vadd_f32(v247, v245); + float32x2_t v252 = vadd_f32(v251, v246); + float32x2_t v284 = vmul_f32(v283, v282); + float32x2_t v288 = vmul_f32(v253, v287); + float32x2_t v292 = vmul_f32(v254, v291); + float32x2_t v296 = vmul_f32(v255,
v295); + float32x2_t v302 = vrev64_f32(v256); + float32x2_t v309 = vrev64_f32(v257); + float32x2_t v316 = vrev64_f32(v258); + float32x2_t v145 = vadd_f32(v144, v139); + float32x2_t v162 = vmul_f32(v144, v265); + float32x2_t v168 = vrev64_f32(v148); + float32x2_t v199 = vmul_f32(v198, v301); + float32x2_t v206 = vmul_f32(v205, v308); + float32x2_t v213 = vmul_f32(v212, v315); + float32x2_t v249 = vadd_f32(v248, v243); + float32x2_t v266 = vmul_f32(v248, v265); + float32x2_t v272 = vrev64_f32(v252); + float32x2_t v303 = vmul_f32(v302, v301); + float32x2_t v310 = vmul_f32(v309, v308); + float32x2_t v317 = vmul_f32(v316, v315); + float32x2_t v146 = vadd_f32(v145, v21); + float32x2_t v169 = vmul_f32(v168, v282); + float32x2_t v214 = vadd_f32(v162, v162); + float32x2_t v227 = vadd_f32(v180, v199); + float32x2_t v229 = vsub_f32(v180, v206); + float32x2_t v231 = vsub_f32(v180, v199); + float32x2_t v250 = vadd_f32(v249, v22); + float32x2_t v273 = vmul_f32(v272, v282); + float32x2_t v318 = vadd_f32(v266, v266); + float32x2_t v331 = vadd_f32(v284, v303); + float32x2_t v333 = vsub_f32(v284, v310); + float32x2_t v335 = vsub_f32(v284, v303); + float32x2_t v215 = vadd_f32(v214, v162); + float32x2_t v219 = vadd_f32(v146, v173); + float32x2_t v228 = vadd_f32(v227, v206); + float32x2_t v230 = vadd_f32(v229, v213); + float32x2_t v232 = vsub_f32(v231, v213); + float32x2_t v319 = vadd_f32(v318, v266); + float32x2_t v323 = vadd_f32(v250, v277); + float32x2_t v332 = vadd_f32(v331, v310); + float32x2_t v334 = vadd_f32(v333, v317); + float32x2_t v336 = vsub_f32(v335, v317); /* Convert float32 to fixed point with 15 fractional bits, pad with zeros to four lanes, and narrow to int16 with saturation. */ + int16x4_t v345 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v146, 15), (int32x2_t){0, 0})); + int16x4_t v351 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v250, 15), (int32x2_t){0, 0})); + float32x2_t v216 = vadd_f32(v146, v215); + float32x2_t v220 = vadd_f32(v219, v214); + float32x2_t v320 = vadd_f32(v250, v319); + float32x2_t v324 = vadd_f32(v323, v318); /* Store lane 0 of the 32-bit view, i.e. the two narrowed int16 values, as one word. */ + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v345), 0); +
v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v351), 0); + float32x2_t v217 = vadd_f32(v216, v169); + float32x2_t v218 = vsub_f32(v216, v169); + float32x2_t v221 = vadd_f32(v220, v184); + float32x2_t v223 = vsub_f32(v220, v188); + float32x2_t v225 = vsub_f32(v220, v184); + float32x2_t v321 = vadd_f32(v320, v273); + float32x2_t v322 = vsub_f32(v320, v273); + float32x2_t v325 = vadd_f32(v324, v288); + float32x2_t v327 = vsub_f32(v324, v292); + float32x2_t v329 = vsub_f32(v324, v288); + float32x2_t v222 = vadd_f32(v221, v188); + float32x2_t v224 = vadd_f32(v223, v192); + float32x2_t v226 = vsub_f32(v225, v192); + float32x2_t v326 = vadd_f32(v325, v292); + float32x2_t v328 = vadd_f32(v327, v296); + float32x2_t v330 = vsub_f32(v329, v296); + int16x4_t v381 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v218, 15), (int32x2_t){0, 0})); + int16x4_t v387 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v322, 15), (int32x2_t){0, 0})); + int16x4_t v417 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v217, 15), (int32x2_t){0, 0})); + int16x4_t v423 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v321, 15), (int32x2_t){0, 0})); + float32x2_t v233 = vadd_f32(v222, v228); + float32x2_t v234 = vsub_f32(v222, v228); + float32x2_t v235 = vadd_f32(v224, v230); + float32x2_t v236 = vsub_f32(v224, v230); + float32x2_t v237 = vadd_f32(v226, v232); + float32x2_t v238 = vsub_f32(v226, v232); + float32x2_t v337 = vadd_f32(v326, v332); + float32x2_t v338 = vsub_f32(v326, v332); + float32x2_t v339 = vadd_f32(v328, v334); + float32x2_t v340 = vsub_f32(v328, v334); + float32x2_t v341 = vadd_f32(v330, v336); + float32x2_t v342 = vsub_f32(v330, v336); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v381), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v387), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v417), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v423), 0); + int16x4_t v357 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v234, 15), (int32x2_t){0, 0})); +
int16x4_t v363 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v338, 15), (int32x2_t){0, 0})); + int16x4_t v369 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v235, 15), (int32x2_t){0, 0})); + int16x4_t v375 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v339, 15), (int32x2_t){0, 0})); + int16x4_t v393 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v238, 15), (int32x2_t){0, 0})); + int16x4_t v399 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v342, 15), (int32x2_t){0, 0})); + int16x4_t v405 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v237, 15), (int32x2_t){0, 0})); + int16x4_t v411 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v341, 15), (int32x2_t){0, 0})); + int16x4_t v429 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v236, 15), (int32x2_t){0, 0})); + int16x4_t v435 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v340, 15), (int32x2_t){0, 0})); + int16x4_t v441 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v233, 15), (int32x2_t){0, 0})); + int16x4_t v447 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v337, 15), (int32x2_t){0, 0})); /* Remaining output stores, each at an element offset that is a multiple of ostride. */ + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v357), 0); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v363), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v369), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v375), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v393), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v399), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v405), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v411), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v429), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v435), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v441), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v447), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun18( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int
howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v312 = -5.0000000000000000e-01F; + float v324 = -1.4999999999999998e+00F; + float v329 = -8.6602540378443871e-01F; + float v336 = 7.6604444311897801e-01F; + float v341 = 9.3969262078590832e-01F; + float v346 = -1.7364817766693039e-01F; + float v351 = -6.4278760968653925e-01F; + float v358 = 3.4202014332566888e-01F; + float v365 = -9.8480775301220802e-01F; + const int32_t *v646 = &v5[v0]; + int32_t *v759 = &v6[v2]; + int64_t v23 = v0 * 9; + int64_t v33 = v0 * 2; + int64_t v41 = v0 * 11; + int64_t v51 = v0 * 4; + int64_t v59 = v0 * 13; + int64_t v69 = v0 * 6; + int64_t v77 = v0 * 15; + int64_t v87 = v0 * 8; + int64_t v95 = v0 * 17; + int64_t v105 = v0 * 10; + int64_t v123 = v0 * 12; + int64_t v131 = v0 * 3; + int64_t v141 = v0 * 14; + int64_t v149 = v0 * 5; + int64_t v159 = v0 * 16; + int64_t v167 = v0 * 7; + float v332 = v4 * v329; + float v354 = v4 * v351; + float v361 = v4 * v358; + float v368 = v4 * v365; + int64_t v405 = v2 * 9; + int64_t v413 = v2 * 10; + int64_t v429 = v2 * 2; + int64_t v437 = v2 * 11; + int64_t v445 = v2 * 12; + int64_t v453 = v2 * 3; + int64_t v461 = v2 * 4; + int64_t v469 = v2 * 13; + int64_t v477 = v2 * 14; + int64_t v485 = v2 * 5; + int64_t v493 = v2 * 6; + int64_t v501 = v2 * 15; + int64_t v509 = v2 * 16; + int64_t v517 = v2 * 7; + int64_t v525 = v2 * 8; + int64_t v533 = v2 * 17; + const int32_t *v547 = &v5[0]; + svfloat32_t v715 = svdup_n_f32(v312); + svfloat32_t v717 = svdup_n_f32(v324); + svfloat32_t v719 = svdup_n_f32(v336); + svfloat32_t v720 = svdup_n_f32(v341); + svfloat32_t v721 = svdup_n_f32(v346); + int32_t *v732 = &v6[0]; + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v646[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v556 = &v5[v23]; + 
const int32_t *v565 = &v5[v33]; + const int32_t *v574 = &v5[v41]; + const int32_t *v583 = &v5[v51]; + const int32_t *v592 = &v5[v59]; + const int32_t *v601 = &v5[v69]; + const int32_t *v610 = &v5[v77]; + const int32_t *v619 = &v5[v87]; + const int32_t *v628 = &v5[v95]; + const int32_t *v637 = &v5[v105]; + const int32_t *v655 = &v5[v123]; + const int32_t *v664 = &v5[v131]; + const int32_t *v673 = &v5[v141]; + const int32_t *v682 = &v5[v149]; + const int32_t *v691 = &v5[v159]; + const int32_t *v700 = &v5[v167]; + svfloat32_t v718 = svdup_n_f32(v332); + svfloat32_t v722 = svdup_n_f32(v354); + svfloat32_t v723 = svdup_n_f32(v361); + svfloat32_t v724 = svdup_n_f32(v368); + int32_t *v741 = &v6[v405]; + int32_t *v750 = &v6[v413]; + int32_t *v768 = &v6[v429]; + int32_t *v777 = &v6[v437]; + int32_t *v786 = &v6[v445]; + int32_t *v795 = &v6[v453]; + int32_t *v804 = &v6[v461]; + int32_t *v813 = &v6[v469]; + int32_t *v822 = &v6[v477]; + int32_t *v831 = &v6[v485]; + int32_t *v840 = &v6[v493]; + int32_t *v849 = &v6[v501]; + int32_t *v858 = &v6[v509]; + int32_t *v867 = &v6[v517]; + int32_t *v876 = &v6[v525]; + int32_t *v885 = &v6[v533]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v547[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v556[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v565[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v574[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v583[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v592[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v601[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v610[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v619[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v628[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v111 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v637[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v655[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v664[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v673[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v155 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v682[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v165 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v691[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v173 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v700[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = 
svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v165, v173); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v165, v173); + svfloat32_t v176 = svadd_f32_x(svptrue_b32(), v48, v174); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v48, v174); + svfloat32_t v178 = svadd_f32_x(svptrue_b32(), v156, v66); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v156, v66); + svfloat32_t v180 = svadd_f32_x(svptrue_b32(), v84, v138); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v84, v138); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v102, v120); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v102, v120); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v49, v175); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v49, v175); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v157, v67); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v157, v67); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v85, v139); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v85, v139); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v103, v121); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v103, v121); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v188 = 
svadd_f32_x(svptrue_b32(), v177, v179); + svfloat32_t v190 = svsub_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v191 = svsub_f32_x(svptrue_b32(), v178, v182); + svfloat32_t v192 = svsub_f32_x(svptrue_b32(), v182, v176); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v177, v179); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v179, v183); + svfloat32_t v195 = svsub_f32_x(svptrue_b32(), v183, v177); + svfloat32_t zero224 = svdup_n_f32(0); + svfloat32_t v224 = svcmla_f32_x(pred_full, zero224, v718, v181, 90); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v300 = svsub_f32_x(svptrue_b32(), v286, v288); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v288, v292); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v292, v286); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v287, v289); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v289, v293); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v293, v287); + svfloat32_t zero334 = svdup_n_f32(0); + svfloat32_t v334 = svcmla_f32_x(pred_full, zero334, v718, v291, 90); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v184, v182); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v188, v183); + svfloat32_t zero246 = svdup_n_f32(0); + svfloat32_t v246 = svcmla_f32_x(pred_full, zero246, v722, v193, 90); + svfloat32_t zero253 = svdup_n_f32(0); + svfloat32_t v253 = svcmla_f32_x(pred_full, zero253, v723, v194, 90); + svfloat32_t zero260 = svdup_n_f32(0); + svfloat32_t v260 = svcmla_f32_x(pred_full, zero260, v724, v195, 90); + svfloat32_t v295 = svadd_f32_x(svptrue_b32(), v294, v292); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v298, v293); + svfloat32_t zero356 = svdup_n_f32(0); + svfloat32_t v356 = svcmla_f32_x(pred_full, zero356, v722, v303, 90); + svfloat32_t zero363 = svdup_n_f32(0); + svfloat32_t v363 = svcmla_f32_x(pred_full, zero363, v723, v304, 90); + svfloat32_t zero370 = svdup_n_f32(0); + svfloat32_t v370 = svcmla_f32_x(pred_full, 
zero370, v724, v305, 90); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v185, v180); + svfloat32_t v205 = svmul_f32_x(svptrue_b32(), v185, v715); + svfloat32_t zero212 = svdup_n_f32(0); + svfloat32_t v212 = svcmla_f32_x(pred_full, zero212, v718, v189, 90); + svfloat32_t v274 = svadd_f32_x(svptrue_b32(), v224, v246); + svfloat32_t v276 = svsub_f32_x(svptrue_b32(), v224, v253); + svfloat32_t v278 = svsub_f32_x(svptrue_b32(), v224, v246); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v295, v290); + svfloat32_t v315 = svmul_f32_x(svptrue_b32(), v295, v715); + svfloat32_t zero322 = svdup_n_f32(0); + svfloat32_t v322 = svcmla_f32_x(pred_full, zero322, v718, v299, 90); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v334, v356); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v334, v363); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v334, v356); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v186, v30); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v205, v205); + svfloat32_t v275 = svadd_f32_x(svptrue_b32(), v274, v253); + svfloat32_t v277 = svadd_f32_x(svptrue_b32(), v276, v260); + svfloat32_t v279 = svsub_f32_x(svptrue_b32(), v278, v260); + svfloat32_t v297 = svadd_f32_x(svptrue_b32(), v296, v31); + svfloat32_t v371 = svadd_f32_x(svptrue_b32(), v315, v315); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v384, v363); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v386, v370); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v388, v370); + svfloat32_t v262 = svmla_f32_x(pred_full, v261, v185, v715); + svfloat32_t v266 = svmla_f32_x(pred_full, v187, v180, v717); + svfloat32_t v372 = svmla_f32_x(pred_full, v371, v295, v715); + svfloat32_t v376 = svmla_f32_x(pred_full, v297, v290, v717); + svint16_t v398 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v187, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v406 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v297, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v187, v262); + svfloat32_t v267 = svadd_f32_x(svptrue_b32(), v266, v261); + svfloat32_t v373 = svadd_f32_x(svptrue_b32(), v297, v372); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v376, v371); + svst1w_u64(pred_full, (unsigned *)(v732), svreinterpret_u64_s16(v398)); + svst1w_u64(pred_full, (unsigned *)(v741), svreinterpret_u64_s16(v406)); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v263, v212); + svfloat32_t v265 = svsub_f32_x(svptrue_b32(), v263, v212); + svfloat32_t v268 = svmla_f32_x(pred_full, v267, v190, v719); + svfloat32_t v270 = svmls_f32_x(pred_full, v267, v191, v720); + svfloat32_t v272 = svmls_f32_x(pred_full, v267, v190, v719); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v373, v322); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v373, v322); + svfloat32_t v378 = svmla_f32_x(pred_full, v377, v300, v719); + svfloat32_t v380 = svmls_f32_x(pred_full, v377, v301, v720); + svfloat32_t v382 = svmls_f32_x(pred_full, v377, v300, v719); + svfloat32_t v269 = svmla_f32_x(pred_full, v268, v191, v720); + svfloat32_t v271 = svmla_f32_x(pred_full, v270, v192, v721); + svfloat32_t v273 = svmls_f32_x(pred_full, v272, v192, v721); + svfloat32_t v379 = svmla_f32_x(pred_full, v378, v301, v720); + svfloat32_t v381 = svmla_f32_x(pred_full, v380, v302, v721); + svfloat32_t v383 = svmls_f32_x(pred_full, v382, v302, v721); + svint16_t v446 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v265, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v454 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v375, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v494 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v264, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v502 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v374, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v269, v275); + svfloat32_t v281 = svsub_f32_x(svptrue_b32(), v269, v275); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v271, v277); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v271, v277); + svfloat32_t v284 = svadd_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v285 = svsub_f32_x(svptrue_b32(), v273, v279); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v379, v385); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v379, v385); + svfloat32_t v392 = svadd_f32_x(svptrue_b32(), v381, v387); + svfloat32_t v393 = svsub_f32_x(svptrue_b32(), v381, v387); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v383, v389); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v383, v389); + svst1w_u64(pred_full, (unsigned *)(v786), svreinterpret_u64_s16(v446)); + svst1w_u64(pred_full, (unsigned *)(v795), svreinterpret_u64_s16(v454)); + svst1w_u64(pred_full, (unsigned *)(v840), svreinterpret_u64_s16(v494)); + svst1w_u64(pred_full, (unsigned *)(v849), svreinterpret_u64_s16(v502)); + svint16_t v414 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v281, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v422 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v391, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v430 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v282, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v438 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v392, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v462 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v285, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v470 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v395, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v478 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v284, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v486 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v394, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v510 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v283, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v518 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v393, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v526 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v280, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v534 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v390, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v750), svreinterpret_u64_s16(v414)); + svst1w_u64(pred_full, (unsigned *)(v759), svreinterpret_u64_s16(v422)); + svst1w_u64(pred_full, (unsigned *)(v768), svreinterpret_u64_s16(v430)); + svst1w_u64(pred_full, (unsigned *)(v777), svreinterpret_u64_s16(v438)); + svst1w_u64(pred_full, (unsigned *)(v804), svreinterpret_u64_s16(v462)); + svst1w_u64(pred_full, (unsigned *)(v813), svreinterpret_u64_s16(v470)); + svst1w_u64(pred_full, (unsigned *)(v822), svreinterpret_u64_s16(v478)); + svst1w_u64(pred_full, (unsigned *)(v831), svreinterpret_u64_s16(v486)); + svst1w_u64(pred_full, (unsigned *)(v858), svreinterpret_u64_s16(v510)); + svst1w_u64(pred_full, (unsigned *)(v867), svreinterpret_u64_s16(v518)); + svst1w_u64(pred_full, (unsigned *)(v876), svreinterpret_u64_s16(v526)); + svst1w_u64(pred_full, (unsigned *)(v885), svreinterpret_u64_s16(v534)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun19( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v211 = -1.0555555555555556e+00F; + float v215 = 1.7752228513927079e-01F; + float v219 = -1.2820077502191529e-01F; + float v223 = 4.9321510117355499e-02F; + float v227 = 5.7611011491005903e-01F; + float v231 = -7.4996449655536279e-01F; + float v235 = -1.7385438164530381e-01F; + float v239 = -2.1729997561977314e+00F; + float v243 = -1.7021211726914738e+00F; + float v247 = 4.7087858350625778e-01F; + float v251 = -2.0239400846888440e+00F; + float v255 = 
1.0551641201664090e-01F; + float v259 = 2.1294564967054850e+00F; + float v263 = -7.5087543897371167e-01F; + float v267 = 1.4812817695157160e-01F; + float v271 = 8.9900361592528333e-01F; + float v275 = -6.2148246772602778e-01F; + float v279 = -7.9869352098712687e-01F; + float v283 = -4.7339199623771833e-01F; + float v286 = -2.4216105241892630e-01F; + float v287 = 2.4216105241892630e-01F; + float v293 = -5.9368607967505101e-02F; + float v294 = 5.9368607967505101e-02F; + float v300 = 1.2578688255176201e-02F; + float v301 = -1.2578688255176201e-02F; + float v307 = -4.6789919712328903e-02F; + float v308 = 4.6789919712328903e-02F; + float v314 = -9.3750121913782358e-01F; + float v315 = 9.3750121913782358e-01F; + float v321 = -5.0111537043352902e-02F; + float v322 = 5.0111537043352902e-02F; + float v328 = -9.8761275618117661e-01F; + float v329 = 9.8761275618117661e-01F; + float v335 = -1.1745786501205959e+00F; + float v336 = 1.1745786501205959e+00F; + float v342 = 1.1114482296234993e+00F; + float v343 = -1.1114482296234993e+00F; + float v349 = 2.2860268797440955e+00F; + float v350 = -2.2860268797440955e+00F; + float v356 = 2.6420523257930939e-01F; + float v357 = -2.6420523257930939e-01F; + float v363 = 2.1981792779352136e+00F; + float v364 = -2.1981792779352136e+00F; + float v370 = 1.9339740453559042e+00F; + float v371 = -1.9339740453559042e+00F; + float v377 = -7.4825847091254893e-01F; + float v378 = 7.4825847091254893e-01F; + float v384 = -4.7820835642768872e-01F; + float v385 = 4.7820835642768872e-01F; + float v391 = 2.7005011448486022e-01F; + float v392 = -2.7005011448486022e-01F; + float v398 = -3.4642356159542270e-01F; + float v399 = 3.4642356159542270e-01F; + float v405 = -8.3485429360688279e-01F; + float v406 = 8.3485429360688279e-01F; + float v412 = -3.9375928506743518e-01F; + float v413 = 3.9375928506743518e-01F; + int16x4_t v13 = vld1s_s16(&v5[istride]); + int16x4_t v155 = vld1s_s16(&v5[0]); + float32x2_t v212 = (float32x2_t){v211, v211}; + float32x2_t v216 = 
(float32x2_t){v215, v215}; + float32x2_t v220 = (float32x2_t){v219, v219}; + float32x2_t v224 = (float32x2_t){v223, v223}; + float32x2_t v228 = (float32x2_t){v227, v227}; + float32x2_t v232 = (float32x2_t){v231, v231}; + float32x2_t v236 = (float32x2_t){v235, v235}; + float32x2_t v240 = (float32x2_t){v239, v239}; + float32x2_t v244 = (float32x2_t){v243, v243}; + float32x2_t v248 = (float32x2_t){v247, v247}; + float32x2_t v252 = (float32x2_t){v251, v251}; + float32x2_t v256 = (float32x2_t){v255, v255}; + float32x2_t v260 = (float32x2_t){v259, v259}; + float32x2_t v264 = (float32x2_t){v263, v263}; + float32x2_t v268 = (float32x2_t){v267, v267}; + float32x2_t v272 = (float32x2_t){v271, v271}; + float32x2_t v276 = (float32x2_t){v275, v275}; + float32x2_t v280 = (float32x2_t){v279, v279}; + float32x2_t v284 = (float32x2_t){v283, v283}; + float32x2_t v288 = (float32x2_t){v286, v287}; + float32x2_t v295 = (float32x2_t){v293, v294}; + float32x2_t v302 = (float32x2_t){v300, v301}; + float32x2_t v309 = (float32x2_t){v307, v308}; + float32x2_t v316 = (float32x2_t){v314, v315}; + float32x2_t v323 = (float32x2_t){v321, v322}; + float32x2_t v330 = (float32x2_t){v328, v329}; + float32x2_t v337 = (float32x2_t){v335, v336}; + float32x2_t v344 = (float32x2_t){v342, v343}; + float32x2_t v351 = (float32x2_t){v349, v350}; + float32x2_t v358 = (float32x2_t){v356, v357}; + float32x2_t v365 = (float32x2_t){v363, v364}; + float32x2_t v372 = (float32x2_t){v370, v371}; + float32x2_t v379 = (float32x2_t){v377, v378}; + float32x2_t v386 = (float32x2_t){v384, v385}; + float32x2_t v393 = (float32x2_t){v391, v392}; + float32x2_t v400 = (float32x2_t){v398, v399}; + float32x2_t v407 = (float32x2_t){v405, v406}; + float32x2_t v414 = (float32x2_t){v412, v413}; + float32x2_t v415 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 18]); + int16x4_t v27 = vld1s_s16(&v5[istride * 2]); + int16x4_t v33 = 
vld1s_s16(&v5[istride * 17]); + int16x4_t v41 = vld1s_s16(&v5[istride * 4]); + int16x4_t v47 = vld1s_s16(&v5[istride * 15]); + int16x4_t v55 = vld1s_s16(&v5[istride * 8]); + int16x4_t v61 = vld1s_s16(&v5[istride * 11]); + int16x4_t v69 = vld1s_s16(&v5[istride * 16]); + int16x4_t v75 = vld1s_s16(&v5[istride * 3]); + int16x4_t v83 = vld1s_s16(&v5[istride * 13]); + int16x4_t v89 = vld1s_s16(&v5[istride * 6]); + int16x4_t v97 = vld1s_s16(&v5[istride * 7]); + int16x4_t v103 = vld1s_s16(&v5[istride * 12]); + int16x4_t v111 = vld1s_s16(&v5[istride * 14]); + int16x4_t v117 = vld1s_s16(&v5[istride * 5]); + int16x4_t v125 = vld1s_s16(&v5[istride * 9]); + int16x4_t v131 = vld1s_s16(&v5[istride * 10]); + float32x2_t v156 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v155)), 15); + float32x2_t v290 = vmul_f32(v415, v288); + float32x2_t v297 = vmul_f32(v415, v295); + float32x2_t v304 = vmul_f32(v415, v302); + float32x2_t v311 = vmul_f32(v415, v309); + float32x2_t v318 = vmul_f32(v415, v316); + float32x2_t v325 = vmul_f32(v415, v323); + float32x2_t v332 = vmul_f32(v415, v330); + float32x2_t v339 = vmul_f32(v415, v337); + float32x2_t v346 = vmul_f32(v415, v344); + float32x2_t v353 = vmul_f32(v415, v351); + float32x2_t v360 = vmul_f32(v415, v358); + float32x2_t v367 = vmul_f32(v415, v365); + float32x2_t v374 = vmul_f32(v415, v372); + float32x2_t v381 = vmul_f32(v415, v379); + float32x2_t v388 = vmul_f32(v415, v386); + float32x2_t v395 = vmul_f32(v415, v393); + float32x2_t v402 = vmul_f32(v415, v400); + float32x2_t v409 = vmul_f32(v415, v407); + float32x2_t v416 = vmul_f32(v415, v414); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v76 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v75)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v118 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v117)), 15); + float32x2_t v126 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v125)), 15); + float32x2_t v132 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v131)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v34, v28); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v62, v56); + float32x2_t v77 = vadd_f32(v70, v76); + float32x2_t v78 = vsub_f32(v70, v76); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v90, v84); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v119 = vadd_f32(v112, v118); + float32x2_t v120 = vsub_f32(v118, v112); + float32x2_t v133 = vadd_f32(v126, v132); + float32x2_t v134 = vsub_f32(v126, v132); + float32x2_t v135 = vsub_f32(v21, v105); + float32x2_t v136 = vsub_f32(v35, v119); + float32x2_t v137 = vsub_f32(v49, v133); + float32x2_t v138 = vsub_f32(v63, v105); + float32x2_t v139 = vsub_f32(v77, v119); + float32x2_t v140 = vsub_f32(v91, v133); + float32x2_t v141 = vadd_f32(v21, v63); + float32x2_t v143 = vadd_f32(v35, v77); + float32x2_t v145 = vadd_f32(v49, v91); + float32x2_t v174 = vsub_f32(v22, v106); + float32x2_t 
v175 = vsub_f32(v36, v120); + float32x2_t v176 = vsub_f32(v50, v134); + float32x2_t v177 = vsub_f32(v64, v106); + float32x2_t v178 = vsub_f32(v78, v120); + float32x2_t v179 = vsub_f32(v92, v134); + float32x2_t v180 = vadd_f32(v22, v64); + float32x2_t v182 = vadd_f32(v36, v78); + float32x2_t v184 = vadd_f32(v50, v92); + float32x2_t v142 = vadd_f32(v141, v105); + float32x2_t v144 = vadd_f32(v143, v119); + float32x2_t v146 = vadd_f32(v145, v133); + float32x2_t v147 = vadd_f32(v135, v137); + float32x2_t v148 = vadd_f32(v138, v140); + float32x2_t v164 = vsub_f32(v135, v138); + float32x2_t v165 = vsub_f32(v137, v140); + float32x2_t v181 = vadd_f32(v180, v106); + float32x2_t v183 = vadd_f32(v182, v120); + float32x2_t v185 = vadd_f32(v184, v134); + float32x2_t v186 = vadd_f32(v174, v176); + float32x2_t v187 = vadd_f32(v177, v179); + float32x2_t v196 = vsub_f32(v174, v177); + float32x2_t v197 = vsub_f32(v176, v179); + float32x2_t v241 = vmul_f32(v138, v240); + float32x2_t v253 = vmul_f32(v140, v252); + float32x2_t v261 = vmul_f32(v137, v260); + float32x2_t v340 = vrev64_f32(v177); + float32x2_t v354 = vrev64_f32(v174); + float32x2_t v361 = vrev64_f32(v179); + float32x2_t v375 = vrev64_f32(v176); + float32x2_t v149 = vadd_f32(v142, v144); + float32x2_t v158 = vadd_f32(v148, v139); + float32x2_t v159 = vadd_f32(v147, v136); + float32x2_t v161 = vsub_f32(v148, v139); + float32x2_t v162 = vsub_f32(v147, v136); + float32x2_t v166 = vsub_f32(v135, v165); + float32x2_t v168 = vadd_f32(v164, v140); + float32x2_t v171 = vsub_f32(v142, v146); + float32x2_t v172 = vsub_f32(v144, v146); + float32x2_t v188 = vadd_f32(v181, v183); + float32x2_t v190 = vadd_f32(v187, v178); + float32x2_t v191 = vadd_f32(v186, v175); + float32x2_t v193 = vsub_f32(v187, v178); + float32x2_t v194 = vsub_f32(v186, v175); + float32x2_t v198 = vsub_f32(v174, v197); + float32x2_t v200 = vadd_f32(v196, v179); + float32x2_t v203 = vsub_f32(v181, v185); + float32x2_t v204 = vsub_f32(v183, v185); + float32x2_t v245 
= vmul_f32(v164, v244); + float32x2_t v257 = vmul_f32(v165, v256); + float32x2_t v341 = vmul_f32(v340, v339); + float32x2_t v347 = vrev64_f32(v196); + float32x2_t v362 = vmul_f32(v361, v360); + float32x2_t v368 = vrev64_f32(v197); + float32x2_t v376 = vmul_f32(v375, v374); + float32x2_t v150 = vadd_f32(v149, v146); + float32x2_t v160 = vsub_f32(v159, v158); + float32x2_t v163 = vsub_f32(v162, v161); + float32x2_t v167 = vsub_f32(v166, v139); + float32x2_t v169 = vsub_f32(v168, v136); + float32x2_t v173 = vadd_f32(v171, v172); + float32x2_t v189 = vadd_f32(v188, v185); + float32x2_t v192 = vsub_f32(v191, v190); + float32x2_t v195 = vsub_f32(v194, v193); + float32x2_t v199 = vsub_f32(v198, v178); + float32x2_t v201 = vsub_f32(v200, v175); + float32x2_t v205 = vadd_f32(v203, v204); + float32x2_t v217 = vmul_f32(v158, v216); + float32x2_t v221 = vmul_f32(v159, v220); + float32x2_t v229 = vmul_f32(v161, v228); + float32x2_t v233 = vmul_f32(v162, v232); + float32x2_t v277 = vmul_f32(v171, v276); + float32x2_t v281 = vmul_f32(v172, v280); + float32x2_t v298 = vrev64_f32(v190); + float32x2_t v305 = vrev64_f32(v191); + float32x2_t v319 = vrev64_f32(v193); + float32x2_t v326 = vrev64_f32(v194); + float32x2_t v348 = vmul_f32(v347, v346); + float32x2_t v369 = vmul_f32(v368, v367); + float32x2_t v403 = vrev64_f32(v203); + float32x2_t v410 = vrev64_f32(v204); + float32x2_t v157 = vadd_f32(v156, v150); + float32x2_t v170 = vsub_f32(v167, v169); + float32x2_t v202 = vsub_f32(v199, v201); + float32x2_t v213 = vmul_f32(v150, v212); + float32x2_t v225 = vmul_f32(v160, v224); + float32x2_t v237 = vmul_f32(v163, v236); + float32x2_t v265 = vmul_f32(v167, v264); + float32x2_t v269 = vmul_f32(v169, v268); + float32x2_t v285 = vmul_f32(v173, v284); + float32x2_t v291 = vrev64_f32(v189); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v306 = vmul_f32(v305, v304); + float32x2_t v312 = vrev64_f32(v192); + float32x2_t v320 = vmul_f32(v319, v318); + float32x2_t v327 = vmul_f32(v326, 
v325); + float32x2_t v333 = vrev64_f32(v195); + float32x2_t v382 = vrev64_f32(v199); + float32x2_t v389 = vrev64_f32(v201); + float32x2_t v404 = vmul_f32(v403, v402); + float32x2_t v411 = vmul_f32(v410, v409); + float32x2_t v417 = vrev64_f32(v205); + float32x2_t v419 = vadd_f32(v217, v221); + float32x2_t v420 = vadd_f32(v229, v233); + float32x2_t v273 = vmul_f32(v170, v272); + float32x2_t v292 = vmul_f32(v291, v290); + float32x2_t v313 = vmul_f32(v312, v311); + float32x2_t v334 = vmul_f32(v333, v332); + float32x2_t v383 = vmul_f32(v382, v381); + float32x2_t v390 = vmul_f32(v389, v388); + float32x2_t v396 = vrev64_f32(v202); + float32x2_t v418 = vmul_f32(v417, v416); + float32x2_t v422 = vadd_f32(v419, v420); + float32x2_t v423 = vadd_f32(v217, v225); + float32x2_t v424 = vadd_f32(v229, v237); + float32x2_t v441 = vsub_f32(v419, v420); + float32x2_t v443 = vsub_f32(v277, v285); + float32x2_t v444 = vsub_f32(v281, v285); + float32x2_t v445 = vadd_f32(v213, v157); + float32x2_t v450 = vadd_f32(v299, v306); + float32x2_t v451 = vadd_f32(v320, v327); + int16x4_t v506 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v157, 15), (int32x2_t){0, 0})); + float32x2_t v397 = vmul_f32(v396, v395); + float32x2_t v421 = vadd_f32(v269, v273); + float32x2_t v425 = vadd_f32(v265, v273); + float32x2_t v426 = vsub_f32(v241, v422); + float32x2_t v427 = vadd_f32(v423, v424); + float32x2_t v433 = vsub_f32(v423, v424); + float32x2_t v438 = vadd_f32(v422, v261); + float32x2_t v446 = vadd_f32(v445, v443); + float32x2_t v447 = vsub_f32(v445, v443); + float32x2_t v449 = vadd_f32(v445, v444); + float32x2_t v453 = vadd_f32(v450, v451); + float32x2_t v454 = vadd_f32(v299, v313); + float32x2_t v455 = vadd_f32(v320, v334); + float32x2_t v472 = vsub_f32(v450, v451); + float32x2_t v474 = vsub_f32(v404, v418); + float32x2_t v475 = vsub_f32(v411, v418); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v506), 0); + float32x2_t v428 = vsub_f32(v253, v425); + float32x2_t v429 = vadd_f32(v245, v421); + float32x2_t 
v431 = vadd_f32(v427, v257); + float32x2_t v434 = vadd_f32(v433, v421); + float32x2_t v435 = vadd_f32(v426, v427); + float32x2_t v442 = vadd_f32(v441, v425); + float32x2_t v448 = vsub_f32(v447, v444); + float32x2_t v452 = vadd_f32(v390, v397); + float32x2_t v456 = vadd_f32(v383, v397); + float32x2_t v457 = vsub_f32(v341, v453); + float32x2_t v458 = vadd_f32(v454, v455); + float32x2_t v464 = vsub_f32(v454, v455); + float32x2_t v469 = vadd_f32(v453, v376); + float32x2_t v476 = vadd_f32(v292, v474); + float32x2_t v477 = vsub_f32(v292, v474); + float32x2_t v479 = vadd_f32(v292, v475); + float32x2_t v430 = vadd_f32(v429, v426); + float32x2_t v432 = vadd_f32(v431, v428); + float32x2_t v436 = vfma_f32(v435, v135, v248); + float32x2_t v439 = vadd_f32(v438, v428); + float32x2_t v459 = vsub_f32(v362, v456); + float32x2_t v460 = vadd_f32(v348, v452); + float32x2_t v462 = vadd_f32(v458, v369); + float32x2_t v465 = vadd_f32(v464, v452); + float32x2_t v466 = vadd_f32(v457, v458); + float32x2_t v473 = vadd_f32(v472, v456); + float32x2_t v478 = vsub_f32(v477, v475); + float32x2_t v484 = vsub_f32(v442, v434); + float32x2_t v488 = vsub_f32(v449, v442); + float32x2_t v491 = vadd_f32(v434, v449); + float32x2_t v437 = vadd_f32(v436, v425); + float32x2_t v440 = vadd_f32(v439, v421); + float32x2_t v461 = vadd_f32(v460, v457); + float32x2_t v463 = vadd_f32(v462, v459); + float32x2_t v467 = vfma_f32(v466, v354, v353); + float32x2_t v470 = vadd_f32(v469, v459); + float32x2_t v485 = vadd_f32(v484, v449); + float32x2_t v489 = vadd_f32(v430, v446); + float32x2_t v490 = vadd_f32(v432, v448); + float32x2_t v496 = vsub_f32(v473, v465); + float32x2_t v500 = vsub_f32(v473, v479); + float32x2_t v503 = vadd_f32(v465, v479); + float32x2_t v468 = vadd_f32(v467, v456); + float32x2_t v471 = vadd_f32(v470, v452); + float32x2_t v480 = vsub_f32(v437, v430); + float32x2_t v482 = vsub_f32(v440, v432); + float32x2_t v486 = vsub_f32(v446, v437); + float32x2_t v487 = vsub_f32(v448, v440); + float32x2_t v497 = 
vadd_f32(v496, v479); + float32x2_t v501 = vadd_f32(v461, v476); + float32x2_t v502 = vadd_f32(v463, v478); + float32x2_t v524 = vsub_f32(v491, v503); + float32x2_t v531 = vadd_f32(v491, v503); + float32x2_t v538 = vadd_f32(v488, v500); + float32x2_t v545 = vsub_f32(v488, v500); + float32x2_t v481 = vadd_f32(v480, v446); + float32x2_t v483 = vadd_f32(v482, v448); + float32x2_t v492 = vsub_f32(v468, v461); + float32x2_t v494 = vsub_f32(v471, v463); + float32x2_t v498 = vsub_f32(v476, v468); + float32x2_t v499 = vsub_f32(v478, v471); + int16x4_t v527 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v524, 15), (int32x2_t){0, 0})); + int16x4_t v534 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v531, 15), (int32x2_t){0, 0})); + int16x4_t v541 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v538, 15), (int32x2_t){0, 0})); + int16x4_t v548 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v545, 15), (int32x2_t){0, 0})); + float32x2_t v552 = vadd_f32(v490, v502); + float32x2_t v559 = vsub_f32(v490, v502); + float32x2_t v566 = vadd_f32(v485, v497); + float32x2_t v573 = vsub_f32(v485, v497); + float32x2_t v608 = vsub_f32(v489, v501); + float32x2_t v615 = vadd_f32(v489, v501); + float32x2_t v493 = vadd_f32(v492, v476); + float32x2_t v495 = vadd_f32(v494, v478); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v527), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v534), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v541), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v548), 0); + int16x4_t v555 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v552, 15), (int32x2_t){0, 0})); + int16x4_t v562 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v559, 15), (int32x2_t){0, 0})); + int16x4_t v569 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v566, 15), (int32x2_t){0, 0})); + int16x4_t v576 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v573, 15), (int32x2_t){0, 0})); + float32x2_t v580 = vadd_f32(v487, v499); + float32x2_t v587 = vsub_f32(v487, v499); + float32x2_t v594 = 
vadd_f32(v486, v498); + float32x2_t v601 = vsub_f32(v486, v498); + int16x4_t v611 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v608, 15), (int32x2_t){0, 0})); + int16x4_t v618 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v615, 15), (int32x2_t){0, 0})); + float32x2_t v510 = vadd_f32(v481, v493); + float32x2_t v517 = vsub_f32(v481, v493); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v555), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v562), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v569), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v576), 0); + int16x4_t v583 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v580, 15), (int32x2_t){0, 0})); + int16x4_t v590 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v587, 15), (int32x2_t){0, 0})); + int16x4_t v597 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v594, 15), (int32x2_t){0, 0})); + int16x4_t v604 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v601, 15), (int32x2_t){0, 0})); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v611), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v618), 0); + float32x2_t v622 = vadd_f32(v483, v495); + float32x2_t v629 = vsub_f32(v483, v495); + int16x4_t v513 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v510, 15), (int32x2_t){0, 0})); + int16x4_t v520 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v517, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v583), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v590), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v597), 0); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v604), 0); + int16x4_t v625 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v622, 15), (int32x2_t){0, 0})); + int16x4_t v632 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v629, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v513), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v520), 0); + v6[ostride * 9] = 
vget_lane_s32(vreinterpret_s32_s16(v625), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v632), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +/* SVE variant of the cs16 -> cf32 -> cs16 kernel "uun19" (machine-generated; presumably a 19-point FFT butterfly - confirm against the generator). Reads 19 input samples from x at element offsets 0..18 * istride, converts them to float, combines them with the constant coefficients below and writes 19 output samples to y at element offsets 0..18 * ostride. x and y are accessed through int32_t pointers, i.e. one 32-bit word per sample (assumes armral_cmplx_int16_t is a packed pair of int16 values - TODO confirm). dir is multiplied into the coefficients used by the rotate-by-90 svcmla operations (presumably the transform direction sign - confirm). howmany is not referenced in this body. */void armral_fft_cs16_cf32_cs16_ac_n_uun19( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; +/* Predicate with only the first two 32-bit lanes active (SV_VL2); used for every load, conversion, svcmla/svmla and store below. */ svbool_t pred_full = svptrue_pat_b32(SV_VL2); +/* v255..v345: coefficients used as-is; they are broadcast into v910..v928 and consumed by svmul/svmla/svnmls. */ float v255 = -1.0555555555555556e+00F; + float v260 = 1.7752228513927079e-01F; + float v265 = -1.2820077502191529e-01F; + float v270 = 4.9321510117355499e-02F; + float v275 = 5.7611011491005903e-01F; + float v280 = -7.4996449655536279e-01F; + float v285 = -1.7385438164530381e-01F; + float v290 = -2.1729997561977314e+00F; + float v295 = -1.7021211726914738e+00F; + float v300 = 4.7087858350625778e-01F; + float v305 = -2.0239400846888440e+00F; + float v310 = 1.0551641201664090e-01F; + float v315 = 2.1294564967054850e+00F; + float v320 = -7.5087543897371167e-01F; + float v325 = 1.4812817695157160e-01F; + float v330 = 8.9900361592528333e-01F; + float v335 = -6.2148246772602778e-01F; + float v340 = -7.9869352098712687e-01F; + float v345 = -4.7339199623771833e-01F; +/* v350..v476: coefficients that are multiplied by dir further down (v353..v479) and then used as svcmla operands. */ float v350 = 2.4216105241892630e-01F; + float v357 = 5.9368607967505101e-02F; + float v364 = -1.2578688255176201e-02F; + float v371 = 4.6789919712328903e-02F; + float v378 = 9.3750121913782358e-01F; + float v385 = 5.0111537043352902e-02F; + float v392 = 9.8761275618117661e-01F; + float v399 = 1.1745786501205959e+00F; + float v406 = -1.1114482296234993e+00F; + float v413 = -2.2860268797440955e+00F; + float v420 = -2.6420523257930939e-01F; + float v427 = -2.1981792779352136e+00F; + float v434 = -1.9339740453559042e+00F; + float v441 = 7.4825847091254893e-01F; + float v448 = 4.7820835642768872e-01F; + float v455 = -2.7005011448486022e-01F; + float v462 = 3.4642356159542270e-01F; + float v469 =
8.3485429360688279e-01F; + float v476 = 3.9375928506743518e-01F; + const int32_t *v743 = &v5[v0]; + int32_t *v964 = &v6[v2]; +/* Input element offsets: multiples of istride, in the order in which the butterfly pairs the inputs. */ int64_t v23 = v0 * 18; + int64_t v33 = v0 * 2; + int64_t v41 = v0 * 17; + int64_t v51 = v0 * 4; + int64_t v59 = v0 * 15; + int64_t v69 = v0 * 8; + int64_t v77 = v0 * 11; + int64_t v87 = v0 * 16; + int64_t v95 = v0 * 3; + int64_t v105 = v0 * 13; + int64_t v113 = v0 * 6; + int64_t v123 = v0 * 7; + int64_t v131 = v0 * 12; + int64_t v141 = v0 * 14; + int64_t v149 = v0 * 5; + int64_t v159 = v0 * 9; + int64_t v167 = v0 * 10; +/* Apply dir to the second group of coefficients. */ float v353 = v4 * v350; + float v360 = v4 * v357; + float v367 = v4 * v364; + float v374 = v4 * v371; + float v381 = v4 * v378; + float v388 = v4 * v385; + float v395 = v4 * v392; + float v402 = v4 * v399; + float v409 = v4 * v406; + float v416 = v4 * v413; + float v423 = v4 * v420; + float v430 = v4 * v427; + float v437 = v4 * v434; + float v444 = v4 * v441; + float v451 = v4 * v448; + float v458 = v4 * v455; + float v465 = v4 * v462; + float v472 = v4 * v469; + float v479 = v4 * v476; +/* Output element offsets: multiples of ostride. */ int64_t v586 = v2 * 18; + int64_t v595 = v2 * 2; + int64_t v604 = v2 * 17; + int64_t v613 = v2 * 3; + int64_t v622 = v2 * 16; + int64_t v631 = v2 * 4; + int64_t v640 = v2 * 15; + int64_t v649 = v2 * 5; + int64_t v658 = v2 * 14; + int64_t v667 = v2 * 6; + int64_t v676 = v2 * 13; + int64_t v685 = v2 * 7; + int64_t v694 = v2 * 12; + int64_t v703 = v2 * 8; + int64_t v712 = v2 * 11; + int64_t v721 = v2 * 9; + int64_t v730 = v2 * 10; + const int32_t *v906 = &v5[0]; +/* Broadcast the scalar coefficients into SVE vectors. */ svfloat32_t v910 = svdup_n_f32(v255); + svfloat32_t v911 = svdup_n_f32(v260); + svfloat32_t v912 = svdup_n_f32(v265); + svfloat32_t v913 = svdup_n_f32(v270); + svfloat32_t v914 = svdup_n_f32(v275); + svfloat32_t v915 = svdup_n_f32(v280); + svfloat32_t v916 = svdup_n_f32(v285); + svfloat32_t v917 = svdup_n_f32(v290); + svfloat32_t v918 = svdup_n_f32(v295); + svfloat32_t v919 = svdup_n_f32(v300); + svfloat32_t v920 = svdup_n_f32(v305); + svfloat32_t v921 = svdup_n_f32(v310); +
svfloat32_t v922 = svdup_n_f32(v315); + svfloat32_t v923 = svdup_n_f32(v320); + svfloat32_t v924 = svdup_n_f32(v325); + svfloat32_t v925 = svdup_n_f32(v330); + svfloat32_t v926 = svdup_n_f32(v335); + svfloat32_t v927 = svdup_n_f32(v340); + svfloat32_t v928 = svdup_n_f32(v345); + int32_t *v955 = &v6[0]; +/* Input load pattern (repeated for each of the 19 inputs): svld1sh_s32 loads int16 values sign-extended into the active 32-bit lanes, svcvt converts them to float, and the result is scaled by 1 / 2^15. */ svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v743[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v752 = &v5[v23]; + const int32_t *v761 = &v5[v33]; + const int32_t *v770 = &v5[v41]; + const int32_t *v779 = &v5[v51]; + const int32_t *v788 = &v5[v59]; + const int32_t *v797 = &v5[v69]; + const int32_t *v806 = &v5[v77]; + const int32_t *v815 = &v5[v87]; + const int32_t *v824 = &v5[v95]; + const int32_t *v833 = &v5[v105]; + const int32_t *v842 = &v5[v113]; + const int32_t *v851 = &v5[v123]; + const int32_t *v860 = &v5[v131]; + const int32_t *v869 = &v5[v141]; + const int32_t *v878 = &v5[v149]; + const int32_t *v887 = &v5[v159]; + const int32_t *v896 = &v5[v167]; + svfloat32_t v929 = svdup_n_f32(v353); + svfloat32_t v930 = svdup_n_f32(v360); + svfloat32_t v931 = svdup_n_f32(v367); + svfloat32_t v932 = svdup_n_f32(v374); + svfloat32_t v933 = svdup_n_f32(v381); + svfloat32_t v934 = svdup_n_f32(v388); + svfloat32_t v935 = svdup_n_f32(v395); + svfloat32_t v936 = svdup_n_f32(v402); + svfloat32_t v937 = svdup_n_f32(v409); + svfloat32_t v938 = svdup_n_f32(v416); + svfloat32_t v939 = svdup_n_f32(v423); + svfloat32_t v940 = svdup_n_f32(v430); + svfloat32_t v941 = svdup_n_f32(v437); + svfloat32_t v942 = svdup_n_f32(v444); + svfloat32_t v943 = svdup_n_f32(v451); + svfloat32_t v944 = svdup_n_f32(v458); + svfloat32_t v945 = svdup_n_f32(v465); + svfloat32_t v946 = svdup_n_f32(v472); + svfloat32_t v947 = svdup_n_f32(v479); + int32_t *v973 = &v6[v586]; + int32_t *v982 = &v6[v595]; + int32_t *v991 = &v6[v604]; + int32_t *v1000 = &v6[v613]; + int32_t *v1009 = &v6[v622]; + int32_t *v1018 = &v6[v631]; + int32_t
*v1027 = &v6[v640]; + int32_t *v1036 = &v6[v649]; + int32_t *v1045 = &v6[v658]; + int32_t *v1054 = &v6[v667]; + int32_t *v1063 = &v6[v676]; + int32_t *v1072 = &v6[v685]; + int32_t *v1081 = &v6[v694]; + int32_t *v1090 = &v6[v703]; + int32_t *v1099 = &v6[v712]; + int32_t *v1108 = &v6[v721]; + int32_t *v1117 = &v6[v730]; + svfloat32_t v199 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v906[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v752[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v761[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v770[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v779[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v788[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v797[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v806[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v815[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v824[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v111 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full,
(const int16_t *)&v833[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v842[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v851[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v860[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v869[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v155 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v878[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v165 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v887[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v173 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v896[0])), + 1.F / (1ULL << 15ULL)); +/* Butterfly network: starts with the sum and difference of each input pair, then combines them with the broadcast coefficients. Statement order follows the generator's schedule; the additions use an all-true predicate (svptrue_b32) while the multiply-accumulate forms use pred_full. */ svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v47, v39); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v83, v75); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v119, v111); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); +
svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v155, v147); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v165, v173); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v165, v173); + svfloat32_t v176 = svsub_f32_x(svptrue_b32(), v30, v138); + svfloat32_t v177 = svsub_f32_x(svptrue_b32(), v48, v156); + svfloat32_t v178 = svsub_f32_x(svptrue_b32(), v66, v174); + svfloat32_t v179 = svsub_f32_x(svptrue_b32(), v84, v138); + svfloat32_t v180 = svsub_f32_x(svptrue_b32(), v102, v156); + svfloat32_t v181 = svsub_f32_x(svptrue_b32(), v120, v174); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v30, v84); + svfloat32_t v184 = svadd_f32_x(svptrue_b32(), v48, v102); + svfloat32_t v186 = svadd_f32_x(svptrue_b32(), v66, v120); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v31, v139); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v49, v157); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v67, v175); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v85, v139); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v103, v157); + svfloat32_t v222 = svsub_f32_x(svptrue_b32(), v121, v175); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v31, v85); + svfloat32_t v225 = svadd_f32_x(svptrue_b32(), v49, v103); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v67, v121); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v182, v138); + svfloat32_t v185 = svadd_f32_x(svptrue_b32(), v184, v156); + svfloat32_t v187 = svadd_f32_x(svptrue_b32(), v186, v174); + svfloat32_t v188 = svadd_f32_x(svptrue_b32(), v176, v178); + svfloat32_t v189 = svadd_f32_x(svptrue_b32(), v179, v181); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v176, v179); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v178, v181); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v223, v139); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v225, v157); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v227, v175); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v217, v219); +
svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v220, v222); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v217, v220); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v219, v222); + svfloat32_t zero404 = svdup_n_f32(0); + svfloat32_t v404 = svcmla_f32_x(pred_full, zero404, v936, v220, 90); + svfloat32_t zero425 = svdup_n_f32(0); + svfloat32_t v425 = svcmla_f32_x(pred_full, zero425, v939, v222, 90); + svfloat32_t v190 = svadd_f32_x(svptrue_b32(), v183, v185); + svfloat32_t v201 = svadd_f32_x(svptrue_b32(), v189, v180); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v188, v177); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v189, v180); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v188, v177); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v176, v208); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v207, v181); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v183, v187); + svfloat32_t v215 = svsub_f32_x(svptrue_b32(), v185, v187); + svfloat32_t v231 = svadd_f32_x(svptrue_b32(), v224, v226); + svfloat32_t v233 = svadd_f32_x(svptrue_b32(), v230, v221); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v229, v218); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v230, v221); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v229, v218); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v217, v240); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v239, v222); + svfloat32_t v246 = svsub_f32_x(svptrue_b32(), v224, v228); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v226, v228); + svfloat32_t v191 = svadd_f32_x(svptrue_b32(), v190, v187); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v202, v201); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v205, v204); + svfloat32_t v210 = svsub_f32_x(svptrue_b32(), v209, v180); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v211, v177); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v214, v215); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v231, v228); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v234, v233); +
svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v237, v236); + svfloat32_t v242 = svsub_f32_x(svptrue_b32(), v241, v221); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v243, v218); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v246, v247); + svfloat32_t v268 = svmul_f32_x(svptrue_b32(), v202, v912); + svfloat32_t v283 = svmul_f32_x(svptrue_b32(), v205, v915); + svfloat32_t zero362 = svdup_n_f32(0); + svfloat32_t v362 = svcmla_f32_x(pred_full, zero362, v930, v233, 90); + svfloat32_t zero383 = svdup_n_f32(0); + svfloat32_t v383 = svcmla_f32_x(pred_full, zero383, v933, v236, 90); + svfloat32_t zero467 = svdup_n_f32(0); + svfloat32_t v467 = svcmla_f32_x(pred_full, zero467, v945, v246, 90); + svfloat32_t zero474 = svdup_n_f32(0); + svfloat32_t v474 = svcmla_f32_x(pred_full, zero474, v946, v247, 90); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v199, v191); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v210, v212); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v242, v244); + svfloat32_t v273 = svmul_f32_x(svptrue_b32(), v203, v913); + svfloat32_t v288 = svmul_f32_x(svptrue_b32(), v206, v916); + svfloat32_t v348 = svmul_f32_x(svptrue_b32(), v216, v928); + svfloat32_t zero355 = svdup_n_f32(0); + svfloat32_t v355 = svcmla_f32_x(pred_full, zero355, v929, v232, 90); + svfloat32_t zero481 = svdup_n_f32(0); + svfloat32_t v481 = svcmla_f32_x(pred_full, zero481, v947, v248, 90); + svfloat32_t v482 = svmla_f32_x(pred_full, v268, v201, v911); + svfloat32_t v483 = svmla_f32_x(pred_full, v283, v204, v914); + svfloat32_t v513 = svcmla_f32_x(pred_full, v362, v931, v234, 90); + svfloat32_t v514 = svcmla_f32_x(pred_full, v383, v934, v237, 90); + svfloat32_t v333 = svmul_f32_x(svptrue_b32(), v213, v925); + svfloat32_t zero460 = svdup_n_f32(0); + svfloat32_t v460 = svcmla_f32_x(pred_full, zero460, v944, v245, 90); + svfloat32_t v485 = svadd_f32_x(svptrue_b32(), v482, v483); + svfloat32_t v486 = svmla_f32_x(pred_full, v273, v201, v911); + svfloat32_t v487 = svmla_f32_x(pred_full,
v288, v204, v914); + svfloat32_t v504 = svsub_f32_x(svptrue_b32(), v482, v483); + svfloat32_t v506 = svnmls_f32_x(pred_full, v348, v214, v926); + svfloat32_t v507 = svnmls_f32_x(pred_full, v348, v215, v927); + svfloat32_t v508 = svmla_f32_x(pred_full, v200, v191, v910); + svfloat32_t v516 = svadd_f32_x(svptrue_b32(), v513, v514); + svfloat32_t v517 = svcmla_f32_x(pred_full, v362, v932, v235, 90); + svfloat32_t v518 = svcmla_f32_x(pred_full, v383, v935, v238, 90); + svfloat32_t v535 = svsub_f32_x(svptrue_b32(), v513, v514); + svfloat32_t v537 = svsub_f32_x(svptrue_b32(), v467, v481); + svfloat32_t v538 = svsub_f32_x(svptrue_b32(), v474, v481); +/* Output conversion pattern (repeated for each of the 19 outputs): scale by 2^31, convert to 32-bit integers, then svtbl_s16 gathers halfwords 1 and 3 (index pattern 0x...00030001) - presumably the upper 16 bits of the two active 32-bit lanes on a little-endian target, i.e. the value scaled by 2^15; confirm. The 0xffff indices are outside the table (presumably yielding zero). */ svint16_t v569 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v200, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v484 = svmla_f32_x(pred_full, v333, v212, v924); + svfloat32_t v488 = svmla_f32_x(pred_full, v333, v210, v923); + svfloat32_t v489 = svnmls_f32_x(pred_full, v485, v179, v917); + svfloat32_t v490 = svadd_f32_x(svptrue_b32(), v486, v487); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v486, v487); + svfloat32_t v501 = svmla_f32_x(pred_full, v485, v178, v922); + svfloat32_t v509 = svadd_f32_x(svptrue_b32(), v508, v506); + svfloat32_t v510 = svsub_f32_x(svptrue_b32(), v508, v506); + svfloat32_t v512 = svadd_f32_x(svptrue_b32(), v508, v507); + svfloat32_t v515 = svcmla_f32_x(pred_full, v460, v943, v244, 90); + svfloat32_t v519 = svcmla_f32_x(pred_full, v460, v942, v242, 90); + svfloat32_t v520 = svsub_f32_x(svptrue_b32(), v404, v516); + svfloat32_t v521 = svadd_f32_x(svptrue_b32(), v517, v518); + svfloat32_t v527 = svsub_f32_x(svptrue_b32(), v517, v518); + svfloat32_t v532 = svcmla_f32_x(pred_full, v516, v941, v219, 90); + svfloat32_t v539 = svadd_f32_x(svptrue_b32(), v355, v537); + svfloat32_t v540 = svsub_f32_x(svptrue_b32(), v355, v537); + svfloat32_t v542 = svadd_f32_x(svptrue_b32(), 
v355, v538); +/* Store pattern (repeated for each output): svst1w_u64 writes the packed result word through the output pointer computed earlier; this one is output element 0. */ svst1w_u64(pred_full, (unsigned *)(v955), svreinterpret_u64_s16(v569)); + svfloat32_t v491 = svnmls_f32_x(pred_full, v488, v181, v920); + svfloat32_t v492 = svmla_f32_x(pred_full, v484, v207, v918); + svfloat32_t v494 = svmla_f32_x(pred_full, v490, v208, v921); + svfloat32_t v497 = svadd_f32_x(svptrue_b32(), v496, v484); + svfloat32_t v498 = svadd_f32_x(svptrue_b32(), v489, v490); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v504, v488); + svfloat32_t v511 = svsub_f32_x(svptrue_b32(), v510, v507); + svfloat32_t v522 = svsub_f32_x(svptrue_b32(), v425, v519); + svfloat32_t v523 = svcmla_f32_x(pred_full, v515, v937, v239, 90); + svfloat32_t v525 = svcmla_f32_x(pred_full, v521, v940, v240, 90); + svfloat32_t v528 = svadd_f32_x(svptrue_b32(), v527, v515); + svfloat32_t v529 = svadd_f32_x(svptrue_b32(), v520, v521); + svfloat32_t v536 = svadd_f32_x(svptrue_b32(), v535, v519); + svfloat32_t v541 = svsub_f32_x(svptrue_b32(), v540, v538); + svfloat32_t v493 = svadd_f32_x(svptrue_b32(), v492, v489); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v494, v491); + svfloat32_t v499 = svmla_f32_x(pred_full, v498, v176, v919); + svfloat32_t v502 = svadd_f32_x(svptrue_b32(), v501, v491); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v523, v520); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v525, v522); + svfloat32_t v530 = svcmla_f32_x(pred_full, v529, v938, v217, 90); + svfloat32_t v533 = svadd_f32_x(svptrue_b32(), v532, v522); + svfloat32_t v547 = svsub_f32_x(svptrue_b32(), v505, v497); + svfloat32_t v551 = svsub_f32_x(svptrue_b32(), v512, v505); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v497, v512); + svfloat32_t v559 = svsub_f32_x(svptrue_b32(), v536, v528); + svfloat32_t v563 = svsub_f32_x(svptrue_b32(), v536, v542); + svfloat32_t v566 = svadd_f32_x(svptrue_b32(), v528, v542); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v499, v488); + svfloat32_t v503 = svadd_f32_x(svptrue_b32(), v502, v484); + svfloat32_t v531 = svadd_f32_x(svptrue_b32(),
v530, v519); + svfloat32_t v534 = svadd_f32_x(svptrue_b32(), v533, v515); + svfloat32_t v548 = svadd_f32_x(svptrue_b32(), v547, v512); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v493, v509); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v495, v511); + svfloat32_t v560 = svadd_f32_x(svptrue_b32(), v559, v542); + svfloat32_t v564 = svadd_f32_x(svptrue_b32(), v524, v539); + svfloat32_t v565 = svadd_f32_x(svptrue_b32(), v526, v541); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v554, v566); + svfloat32_t v611 = svadd_f32_x(svptrue_b32(), v551, v563); + svfloat32_t v620 = svsub_f32_x(svptrue_b32(), v551, v563); + svfloat32_t v543 = svsub_f32_x(svptrue_b32(), v500, v493); + svfloat32_t v545 = svsub_f32_x(svptrue_b32(), v503, v495); + svfloat32_t v549 = svsub_f32_x(svptrue_b32(), v509, v500); + svfloat32_t v550 = svsub_f32_x(svptrue_b32(), v511, v503); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v531, v524); + svfloat32_t v557 = svsub_f32_x(svptrue_b32(), v534, v526); + svfloat32_t v561 = svsub_f32_x(svptrue_b32(), v539, v531); + svfloat32_t v562 = svsub_f32_x(svptrue_b32(), v541, v534); + svint16_t v596 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v593, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v605 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v602, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v614 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v611, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v623 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v620, (float)(1ULL <<
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v629 = svadd_f32_x(svptrue_b32(), v553, v565); + svfloat32_t v638 = svsub_f32_x(svptrue_b32(), v553, v565); + svfloat32_t v647 = svadd_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v656 = svsub_f32_x(svptrue_b32(), v548, v560); + svfloat32_t v701 = svsub_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v710 = svadd_f32_x(svptrue_b32(), v552, v564); + svfloat32_t v544 = svadd_f32_x(svptrue_b32(), v543, v509); + svfloat32_t v546 = svadd_f32_x(svptrue_b32(), v545, v511); + svfloat32_t v556 = svadd_f32_x(svptrue_b32(), v555, v539); + svfloat32_t v558 = svadd_f32_x(svptrue_b32(), v557, v541); + svint16_t v632 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v629, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v641 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v638, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v650 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v647, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v659 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v656, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v665 = svadd_f32_x(svptrue_b32(), v550, v562); + svfloat32_t v674 = svsub_f32_x(svptrue_b32(), v550, v562); + svfloat32_t v683 = svadd_f32_x(svptrue_b32(), v549, v561); + svfloat32_t v692 = svsub_f32_x(svptrue_b32(), v549, v561); + svint16_t v704 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v701,
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v713 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v710, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v982), svreinterpret_u64_s16(v596)); + svst1w_u64(pred_full, (unsigned *)(v991), svreinterpret_u64_s16(v605)); + svst1w_u64(pred_full, (unsigned *)(v1000), svreinterpret_u64_s16(v614)); + svst1w_u64(pred_full, (unsigned *)(v1009), svreinterpret_u64_s16(v623)); + svfloat32_t v575 = svadd_f32_x(svptrue_b32(), v544, v556); + svfloat32_t v584 = svsub_f32_x(svptrue_b32(), v544, v556); + svint16_t v668 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v665, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v677 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v674, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v686 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v683, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v695 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v692, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v719 = svadd_f32_x(svptrue_b32(), v546, v558); + svfloat32_t v728 = svsub_f32_x(svptrue_b32(), v546, v558); + svst1w_u64(pred_full, (unsigned *)(v1018), svreinterpret_u64_s16(v632)); + svst1w_u64(pred_full, (unsigned *)(v1027), svreinterpret_u64_s16(v641)); + svst1w_u64(pred_full, (unsigned
*)(v1036), svreinterpret_u64_s16(v650)); + svst1w_u64(pred_full, (unsigned *)(v1045), svreinterpret_u64_s16(v659)); + svst1w_u64(pred_full, (unsigned *)(v1090), svreinterpret_u64_s16(v704)); + svst1w_u64(pred_full, (unsigned *)(v1099), svreinterpret_u64_s16(v713)); + svint16_t v578 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v575, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v587 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v584, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v722 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v719, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v731 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v728, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1054), svreinterpret_u64_s16(v668)); + svst1w_u64(pred_full, (unsigned *)(v1063), svreinterpret_u64_s16(v677)); + svst1w_u64(pred_full, (unsigned *)(v1072), svreinterpret_u64_s16(v686)); + svst1w_u64(pred_full, (unsigned *)(v1081), svreinterpret_u64_s16(v695)); + svst1w_u64(pred_full, (unsigned *)(v964), svreinterpret_u64_s16(v578)); + svst1w_u64(pred_full, (unsigned *)(v973), svreinterpret_u64_s16(v587)); + svst1w_u64(pred_full, (unsigned *)(v1108), svreinterpret_u64_s16(v722)); + svst1w_u64(pred_full, (unsigned *)(v1117), svreinterpret_u64_s16(v731)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun20( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { +
float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v279 = 1.5388417685876268e+00F; + float v286 = 5.8778525229247325e-01F; + float v293 = 3.6327126400268028e-01F; + float v317 = 1.0000000000000000e+00F; + float v318 = -1.0000000000000000e+00F; + float v324 = -1.2500000000000000e+00F; + float v325 = 1.2500000000000000e+00F; + float v331 = 5.5901699437494745e-01F; + float v332 = -5.5901699437494745e-01F; + float v339 = -1.5388417685876268e+00F; + float v343 = -5.8778525229247325e-01F; + float v347 = -3.6327126400268028e-01F; + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v147 = vld1s_s16(&v5[istride]); + float32x2_t v273 = (float32x2_t){v324, v324}; + float32x2_t v277 = (float32x2_t){v331, v331}; + float32x2_t v281 = (float32x2_t){v279, v339}; + float32x2_t v288 = (float32x2_t){v286, v343}; + float32x2_t v295 = (float32x2_t){v293, v347}; + float32x2_t v319 = (float32x2_t){v317, v318}; + float32x2_t v326 = (float32x2_t){v324, v325}; + float32x2_t v333 = (float32x2_t){v331, v332}; + float32x2_t v334 = (float32x2_t){v4, v4}; + float32x2_t v340 = (float32x2_t){v339, v339}; + float32x2_t v344 = (float32x2_t){v343, v343}; + float32x2_t v348 = (float32x2_t){v347, v347}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 10]); + int16x4_t v27 = vld1s_s16(&v5[istride * 5]); + int16x4_t v33 = vld1s_s16(&v5[istride * 15]); + int16x4_t v43 = vld1s_s16(&v5[istride * 4]); + int16x4_t v49 = vld1s_s16(&v5[istride * 14]); + int16x4_t v57 = vld1s_s16(&v5[istride * 9]); + int16x4_t v63 = vld1s_s16(&v5[istride * 19]); + int16x4_t v73 = vld1s_s16(&v5[istride * 8]); + int16x4_t v79 = vld1s_s16(&v5[istride * 18]); + int16x4_t v87 = vld1s_s16(&v5[istride * 13]); + int16x4_t v93 = vld1s_s16(&v5[istride * 3]); + int16x4_t v103 = vld1s_s16(&v5[istride * 12]); + int16x4_t v109 = vld1s_s16(&v5[istride * 2]); + int16x4_t v117 = vld1s_s16(&v5[istride * 17]); + int16x4_t v123 = 
vld1s_s16(&v5[istride * 7]); + int16x4_t v133 = vld1s_s16(&v5[istride * 16]); + int16x4_t v139 = vld1s_s16(&v5[istride * 6]); + float32x2_t v148 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v147)), 15); + int16x4_t v153 = vld1s_s16(&v5[istride * 11]); + float32x2_t v283 = vmul_f32(v334, v281); + float32x2_t v290 = vmul_f32(v334, v288); + float32x2_t v297 = vmul_f32(v334, v295); + float32x2_t v321 = vmul_f32(v334, v319); + float32x2_t v328 = vmul_f32(v334, v326); + float32x2_t v335 = vmul_f32(v334, v333); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v44 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v43)), 15); + float32x2_t v50 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v49)), 15); + float32x2_t v58 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v57)), 15); + float32x2_t v64 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v63)), 15); + float32x2_t v74 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v73)), 15); + float32x2_t v80 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v79)), 15); + float32x2_t v88 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v87)), 15); + float32x2_t v94 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v93)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v110 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v109)), 15); + float32x2_t v118 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v117)), 15); + float32x2_t v124 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v123)), 15); + float32x2_t v134 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v133)), 15); + float32x2_t v140 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v154 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v153)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v51 = vadd_f32(v44, v50); + float32x2_t v52 = 
vsub_f32(v44, v50); + float32x2_t v65 = vadd_f32(v58, v64); + float32x2_t v66 = vsub_f32(v58, v64); + float32x2_t v81 = vadd_f32(v74, v80); + float32x2_t v82 = vsub_f32(v74, v80); + float32x2_t v95 = vadd_f32(v88, v94); + float32x2_t v96 = vsub_f32(v88, v94); + float32x2_t v111 = vadd_f32(v104, v110); + float32x2_t v112 = vsub_f32(v104, v110); + float32x2_t v125 = vadd_f32(v118, v124); + float32x2_t v126 = vsub_f32(v118, v124); + float32x2_t v141 = vadd_f32(v134, v140); + float32x2_t v142 = vsub_f32(v134, v140); + float32x2_t v155 = vadd_f32(v148, v154); + float32x2_t v156 = vsub_f32(v148, v154); + float32x2_t v37 = vadd_f32(v21, v35); + float32x2_t v38 = vsub_f32(v21, v35); + float32x2_t v67 = vadd_f32(v51, v65); + float32x2_t v68 = vsub_f32(v51, v65); + float32x2_t v97 = vadd_f32(v81, v95); + float32x2_t v98 = vsub_f32(v81, v95); + float32x2_t v127 = vadd_f32(v111, v125); + float32x2_t v128 = vsub_f32(v111, v125); + float32x2_t v157 = vadd_f32(v141, v155); + float32x2_t v158 = vsub_f32(v141, v155); + float32x2_t v259 = vadd_f32(v52, v142); + float32x2_t v260 = vsub_f32(v52, v142); + float32x2_t v261 = vadd_f32(v112, v82); + float32x2_t v262 = vsub_f32(v112, v82); + float32x2_t v309 = vadd_f32(v66, v156); + float32x2_t v310 = vsub_f32(v66, v156); + float32x2_t v311 = vadd_f32(v126, v96); + float32x2_t v312 = vsub_f32(v126, v96); + float32x2_t v159 = vadd_f32(v67, v157); + float32x2_t v160 = vsub_f32(v67, v157); + float32x2_t v161 = vadd_f32(v127, v97); + float32x2_t v162 = vsub_f32(v127, v97); + float32x2_t v209 = vadd_f32(v68, v158); + float32x2_t v210 = vsub_f32(v68, v158); + float32x2_t v211 = vadd_f32(v128, v98); + float32x2_t v212 = vsub_f32(v128, v98); + float32x2_t v263 = vadd_f32(v259, v261); + float32x2_t v264 = vsub_f32(v259, v261); + float32x2_t v265 = vadd_f32(v260, v262); + float32x2_t v284 = vrev64_f32(v260); + float32x2_t v298 = vrev64_f32(v262); + float32x2_t v313 = vadd_f32(v309, v311); + float32x2_t v314 = vsub_f32(v309, v311); + float32x2_t v315 
= vadd_f32(v310, v312); + float32x2_t v341 = vmul_f32(v310, v340); + float32x2_t v349 = vmul_f32(v312, v348); + float32x2_t v163 = vadd_f32(v159, v161); + float32x2_t v164 = vsub_f32(v159, v161); + float32x2_t v165 = vadd_f32(v160, v162); + float32x2_t v184 = vrev64_f32(v160); + float32x2_t v198 = vrev64_f32(v162); + float32x2_t v213 = vadd_f32(v209, v211); + float32x2_t v214 = vsub_f32(v209, v211); + float32x2_t v215 = vadd_f32(v210, v212); + float32x2_t v234 = vrev64_f32(v210); + float32x2_t v248 = vrev64_f32(v212); + float32x2_t v266 = vadd_f32(v263, v22); + float32x2_t v274 = vmul_f32(v263, v273); + float32x2_t v278 = vmul_f32(v264, v277); + float32x2_t v285 = vmul_f32(v284, v283); + float32x2_t v291 = vrev64_f32(v265); + float32x2_t v299 = vmul_f32(v298, v297); + float32x2_t v316 = vadd_f32(v313, v36); + float32x2_t v329 = vrev64_f32(v313); + float32x2_t v336 = vrev64_f32(v314); + float32x2_t v345 = vmul_f32(v315, v344); + float32x2_t v166 = vadd_f32(v163, v37); + float32x2_t v174 = vmul_f32(v163, v273); + float32x2_t v178 = vmul_f32(v164, v277); + float32x2_t v185 = vmul_f32(v184, v283); + float32x2_t v191 = vrev64_f32(v165); + float32x2_t v199 = vmul_f32(v198, v297); + float32x2_t v216 = vadd_f32(v213, v38); + float32x2_t v224 = vmul_f32(v213, v273); + float32x2_t v228 = vmul_f32(v214, v277); + float32x2_t v235 = vmul_f32(v234, v283); + float32x2_t v241 = vrev64_f32(v215); + float32x2_t v249 = vmul_f32(v248, v297); + float32x2_t v292 = vmul_f32(v291, v290); + float32x2_t v300 = vadd_f32(v266, v274); + float32x2_t v322 = vrev64_f32(v316); + float32x2_t v330 = vmul_f32(v329, v328); + float32x2_t v337 = vmul_f32(v336, v335); + float32x2_t v353 = vsub_f32(v341, v345); + float32x2_t v354 = vadd_f32(v345, v349); + float32x2_t v192 = vmul_f32(v191, v290); + float32x2_t v200 = vadd_f32(v166, v174); + float32x2_t v242 = vmul_f32(v241, v290); + float32x2_t v250 = vadd_f32(v216, v224); + float32x2_t v301 = vadd_f32(v300, v278); + float32x2_t v302 = vsub_f32(v300, 
v278); + float32x2_t v303 = vsub_f32(v285, v292); + float32x2_t v304 = vadd_f32(v292, v299); + float32x2_t v323 = vmul_f32(v322, v321); + int16x4_t v363 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v166, 15), (int32x2_t){0, 0})); + int16x4_t v375 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v216, 15), (int32x2_t){0, 0})); + float32x2_t v201 = vadd_f32(v200, v178); + float32x2_t v202 = vsub_f32(v200, v178); + float32x2_t v203 = vsub_f32(v185, v192); + float32x2_t v204 = vadd_f32(v192, v199); + float32x2_t v251 = vadd_f32(v250, v228); + float32x2_t v252 = vsub_f32(v250, v228); + float32x2_t v253 = vsub_f32(v235, v242); + float32x2_t v254 = vadd_f32(v242, v249); + float32x2_t v305 = vadd_f32(v301, v303); + float32x2_t v306 = vsub_f32(v301, v303); + float32x2_t v307 = vadd_f32(v302, v304); + float32x2_t v308 = vsub_f32(v302, v304); + float32x2_t v350 = vadd_f32(v323, v330); + float32x2_t v359 = vadd_f32(v266, v323); + float32x2_t v360 = vsub_f32(v266, v323); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v363), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v375), 0); + float32x2_t v205 = vadd_f32(v201, v203); + float32x2_t v206 = vsub_f32(v201, v203); + float32x2_t v207 = vadd_f32(v202, v204); + float32x2_t v208 = vsub_f32(v202, v204); + float32x2_t v255 = vadd_f32(v251, v253); + float32x2_t v256 = vsub_f32(v251, v253); + float32x2_t v257 = vadd_f32(v252, v254); + float32x2_t v258 = vsub_f32(v252, v254); + float32x2_t v351 = vadd_f32(v350, v337); + float32x2_t v352 = vsub_f32(v350, v337); + int16x4_t v369 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v360, 15), (int32x2_t){0, 0})); + int16x4_t v381 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v359, 15), (int32x2_t){0, 0})); + float32x2_t v355 = vadd_f32(v351, v353); + float32x2_t v356 = vsub_f32(v351, v353); + float32x2_t v357 = vadd_f32(v352, v354); + float32x2_t v358 = vsub_f32(v352, v354); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v369), 0); + v6[ostride * 15] = 
vget_lane_s32(vreinterpret_s32_s16(v381), 0); + int16x4_t v389 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v206, 15), (int32x2_t){0, 0})); + int16x4_t v401 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v256, 15), (int32x2_t){0, 0})); + int16x4_t v415 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v208, 15), (int32x2_t){0, 0})); + int16x4_t v427 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v258, 15), (int32x2_t){0, 0})); + int16x4_t v441 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v207, 15), (int32x2_t){0, 0})); + int16x4_t v453 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v257, 15), (int32x2_t){0, 0})); + int16x4_t v467 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v205, 15), (int32x2_t){0, 0})); + int16x4_t v479 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v255, 15), (int32x2_t){0, 0})); + float32x2_t v385 = vadd_f32(v306, v356); + float32x2_t v386 = vsub_f32(v306, v356); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v389), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v401), 0); + float32x2_t v411 = vadd_f32(v308, v358); + float32x2_t v412 = vsub_f32(v308, v358); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v415), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v427), 0); + float32x2_t v437 = vadd_f32(v307, v357); + float32x2_t v438 = vsub_f32(v307, v357); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v441), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v453), 0); + float32x2_t v463 = vadd_f32(v305, v355); + float32x2_t v464 = vsub_f32(v305, v355); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v467), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v479), 0); + int16x4_t v395 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v386, 15), (int32x2_t){0, 0})); + int16x4_t v407 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v385, 15), (int32x2_t){0, 0})); + int16x4_t v421 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v412, 15), (int32x2_t){0, 0})); + int16x4_t v433 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v411, 
15), (int32x2_t){0, 0})); + int16x4_t v447 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v438, 15), (int32x2_t){0, 0})); + int16x4_t v459 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v437, 15), (int32x2_t){0, 0})); + int16x4_t v473 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v464, 15), (int32x2_t){0, 0})); + int16x4_t v485 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v463, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v395), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v407), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v421), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v433), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v447), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v459), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v473), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v485), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun20( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v324 = -1.2500000000000000e+00F; + float v329 = 5.5901699437494745e-01F; + float v372 = -1.0000000000000000e+00F; + float v379 = 1.2500000000000000e+00F; + float v386 = -5.5901699437494745e-01F; + float v393 = -1.5388417685876268e+00F; + float v398 = -5.8778525229247325e-01F; + float v403 = -3.6327126400268028e-01F; + const int32_t *v755 = &v5[v0]; + int32_t *v843 = &v6[v2]; + int64_t v23 = v0 * 10; + int64_t v33 = v0 * 5; + int64_t v41 = v0 * 15; + int64_t v53 = v0 * 4; + int64_t v61 = v0 * 14; + int64_t v71 = v0 * 9; + int64_t v79 = v0 * 19; + int64_t v91 = v0 * 8; + int64_t v99 = v0 * 18; + int64_t v109 = v0 * 13; + int64_t v117 = v0 * 3; + int64_t v129 = v0 * 
12; + int64_t v137 = v0 * 2; + int64_t v147 = v0 * 17; + int64_t v155 = v0 * 7; + int64_t v167 = v0 * 16; + int64_t v175 = v0 * 6; + int64_t v193 = v0 * 11; + float v337 = v4 * v393; + float v344 = v4 * v398; + float v351 = v4 * v403; + float v375 = v4 * v372; + float v382 = v4 * v379; + float v389 = v4 * v386; + int64_t v427 = v2 * 5; + int64_t v435 = v2 * 10; + int64_t v443 = v2 * 15; + int64_t v453 = v2 * 16; + int64_t v469 = v2 * 6; + int64_t v477 = v2 * 11; + int64_t v487 = v2 * 12; + int64_t v495 = v2 * 17; + int64_t v503 = v2 * 2; + int64_t v511 = v2 * 7; + int64_t v521 = v2 * 8; + int64_t v529 = v2 * 13; + int64_t v537 = v2 * 18; + int64_t v545 = v2 * 3; + int64_t v555 = v2 * 4; + int64_t v563 = v2 * 9; + int64_t v571 = v2 * 14; + int64_t v579 = v2 * 19; + const int32_t *v593 = &v5[0]; + svfloat32_t v780 = svdup_n_f32(v324); + svfloat32_t v781 = svdup_n_f32(v329); + svfloat32_t v788 = svdup_n_f32(v393); + svfloat32_t v789 = svdup_n_f32(v398); + svfloat32_t v790 = svdup_n_f32(v403); + int32_t *v798 = &v6[0]; + svfloat32_t v191 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v755[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v602 = &v5[v23]; + const int32_t *v611 = &v5[v33]; + const int32_t *v620 = &v5[v41]; + const int32_t *v629 = &v5[v53]; + const int32_t *v638 = &v5[v61]; + const int32_t *v647 = &v5[v71]; + const int32_t *v656 = &v5[v79]; + const int32_t *v665 = &v5[v91]; + const int32_t *v674 = &v5[v99]; + const int32_t *v683 = &v5[v109]; + const int32_t *v692 = &v5[v117]; + const int32_t *v701 = &v5[v129]; + const int32_t *v710 = &v5[v137]; + const int32_t *v719 = &v5[v147]; + const int32_t *v728 = &v5[v155]; + const int32_t *v737 = &v5[v167]; + const int32_t *v746 = &v5[v175]; + const int32_t *v764 = &v5[v193]; + svfloat32_t v782 = svdup_n_f32(v337); + svfloat32_t v783 = svdup_n_f32(v344); + svfloat32_t v784 = svdup_n_f32(v351); + svfloat32_t v785 = svdup_n_f32(v375); + svfloat32_t v786 = 
svdup_n_f32(v382); + svfloat32_t v787 = svdup_n_f32(v389); + int32_t *v807 = &v6[v427]; + int32_t *v816 = &v6[v435]; + int32_t *v825 = &v6[v443]; + int32_t *v834 = &v6[v453]; + int32_t *v852 = &v6[v469]; + int32_t *v861 = &v6[v477]; + int32_t *v870 = &v6[v487]; + int32_t *v879 = &v6[v495]; + int32_t *v888 = &v6[v503]; + int32_t *v897 = &v6[v511]; + int32_t *v906 = &v6[v521]; + int32_t *v915 = &v6[v529]; + int32_t *v924 = &v6[v537]; + int32_t *v933 = &v6[v545]; + int32_t *v942 = &v6[v555]; + int32_t *v951 = &v6[v563]; + int32_t *v960 = &v6[v571]; + int32_t *v969 = &v6[v579]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v593[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v602[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v611[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v620[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v59 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v629[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v67 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v638[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v77 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v647[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v85 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v656[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v97 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v665[0])), + 1.F / (1ULL << 15ULL)); + 
svfloat32_t v105 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v674[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v115 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v683[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v123 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v692[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v135 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v701[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v143 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v710[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v153 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v719[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v161 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v728[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v173 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v737[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v181 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v746[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v199 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v764[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v68 = svadd_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v69 = svsub_f32_x(svptrue_b32(), v59, v67); + svfloat32_t v86 = svadd_f32_x(svptrue_b32(), v77, v85); + svfloat32_t 
v87 = svsub_f32_x(svptrue_b32(), v77, v85); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v97, v105); + svfloat32_t v124 = svadd_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v125 = svsub_f32_x(svptrue_b32(), v115, v123); + svfloat32_t v144 = svadd_f32_x(svptrue_b32(), v135, v143); + svfloat32_t v145 = svsub_f32_x(svptrue_b32(), v135, v143); + svfloat32_t v162 = svadd_f32_x(svptrue_b32(), v153, v161); + svfloat32_t v163 = svsub_f32_x(svptrue_b32(), v153, v161); + svfloat32_t v182 = svadd_f32_x(svptrue_b32(), v173, v181); + svfloat32_t v183 = svsub_f32_x(svptrue_b32(), v173, v181); + svfloat32_t v200 = svadd_f32_x(svptrue_b32(), v191, v199); + svfloat32_t v201 = svsub_f32_x(svptrue_b32(), v191, v199); + svfloat32_t v50 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v51 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v88 = svadd_f32_x(svptrue_b32(), v68, v86); + svfloat32_t v89 = svsub_f32_x(svptrue_b32(), v68, v86); + svfloat32_t v126 = svadd_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v127 = svsub_f32_x(svptrue_b32(), v106, v124); + svfloat32_t v164 = svadd_f32_x(svptrue_b32(), v144, v162); + svfloat32_t v165 = svsub_f32_x(svptrue_b32(), v144, v162); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v182, v200); + svfloat32_t v203 = svsub_f32_x(svptrue_b32(), v182, v200); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v69, v183); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v69, v183); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v145, v107); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v145, v107); + svfloat32_t v363 = svadd_f32_x(svptrue_b32(), v87, v201); + svfloat32_t v364 = svsub_f32_x(svptrue_b32(), v87, v201); + svfloat32_t v365 = svadd_f32_x(svptrue_b32(), v163, v125); + svfloat32_t v366 = svsub_f32_x(svptrue_b32(), v163, v125); + svfloat32_t v204 = svadd_f32_x(svptrue_b32(), v88, v202); + svfloat32_t v205 = svsub_f32_x(svptrue_b32(), v88, v202); + svfloat32_t v206 = 
svadd_f32_x(svptrue_b32(), v164, v126); + svfloat32_t v207 = svsub_f32_x(svptrue_b32(), v164, v126); + svfloat32_t v257 = svadd_f32_x(svptrue_b32(), v89, v203); + svfloat32_t v258 = svsub_f32_x(svptrue_b32(), v89, v203); + svfloat32_t v259 = svadd_f32_x(svptrue_b32(), v165, v127); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v165, v127); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v311, v313); + svfloat32_t zero339 = svdup_n_f32(0); + svfloat32_t v339 = svcmla_f32_x(pred_full, zero339, v782, v311, 90); + svfloat32_t v367 = svadd_f32_x(svptrue_b32(), v363, v365); + svfloat32_t v368 = svsub_f32_x(svptrue_b32(), v363, v365); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v364, v366); + svfloat32_t v406 = svmul_f32_x(svptrue_b32(), v366, v790); + svfloat32_t v208 = svadd_f32_x(svptrue_b32(), v204, v206); + svfloat32_t v209 = svsub_f32_x(svptrue_b32(), v204, v206); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v205, v207); + svfloat32_t zero233 = svdup_n_f32(0); + svfloat32_t v233 = svcmla_f32_x(pred_full, zero233, v782, v205, 90); + svfloat32_t v261 = svadd_f32_x(svptrue_b32(), v257, v259); + svfloat32_t v262 = svsub_f32_x(svptrue_b32(), v257, v259); + svfloat32_t v263 = svadd_f32_x(svptrue_b32(), v258, v260); + svfloat32_t zero286 = svdup_n_f32(0); + svfloat32_t v286 = svcmla_f32_x(pred_full, zero286, v782, v258, 90); + svfloat32_t v317 = svadd_f32_x(svptrue_b32(), v314, v31); + svfloat32_t zero346 = svdup_n_f32(0); + svfloat32_t v346 = svcmla_f32_x(pred_full, zero346, v783, v316, 90); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v367, v49); + svfloat32_t zero391 = svdup_n_f32(0); + svfloat32_t v391 = svcmla_f32_x(pred_full, zero391, v787, v368, 90); + svfloat32_t v401 = svmul_f32_x(svptrue_b32(), v369, v789); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v208, v50); + svfloat32_t zero240 = svdup_n_f32(0); + svfloat32_t v240 = 
svcmla_f32_x(pred_full, zero240, v783, v210, 90); + svfloat32_t v264 = svadd_f32_x(svptrue_b32(), v261, v51); + svfloat32_t zero293 = svdup_n_f32(0); + svfloat32_t v293 = svcmla_f32_x(pred_full, zero293, v783, v263, 90); + svfloat32_t v354 = svmla_f32_x(pred_full, v317, v314, v780); + svfloat32_t v357 = svsub_f32_x(svptrue_b32(), v339, v346); + svfloat32_t v358 = svcmla_f32_x(pred_full, v346, v784, v313, 90); + svfloat32_t zero377 = svdup_n_f32(0); + svfloat32_t v377 = svcmla_f32_x(pred_full, zero377, v785, v370, 90); + svfloat32_t v410 = svnmls_f32_x(pred_full, v401, v364, v788); + svfloat32_t v411 = svmla_f32_x(pred_full, v406, v369, v789); + svfloat32_t v248 = svmla_f32_x(pred_full, v211, v208, v780); + svfloat32_t v251 = svsub_f32_x(svptrue_b32(), v233, v240); + svfloat32_t v252 = svcmla_f32_x(pred_full, v240, v784, v207, 90); + svfloat32_t v301 = svmla_f32_x(pred_full, v264, v261, v780); + svfloat32_t v304 = svsub_f32_x(svptrue_b32(), v286, v293); + svfloat32_t v305 = svcmla_f32_x(pred_full, v293, v784, v260, 90); + svfloat32_t v355 = svmla_f32_x(pred_full, v354, v315, v781); + svfloat32_t v356 = svmls_f32_x(pred_full, v354, v315, v781); + svfloat32_t v407 = svcmla_f32_x(pred_full, v377, v786, v367, 90); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v317, v377); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v317, v377); + svint16_t v420 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v211, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v436 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v264, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v249 = svmla_f32_x(pred_full, v248, v209, v781); + svfloat32_t v250 = svmls_f32_x(pred_full, v248, v209, v781); + svfloat32_t v302 = svmla_f32_x(pred_full, v301, v262, v781); 
+ svfloat32_t v303 = svmls_f32_x(pred_full, v301, v262, v781); + svfloat32_t v359 = svadd_f32_x(svptrue_b32(), v355, v357); + svfloat32_t v360 = svsub_f32_x(svptrue_b32(), v355, v357); + svfloat32_t v361 = svadd_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v362 = svsub_f32_x(svptrue_b32(), v356, v358); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v391); + svfloat32_t v409 = svsub_f32_x(svptrue_b32(), v407, v391); + svint16_t v428 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v417, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v444 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v416, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v798), svreinterpret_u64_s16(v420)); + svst1w_u64(pred_full, (unsigned *)(v816), svreinterpret_u64_s16(v436)); + svfloat32_t v253 = svadd_f32_x(svptrue_b32(), v249, v251); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v249, v251); + svfloat32_t v255 = svadd_f32_x(svptrue_b32(), v250, v252); + svfloat32_t v256 = svsub_f32_x(svptrue_b32(), v250, v252); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v302, v304); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v302, v304); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v303, v305); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v408, v410); + svfloat32_t v413 = svsub_f32_x(svptrue_b32(), v408, v410); + svfloat32_t v414 = svadd_f32_x(svptrue_b32(), v409, v411); + svfloat32_t v415 = svsub_f32_x(svptrue_b32(), v409, v411); + svst1w_u64(pred_full, (unsigned *)(v807), svreinterpret_u64_s16(v428)); + svst1w_u64(pred_full, (unsigned *)(v825), svreinterpret_u64_s16(v444)); + svfloat32_t v450 = svadd_f32_x(svptrue_b32(), v360, v413); + svfloat32_t v451 = 
svsub_f32_x(svptrue_b32(), v360, v413); + svint16_t v454 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v254, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v470 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v307, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v484 = svadd_f32_x(svptrue_b32(), v362, v415); + svfloat32_t v485 = svsub_f32_x(svptrue_b32(), v362, v415); + svint16_t v488 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v256, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v504 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v309, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v518 = svadd_f32_x(svptrue_b32(), v361, v414); + svfloat32_t v519 = svsub_f32_x(svptrue_b32(), v361, v414); + svint16_t v522 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v255, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v538 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v308, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v552 = svadd_f32_x(svptrue_b32(), v359, v412); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v359, v412); + svint16_t v556 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v253, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 
0x0000000000040004ULL))); + svint16_t v572 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v306, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v462 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v451, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v478 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v450, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v496 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v485, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v512 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v484, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v530 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v519, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v546 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v518, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v564 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v553, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v580 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v552, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v834), svreinterpret_u64_s16(v454)); + svst1w_u64(pred_full, (unsigned *)(v852), svreinterpret_u64_s16(v470)); + svst1w_u64(pred_full, (unsigned *)(v870), svreinterpret_u64_s16(v488)); + svst1w_u64(pred_full, (unsigned *)(v888), svreinterpret_u64_s16(v504)); + svst1w_u64(pred_full, (unsigned *)(v906), svreinterpret_u64_s16(v522)); + svst1w_u64(pred_full, (unsigned *)(v924), svreinterpret_u64_s16(v538)); + svst1w_u64(pred_full, (unsigned *)(v942), svreinterpret_u64_s16(v556)); + svst1w_u64(pred_full, (unsigned *)(v960), svreinterpret_u64_s16(v572)); + svst1w_u64(pred_full, (unsigned *)(v843), svreinterpret_u64_s16(v462)); + svst1w_u64(pred_full, (unsigned *)(v861), svreinterpret_u64_s16(v478)); + svst1w_u64(pred_full, (unsigned *)(v879), svreinterpret_u64_s16(v496)); + svst1w_u64(pred_full, (unsigned *)(v897), svreinterpret_u64_s16(v512)); + svst1w_u64(pred_full, (unsigned *)(v915), svreinterpret_u64_s16(v530)); + svst1w_u64(pred_full, (unsigned *)(v933), svreinterpret_u64_s16(v546)); + svst1w_u64(pred_full, (unsigned *)(v951), svreinterpret_u64_s16(v564)); + svst1w_u64(pred_full, (unsigned *)(v969), svreinterpret_u64_s16(v580)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Neon variant of the uun21 kernel, built only when ARMRAL_ARCH_SVE is not defined. Reads 21 complex int16 samples from x at element offsets k * istride (k = 0..20), converts them to float, combines them through a fixed add/sub/multiply network, and writes 21 complex int16 results to y at element offsets k * ostride (k = 0..20). Presumably this is a 21-point DFT butterfly (name suffix and the 21 inputs/outputs suggest so) - confirm against the generator. x and y are accessed through int32_t pointers, so one stride step is one 32-bit complex sample. howmany is not referenced in this variant. dir scales the (c, -c) constant pairs; presumably +1/-1 selecting transform direction - confirm with callers. */ +void armral_fft_cs16_cf32_cs16_ac_n_uun21( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; /* Scalar constants follow. Those declared as a +c / -c couple are packed into one two-lane vector as (c, -c), multiplied by dir, and later applied to vrev64 (lane-swapped) operands; the others are broadcast to both lanes and used as plain scale factors. */ + float v178 = -1.1666666666666665e+00F; + float v182 = 7.9015646852540022e-01F; + float v186 = 5.5854267289647742e-02F; + float v190 = 7.3430220123575241e-01F; + float v193 = 4.4095855184409838e-01F; + float v194 = -4.4095855184409838e-01F; + float v200 = 3.4087293062393137e-01F; + float v201 = -3.4087293062393137e-01F; + float v207 = -5.3396936033772524e-01F; + float v208 = 5.3396936033772524e-01F; + float
v214 = 8.7484229096165667e-01F; + float v215 = -8.7484229096165667e-01F; + float v258 = -1.4999999999999998e+00F; + float v262 = 1.7499999999999996e+00F; + float v266 = -1.1852347027881001e+00F; + float v270 = -8.3781400934471603e-02F; + float v274 = -1.1014533018536286e+00F; + float v277 = -6.6143782776614746e-01F; + float v278 = 6.6143782776614746e-01F; + float v284 = -5.1130939593589697e-01F; + float v285 = 5.1130939593589697e-01F; + float v291 = 8.0095404050658769e-01F; + float v292 = -8.0095404050658769e-01F; + float v298 = -1.3122634364424848e+00F; + float v299 = 1.3122634364424848e+00F; + float v341 = 8.6602540378443871e-01F; + float v342 = -8.6602540378443871e-01F; + float v348 = -1.0103629710818451e+00F; + float v349 = 1.0103629710818451e+00F; + float v355 = 6.8429557470759583e-01F; + float v356 = -6.8429557470759583e-01F; + float v362 = 4.8371214382601155e-02F; + float v363 = -4.8371214382601155e-02F; + float v369 = 6.3592436032499466e-01F; + float v370 = -6.3592436032499466e-01F; + float v377 = -3.8188130791298663e-01F; + float v381 = -2.9520461738277515e-01F; + float v385 = 4.6243103089499693e-01F; + float v389 = -7.5763564827777208e-01F; /* Input loads: one vld1s_s16 per sample at &v5[istride * k]. vld1s_s16 is not defined in this chunk; presumably it loads a single complex int16 sample into the low lanes - confirm. Each loaded value is widened with vmovl_s16 and converted with vcvt_n_f32_s32(..., 15), i.e. treated as fixed-point with 15 fractional bits. */ + int16x4_t v27 = vld1s_s16(&v5[0]); + int16x4_t v118 = vld1s_s16(&v5[istride]); + float32x2_t v179 = (float32x2_t){v178, v178}; + float32x2_t v183 = (float32x2_t){v182, v182}; + float32x2_t v187 = (float32x2_t){v186, v186}; + float32x2_t v191 = (float32x2_t){v190, v190}; + float32x2_t v195 = (float32x2_t){v193, v194}; + float32x2_t v202 = (float32x2_t){v200, v201}; + float32x2_t v209 = (float32x2_t){v207, v208}; + float32x2_t v216 = (float32x2_t){v214, v215}; + float32x2_t v259 = (float32x2_t){v258, v258}; + float32x2_t v263 = (float32x2_t){v262, v262}; + float32x2_t v267 = (float32x2_t){v266, v266}; + float32x2_t v271 = (float32x2_t){v270, v270}; + float32x2_t v275 = (float32x2_t){v274, v274}; + float32x2_t v279 = (float32x2_t){v277, v278}; + float32x2_t v286 = (float32x2_t){v284, v285}; + float32x2_t v293 =
(float32x2_t){v291, v292}; + float32x2_t v300 = (float32x2_t){v298, v299}; + float32x2_t v343 = (float32x2_t){v341, v342}; + float32x2_t v350 = (float32x2_t){v348, v349}; + float32x2_t v357 = (float32x2_t){v355, v356}; + float32x2_t v364 = (float32x2_t){v362, v363}; + float32x2_t v371 = (float32x2_t){v369, v370}; + float32x2_t v372 = (float32x2_t){v4, v4}; + float32x2_t v378 = (float32x2_t){v377, v377}; + float32x2_t v382 = (float32x2_t){v381, v381}; + float32x2_t v386 = (float32x2_t){v385, v385}; + float32x2_t v390 = (float32x2_t){v389, v389}; + int16x4_t v13 = vld1s_s16(&v5[istride * 7]); + int16x4_t v19 = vld1s_s16(&v5[istride * 14]); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + int16x4_t v34 = vld1s_s16(&v5[istride * 10]); + int16x4_t v40 = vld1s_s16(&v5[istride * 17]); + int16x4_t v48 = vld1s_s16(&v5[istride * 3]); + int16x4_t v55 = vld1s_s16(&v5[istride * 13]); + int16x4_t v61 = vld1s_s16(&v5[istride * 20]); + int16x4_t v69 = vld1s_s16(&v5[istride * 6]); + int16x4_t v76 = vld1s_s16(&v5[istride * 16]); + int16x4_t v82 = vld1s_s16(&v5[istride * 2]); + int16x4_t v90 = vld1s_s16(&v5[istride * 9]); + int16x4_t v97 = vld1s_s16(&v5[istride * 19]); + int16x4_t v103 = vld1s_s16(&v5[istride * 5]); + int16x4_t v111 = vld1s_s16(&v5[istride * 12]); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + int16x4_t v124 = vld1s_s16(&v5[istride * 8]); + int16x4_t v132 = vld1s_s16(&v5[istride * 15]); + int16x4_t v139 = vld1s_s16(&v5[istride * 4]); + int16x4_t v145 = vld1s_s16(&v5[istride * 11]); + int16x4_t v153 = vld1s_s16(&v5[istride * 18]); + float32x2_t v197 = vmul_f32(v372, v195); + float32x2_t v204 = vmul_f32(v372, v202); + float32x2_t v211 = vmul_f32(v372, v209); + float32x2_t v218 = vmul_f32(v372, v216); + float32x2_t v281 = vmul_f32(v372, v279); + float32x2_t v288 = vmul_f32(v372, v286); + float32x2_t v295 = vmul_f32(v372, v293); + float32x2_t v302 = vmul_f32(v372, v300); + float32x2_t v345 = vmul_f32(v372, v343); +
float32x2_t v352 = vmul_f32(v372, v350); + float32x2_t v359 = vmul_f32(v372, v357); + float32x2_t v366 = vmul_f32(v372, v364); + float32x2_t v373 = vmul_f32(v372, v371); + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v140 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v146 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v145)), 15); + float32x2_t v154 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v153)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119,
v125); + float32x2_t v147 = vadd_f32(v140, v146); + float32x2_t v148 = vsub_f32(v140, v146); + float32x2_t v29 = vadd_f32(v21, v28); + float32x2_t v50 = vadd_f32(v42, v49); + float32x2_t v71 = vadd_f32(v63, v70); + float32x2_t v92 = vadd_f32(v84, v91); + float32x2_t v113 = vadd_f32(v105, v112); + float32x2_t v134 = vadd_f32(v126, v133); + float32x2_t v155 = vadd_f32(v147, v154); + float32x2_t v240 = vadd_f32(v42, v147); + float32x2_t v241 = vsub_f32(v42, v147); + float32x2_t v242 = vadd_f32(v105, v84); + float32x2_t v243 = vsub_f32(v105, v84); + float32x2_t v244 = vadd_f32(v63, v126); + float32x2_t v245 = vsub_f32(v63, v126); + float32x2_t v324 = vadd_f32(v43, v148); + float32x2_t v325 = vsub_f32(v43, v148); + float32x2_t v326 = vadd_f32(v106, v85); + float32x2_t v327 = vsub_f32(v106, v85); + float32x2_t v328 = vadd_f32(v64, v127); + float32x2_t v329 = vsub_f32(v64, v127); + float32x2_t v156 = vadd_f32(v50, v155); + float32x2_t v157 = vsub_f32(v50, v155); + float32x2_t v158 = vadd_f32(v113, v92); + float32x2_t v159 = vsub_f32(v113, v92); + float32x2_t v160 = vadd_f32(v71, v134); + float32x2_t v161 = vsub_f32(v71, v134); + float32x2_t v246 = vadd_f32(v240, v242); + float32x2_t v249 = vsub_f32(v240, v242); + float32x2_t v250 = vsub_f32(v242, v244); + float32x2_t v251 = vsub_f32(v244, v240); + float32x2_t v252 = vadd_f32(v241, v243); + float32x2_t v254 = vsub_f32(v241, v243); + float32x2_t v255 = vsub_f32(v243, v245); + float32x2_t v256 = vsub_f32(v245, v241); + float32x2_t v330 = vadd_f32(v324, v326); + float32x2_t v333 = vsub_f32(v324, v326); + float32x2_t v334 = vsub_f32(v326, v328); + float32x2_t v335 = vsub_f32(v328, v324); + float32x2_t v336 = vadd_f32(v325, v327); + float32x2_t v338 = vsub_f32(v325, v327); + float32x2_t v339 = vsub_f32(v327, v329); + float32x2_t v340 = vsub_f32(v329, v325); + float32x2_t v162 = vadd_f32(v156, v158); + float32x2_t v165 = vsub_f32(v156, v158); + float32x2_t v166 = vsub_f32(v158, v160); + float32x2_t v167 = vsub_f32(v160, v156); +
float32x2_t v168 = vadd_f32(v157, v159); + float32x2_t v170 = vsub_f32(v157, v159); + float32x2_t v171 = vsub_f32(v159, v161); + float32x2_t v172 = vsub_f32(v161, v157); + float32x2_t v247 = vadd_f32(v246, v244); + float32x2_t v253 = vadd_f32(v252, v245); + float32x2_t v268 = vmul_f32(v249, v267); + float32x2_t v272 = vmul_f32(v250, v271); + float32x2_t v276 = vmul_f32(v251, v275); + float32x2_t v289 = vrev64_f32(v254); + float32x2_t v296 = vrev64_f32(v255); + float32x2_t v303 = vrev64_f32(v256); + float32x2_t v331 = vadd_f32(v330, v328); + float32x2_t v337 = vadd_f32(v336, v329); + float32x2_t v360 = vrev64_f32(v333); + float32x2_t v367 = vrev64_f32(v334); + float32x2_t v374 = vrev64_f32(v335); + float32x2_t v383 = vmul_f32(v338, v382); + float32x2_t v387 = vmul_f32(v339, v386); + float32x2_t v391 = vmul_f32(v340, v390); + float32x2_t v163 = vadd_f32(v162, v160); + float32x2_t v169 = vadd_f32(v168, v161); + float32x2_t v184 = vmul_f32(v165, v183); + float32x2_t v188 = vmul_f32(v166, v187); + float32x2_t v192 = vmul_f32(v167, v191); + float32x2_t v205 = vrev64_f32(v170); + float32x2_t v212 = vrev64_f32(v171); + float32x2_t v219 = vrev64_f32(v172); + float32x2_t v248 = vadd_f32(v247, v21); + float32x2_t v264 = vmul_f32(v247, v263); + float32x2_t v282 = vrev64_f32(v253); + float32x2_t v290 = vmul_f32(v289, v288); + float32x2_t v297 = vmul_f32(v296, v295); + float32x2_t v304 = vmul_f32(v303, v302); + float32x2_t v332 = vadd_f32(v331, v22); + float32x2_t v353 = vrev64_f32(v331); + float32x2_t v361 = vmul_f32(v360, v359); + float32x2_t v368 = vmul_f32(v367, v366); + float32x2_t v375 = vmul_f32(v374, v373); + float32x2_t v379 = vmul_f32(v337, v378); + float32x2_t v164 = vadd_f32(v163, v29); + float32x2_t v180 = vmul_f32(v163, v179); + float32x2_t v198 = vrev64_f32(v169); + float32x2_t v206 = vmul_f32(v205, v204); + float32x2_t v213 = vmul_f32(v212, v211); + float32x2_t v220 = vmul_f32(v219, v218); + float32x2_t v260 = vmul_f32(v248, v259); + float32x2_t v283 =
vmul_f32(v282, v281); + float32x2_t v346 = vrev64_f32(v332); + float32x2_t v354 = vmul_f32(v353, v352); + float32x2_t v399 = vadd_f32(v379, v383); + float32x2_t v401 = vsub_f32(v379, v383); + float32x2_t v403 = vsub_f32(v379, v387); + float32x2_t v199 = vmul_f32(v198, v197); + float32x2_t v221 = vadd_f32(v164, v180); + float32x2_t v305 = vadd_f32(v260, v264); + float32x2_t v312 = vadd_f32(v283, v290); + float32x2_t v314 = vsub_f32(v283, v290); + float32x2_t v316 = vsub_f32(v283, v297); + float32x2_t v347 = vmul_f32(v346, v345); + float32x2_t v400 = vadd_f32(v399, v387); + float32x2_t v402 = vsub_f32(v401, v391); + float32x2_t v404 = vadd_f32(v403, v391); + float32x2_t v411 = vadd_f32(v164, v260); /* Output conversion, repeated for every result: vcvt_n_s32_f32(..., 15) turns the float pair back into fixed-point with 15 fractional bits, vqmovn_s32 narrows to int16 with saturation (the upper half of the combined vector is zero padding), and lane 0 of the s32 reinterpretation carries both int16 lanes of one complex sample into v6[ostride * k]. Stores are interleaved with the remaining arithmetic. */ + int16x4_t v416 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v164, 15), (int32x2_t){0, 0})); + float32x2_t v222 = vadd_f32(v221, v184); + float32x2_t v224 = vsub_f32(v221, v184); + float32x2_t v226 = vsub_f32(v221, v188); + float32x2_t v228 = vadd_f32(v199, v206); + float32x2_t v230 = vsub_f32(v199, v206); + float32x2_t v232 = vsub_f32(v199, v213); + float32x2_t v306 = vadd_f32(v305, v268); + float32x2_t v308 = vsub_f32(v305, v268); + float32x2_t v310 = vsub_f32(v305, v272); + float32x2_t v313 = vadd_f32(v312, v297); + float32x2_t v315 = vsub_f32(v314, v304); + float32x2_t v317 = vadd_f32(v316, v304); + float32x2_t v392 = vadd_f32(v347, v354); + float32x2_t v412 = vadd_f32(v411, v347); + float32x2_t v413 = vsub_f32(v411, v347); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v416), 0); + float32x2_t v223 = vadd_f32(v222, v188); + float32x2_t v225 = vsub_f32(v224, v192); + float32x2_t v227 = vadd_f32(v226, v192); + float32x2_t v229 = vadd_f32(v228, v213); + float32x2_t v231 = vsub_f32(v230, v220); + float32x2_t v233 = vadd_f32(v232, v220); + float32x2_t v307 = vadd_f32(v306, v272); + float32x2_t v309 = vsub_f32(v308, v276); + float32x2_t v311 = vadd_f32(v310, v276); + float32x2_t v393 = vadd_f32(v392, v361); + float32x2_t v395 = vsub_f32(v392, v361); + float32x2_t v397 =
vsub_f32(v392, v368); + int16x4_t v422 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v413, 15), (int32x2_t){0, 0})); + int16x4_t v428 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v412, 15), (int32x2_t){0, 0})); + float32x2_t v234 = vadd_f32(v223, v229); + float32x2_t v235 = vsub_f32(v223, v229); + float32x2_t v236 = vadd_f32(v225, v231); + float32x2_t v237 = vsub_f32(v225, v231); + float32x2_t v238 = vadd_f32(v227, v233); + float32x2_t v239 = vsub_f32(v227, v233); + float32x2_t v318 = vadd_f32(v307, v313); + float32x2_t v319 = vsub_f32(v307, v313); + float32x2_t v320 = vadd_f32(v309, v315); + float32x2_t v321 = vsub_f32(v309, v315); + float32x2_t v322 = vadd_f32(v311, v317); + float32x2_t v323 = vsub_f32(v311, v317); + float32x2_t v394 = vadd_f32(v393, v368); + float32x2_t v396 = vsub_f32(v395, v375); + float32x2_t v398 = vadd_f32(v397, v375); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v422), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v428), 0); + float32x2_t v405 = vadd_f32(v394, v400); + float32x2_t v406 = vsub_f32(v394, v400); + float32x2_t v407 = vadd_f32(v396, v402); + float32x2_t v408 = vsub_f32(v396, v402); + float32x2_t v409 = vadd_f32(v398, v404); + float32x2_t v410 = vsub_f32(v398, v404); + float32x2_t v432 = vadd_f32(v235, v319); + int16x4_t v437 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v235, 15), (int32x2_t){0, 0})); + float32x2_t v453 = vadd_f32(v237, v321); + int16x4_t v458 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v237, 15), (int32x2_t){0, 0})); + float32x2_t v474 = vadd_f32(v238, v322); + int16x4_t v479 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v238, 15), (int32x2_t){0, 0})); + float32x2_t v495 = vadd_f32(v239, v323); + int16x4_t v500 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v239, 15), (int32x2_t){0, 0})); + float32x2_t v516 = vadd_f32(v236, v320); + int16x4_t v521 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v236, 15), (int32x2_t){0, 0})); + float32x2_t v537 = vadd_f32(v234, v318); + int16x4_t v542 = +
vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v234, 15), (int32x2_t){0, 0})); + float32x2_t v433 = vadd_f32(v432, v406); + float32x2_t v434 = vsub_f32(v432, v406); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v437), 0); + float32x2_t v454 = vadd_f32(v453, v408); + float32x2_t v455 = vsub_f32(v453, v408); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v458), 0); + float32x2_t v475 = vadd_f32(v474, v409); + float32x2_t v476 = vsub_f32(v474, v409); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v479), 0); + float32x2_t v496 = vadd_f32(v495, v410); + float32x2_t v497 = vsub_f32(v495, v410); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v500), 0); + float32x2_t v517 = vadd_f32(v516, v407); + float32x2_t v518 = vsub_f32(v516, v407); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v521), 0); + float32x2_t v538 = vadd_f32(v537, v405); + float32x2_t v539 = vsub_f32(v537, v405); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v542), 0); + int16x4_t v443 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v434, 15), (int32x2_t){0, 0})); + int16x4_t v449 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v433, 15), (int32x2_t){0, 0})); + int16x4_t v464 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v455, 15), (int32x2_t){0, 0})); + int16x4_t v470 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v454, 15), (int32x2_t){0, 0})); + int16x4_t v485 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v476, 15), (int32x2_t){0, 0})); + int16x4_t v491 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v475, 15), (int32x2_t){0, 0})); + int16x4_t v506 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v497, 15), (int32x2_t){0, 0})); + int16x4_t v512 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v496, 15), (int32x2_t){0, 0})); + int16x4_t v527 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v518, 15), (int32x2_t){0, 0})); + int16x4_t v533 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v517, 15), (int32x2_t){0, 0})); + int16x4_t v548 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v539, 15), (int32x2_t){0, 0})); +
int16x4_t v554 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v538, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v443), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v449), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v464), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v470), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v485), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v491), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v506), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v512), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v527), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v533), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v548), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v554), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun21( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v226 = -1.1666666666666665e+00F; + float v231 = 7.9015646852540022e-01F; + float v236 = 5.5854267289647742e-02F; + float v241 = 7.3430220123575241e-01F; + float v246 = -4.4095855184409838e-01F; + float v253 = -3.4087293062393137e-01F; + float v260 = 5.3396936033772524e-01F; + float v267 = -8.7484229096165667e-01F; + float v310 = -1.4999999999999998e+00F; + float v315 = 1.7499999999999996e+00F; + float v320 = -1.1852347027881001e+00F; + float v325 = -8.3781400934471603e-02F; + float v330 = -1.1014533018536286e+00F; + float v335 = 6.6143782776614746e-01F; + float v342 = 5.1130939593589697e-01F; + float v349 = -8.0095404050658769e-01F; + float v356 = 1.3122634364424848e+00F; + float v399 =
-8.6602540378443871e-01F; + float v406 = 1.0103629710818451e+00F; + float v413 = -6.8429557470759583e-01F; + float v420 = -4.8371214382601155e-02F; + float v427 = -6.3592436032499466e-01F; + float v434 = -3.8188130791298663e-01F; + float v439 = -2.9520461738277515e-01F; + float v444 = 4.6243103089499693e-01F; + float v449 = -7.5763564827777208e-01F; + const int32_t *v803 = &v5[v0]; + int32_t *v921 = &v6[v2]; + int64_t v15 = v0 * 7; + int64_t v23 = v0 * 14; + int64_t v42 = v0 * 10; + int64_t v50 = v0 * 17; + int64_t v60 = v0 * 3; + int64_t v69 = v0 * 13; + int64_t v77 = v0 * 20; + int64_t v87 = v0 * 6; + int64_t v96 = v0 * 16; + int64_t v104 = v0 * 2; + int64_t v114 = v0 * 9; + int64_t v123 = v0 * 19; + int64_t v131 = v0 * 5; + int64_t v141 = v0 * 12; + int64_t v158 = v0 * 8; + int64_t v168 = v0 * 15; + int64_t v177 = v0 * 4; + int64_t v185 = v0 * 11; + int64_t v195 = v0 * 18; + float v249 = v4 * v246; + float v256 = v4 * v253; + float v263 = v4 * v260; + float v270 = v4 * v267; + float v338 = v4 * v335; + float v345 = v4 * v342; + float v352 = v4 * v349; + float v359 = v4 * v356; + float v402 = v4 * v399; + float v409 = v4 * v406; + float v416 = v4 * v413; + float v423 = v4 * v420; + float v430 = v4 * v427; + int64_t v484 = v2 * 7; + int64_t v492 = v2 * 14; + int64_t v503 = v2 * 15; + int64_t v519 = v2 * 8; + int64_t v530 = v2 * 9; + int64_t v538 = v2 * 16; + int64_t v546 = v2 * 2; + int64_t v557 = v2 * 3; + int64_t v565 = v2 * 10; + int64_t v573 = v2 * 17; + int64_t v584 = v2 * 18; + int64_t v592 = v2 * 4; + int64_t v600 = v2 * 11; + int64_t v611 = v2 * 12; + int64_t v619 = v2 * 19; + int64_t v627 = v2 * 5; + int64_t v638 = v2 * 6; + int64_t v646 = v2 * 13; + int64_t v654 = v2 * 20; + const int32_t *v686 = &v5[0]; + svfloat32_t v852 = svdup_n_f32(v226); + svfloat32_t v853 = svdup_n_f32(v231); + svfloat32_t v854 = svdup_n_f32(v236); + svfloat32_t v855 = svdup_n_f32(v241); + svfloat32_t v860 = svdup_n_f32(v310); + svfloat32_t v861 = svdup_n_f32(v315); + svfloat32_t 
v862 = svdup_n_f32(v320); + svfloat32_t v863 = svdup_n_f32(v325); + svfloat32_t v864 = svdup_n_f32(v330); + svfloat32_t v874 = svdup_n_f32(v434); + svfloat32_t v875 = svdup_n_f32(v439); + svfloat32_t v876 = svdup_n_f32(v444); + svfloat32_t v877 = svdup_n_f32(v449); + int32_t *v885 = &v6[0]; + svfloat32_t v156 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v803[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v667 = &v5[v15]; + const int32_t *v676 = &v5[v23]; + const int32_t *v695 = &v5[v42]; + const int32_t *v704 = &v5[v50]; + const int32_t *v713 = &v5[v60]; + const int32_t *v722 = &v5[v69]; + const int32_t *v731 = &v5[v77]; + const int32_t *v740 = &v5[v87]; + const int32_t *v749 = &v5[v96]; + const int32_t *v758 = &v5[v104]; + const int32_t *v767 = &v5[v114]; + const int32_t *v776 = &v5[v123]; + const int32_t *v785 = &v5[v131]; + const int32_t *v794 = &v5[v141]; + const int32_t *v812 = &v5[v158]; + const int32_t *v821 = &v5[v168]; + const int32_t *v830 = &v5[v177]; + const int32_t *v839 = &v5[v185]; + const int32_t *v848 = &v5[v195]; + svfloat32_t v856 = svdup_n_f32(v249); + svfloat32_t v857 = svdup_n_f32(v256); + svfloat32_t v858 = svdup_n_f32(v263); + svfloat32_t v859 = svdup_n_f32(v270); + svfloat32_t v865 = svdup_n_f32(v338); + svfloat32_t v866 = svdup_n_f32(v345); + svfloat32_t v867 = svdup_n_f32(v352); + svfloat32_t v868 = svdup_n_f32(v359); + svfloat32_t v869 = svdup_n_f32(v402); + svfloat32_t v870 = svdup_n_f32(v409); + svfloat32_t v871 = svdup_n_f32(v416); + svfloat32_t v872 = svdup_n_f32(v423); + svfloat32_t v873 = svdup_n_f32(v430); + int32_t *v894 = &v6[v484]; + int32_t *v903 = &v6[v492]; + int32_t *v912 = &v6[v503]; + int32_t *v930 = &v6[v519]; + int32_t *v939 = &v6[v530]; + int32_t *v948 = &v6[v538]; + int32_t *v957 = &v6[v546]; + int32_t *v966 = &v6[v557]; + int32_t *v975 = &v6[v565]; + int32_t *v984 = &v6[v573]; + int32_t *v993 = &v6[v584]; + int32_t *v1002 = &v6[v592]; + int32_t *v1011 = 
&v6[v600]; + int32_t *v1020 = &v6[v611]; + int32_t *v1029 = &v6[v619]; + int32_t *v1038 = &v6[v627]; + int32_t *v1047 = &v6[v638]; + int32_t *v1056 = &v6[v646]; + int32_t *v1065 = &v6[v654]; + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v686[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v667[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v676[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v48 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v695[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v56 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v704[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v66 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v713[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v722[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v731[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v740[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v102 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v749[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v110 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v758[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v120 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + 
svld1sh_s32(pred_full, (const int16_t *)&v767[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v776[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v785[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v794[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v164 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v812[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v174 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v821[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v183 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v830[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v191 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v839[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v201 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v848[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v48, v56); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), v48, v56); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v156, v164); + 
svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v156, v164); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v183, v191); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v183, v191); + svfloat32_t v40 = svadd_f32_x(svptrue_b32(), v30, v39); + svfloat32_t v67 = svadd_f32_x(svptrue_b32(), v57, v66); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v84, v93); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v111, v120); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v138, v147); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v165, v174); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v192, v201); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v57, v192); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v57, v192); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v138, v111); + svfloat32_t v295 = svsub_f32_x(svptrue_b32(), v138, v111); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v84, v165); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v84, v165); + svfloat32_t v381 = svadd_f32_x(svptrue_b32(), v58, v193); + svfloat32_t v382 = svsub_f32_x(svptrue_b32(), v58, v193); + svfloat32_t v383 = svadd_f32_x(svptrue_b32(), v139, v112); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v139, v112); + svfloat32_t v385 = svadd_f32_x(svptrue_b32(), v85, v166); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v85, v166); + svfloat32_t v203 = svadd_f32_x(svptrue_b32(), v67, v202); + svfloat32_t v204 = svsub_f32_x(svptrue_b32(), v67, v202); + svfloat32_t v205 = svadd_f32_x(svptrue_b32(), v148, v121); + svfloat32_t v206 = svsub_f32_x(svptrue_b32(), v148, v121); + svfloat32_t v207 = svadd_f32_x(svptrue_b32(), v94, v175); + svfloat32_t v208 = svsub_f32_x(svptrue_b32(), v94, v175); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v292, v294); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v292, v294); + svfloat32_t v302 = svsub_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v296, v292); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v293, v295); + 
svfloat32_t v306 = svsub_f32_x(svptrue_b32(), v293, v295); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v295, v297); + svfloat32_t v308 = svsub_f32_x(svptrue_b32(), v297, v293); + svfloat32_t v387 = svadd_f32_x(svptrue_b32(), v381, v383); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v381, v383); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v383, v385); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v385, v381); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v384, v386); + svfloat32_t v397 = svsub_f32_x(svptrue_b32(), v386, v382); + svfloat32_t v209 = svadd_f32_x(svptrue_b32(), v203, v205); + svfloat32_t v212 = svsub_f32_x(svptrue_b32(), v203, v205); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v205, v207); + svfloat32_t v214 = svsub_f32_x(svptrue_b32(), v207, v203); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v204, v206); + svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v204, v206); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v206, v208); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v208, v204); + svfloat32_t v299 = svadd_f32_x(svptrue_b32(), v298, v296); + svfloat32_t v305 = svadd_f32_x(svptrue_b32(), v304, v297); + svfloat32_t zero347 = svdup_n_f32(0); + svfloat32_t v347 = svcmla_f32_x(pred_full, zero347, v866, v306, 90); + svfloat32_t zero354 = svdup_n_f32(0); + svfloat32_t v354 = svcmla_f32_x(pred_full, zero354, v867, v307, 90); + svfloat32_t zero361 = svdup_n_f32(0); + svfloat32_t v361 = svcmla_f32_x(pred_full, zero361, v868, v308, 90); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v387, v385); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v386); + svfloat32_t zero418 = svdup_n_f32(0); + svfloat32_t v418 = svcmla_f32_x(pred_full, zero418, v871, v390, 90); + svfloat32_t zero425 = svdup_n_f32(0); + svfloat32_t v425 = svcmla_f32_x(pred_full, zero425, v872, v391, 90); + svfloat32_t zero432 = svdup_n_f32(0); + 
svfloat32_t v432 = svcmla_f32_x(pred_full, zero432, v873, v392, 90); + svfloat32_t v442 = svmul_f32_x(svptrue_b32(), v395, v875); + svfloat32_t v447 = svmul_f32_x(svptrue_b32(), v396, v876); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v209, v207); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v215, v208); + svfloat32_t zero258 = svdup_n_f32(0); + svfloat32_t v258 = svcmla_f32_x(pred_full, zero258, v857, v217, 90); + svfloat32_t zero265 = svdup_n_f32(0); + svfloat32_t v265 = svcmla_f32_x(pred_full, zero265, v858, v218, 90); + svfloat32_t zero272 = svdup_n_f32(0); + svfloat32_t v272 = svcmla_f32_x(pred_full, zero272, v859, v219, 90); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v299, v30); + svfloat32_t v318 = svmul_f32_x(svptrue_b32(), v299, v861); + svfloat32_t zero340 = svdup_n_f32(0); + svfloat32_t v340 = svcmla_f32_x(pred_full, zero340, v865, v305, 90); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v388, v31); + svfloat32_t v211 = svadd_f32_x(svptrue_b32(), v210, v40); + svfloat32_t zero251 = svdup_n_f32(0); + svfloat32_t v251 = svcmla_f32_x(pred_full, zero251, v856, v216, 90); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v340, v347); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v340, v347); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v340, v354); + svfloat32_t zero404 = svdup_n_f32(0); + svfloat32_t v404 = svcmla_f32_x(pred_full, zero404, v869, v389, 90); + svfloat32_t v460 = svmla_f32_x(pred_full, v442, v394, v874); + svfloat32_t v462 = svnmls_f32_x(pred_full, v442, v394, v874); + svfloat32_t v464 = svnmls_f32_x(pred_full, v447, v394, v874); + svfloat32_t v273 = svmla_f32_x(pred_full, v211, v210, v852); + svfloat32_t v280 = svadd_f32_x(svptrue_b32(), v251, v258); + svfloat32_t v282 = svsub_f32_x(svptrue_b32(), v251, v258); + svfloat32_t v284 = svsub_f32_x(svptrue_b32(), v251, v265); + svfloat32_t v362 = svmla_f32_x(pred_full, v318, v300, v860); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v369, v354); + svfloat32_t v372 = 
svsub_f32_x(svptrue_b32(), v371, v361); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v373, v361); + svfloat32_t v453 = svcmla_f32_x(pred_full, v404, v870, v388, 90); + svfloat32_t v461 = svmla_f32_x(pred_full, v460, v396, v876); + svfloat32_t v463 = svmls_f32_x(pred_full, v462, v397, v877); + svfloat32_t v465 = svmla_f32_x(pred_full, v464, v397, v877); + svfloat32_t v472 = svmla_f32_x(pred_full, v211, v300, v860); + svint16_t v477 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v211, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v274 = svmla_f32_x(pred_full, v273, v212, v853); + svfloat32_t v276 = svmls_f32_x(pred_full, v273, v212, v853); + svfloat32_t v278 = svmls_f32_x(pred_full, v273, v213, v854); + svfloat32_t v281 = svadd_f32_x(svptrue_b32(), v280, v265); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v282, v272); + svfloat32_t v285 = svadd_f32_x(svptrue_b32(), v284, v272); + svfloat32_t v363 = svmla_f32_x(pred_full, v362, v301, v862); + svfloat32_t v365 = svmls_f32_x(pred_full, v362, v301, v862); + svfloat32_t v367 = svmls_f32_x(pred_full, v362, v302, v863); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v453, v418); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v453, v418); + svfloat32_t v458 = svsub_f32_x(svptrue_b32(), v453, v425); + svfloat32_t v473 = svadd_f32_x(svptrue_b32(), v472, v404); + svfloat32_t v474 = svsub_f32_x(svptrue_b32(), v472, v404); + svst1w_u64(pred_full, (unsigned *)(v885), svreinterpret_u64_s16(v477)); + svfloat32_t v275 = svmla_f32_x(pred_full, v274, v213, v854); + svfloat32_t v277 = svmls_f32_x(pred_full, v276, v214, v855); + svfloat32_t v279 = svmla_f32_x(pred_full, v278, v214, v855); + svfloat32_t v364 = svmla_f32_x(pred_full, v363, v302, v863); + svfloat32_t v366 = svmls_f32_x(pred_full, v365, v303, v864); + svfloat32_t v368 = svmla_f32_x(pred_full, v367, v303, v864); + svfloat32_t v455 = 
svadd_f32_x(svptrue_b32(), v454, v425); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v456, v432); + svfloat32_t v459 = svadd_f32_x(svptrue_b32(), v458, v432); + svint16_t v485 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v474, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v493 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v473, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v286 = svadd_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v287 = svsub_f32_x(svptrue_b32(), v275, v281); + svfloat32_t v288 = svadd_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v289 = svsub_f32_x(svptrue_b32(), v277, v283); + svfloat32_t v290 = svadd_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v291 = svsub_f32_x(svptrue_b32(), v279, v285); + svfloat32_t v375 = svadd_f32_x(svptrue_b32(), v364, v370); + svfloat32_t v376 = svsub_f32_x(svptrue_b32(), v364, v370); + svfloat32_t v377 = svadd_f32_x(svptrue_b32(), v366, v372); + svfloat32_t v378 = svsub_f32_x(svptrue_b32(), v366, v372); + svfloat32_t v379 = svadd_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v380 = svsub_f32_x(svptrue_b32(), v368, v374); + svfloat32_t v466 = svadd_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v455, v461); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v457, v463); + svfloat32_t v469 = svsub_f32_x(svptrue_b32(), v457, v463); + svfloat32_t v470 = svadd_f32_x(svptrue_b32(), v459, v465); + svfloat32_t v471 = svsub_f32_x(svptrue_b32(), v459, v465); + svst1w_u64(pred_full, (unsigned *)(v894), svreinterpret_u64_s16(v485)); + svst1w_u64(pred_full, (unsigned *)(v903), svreinterpret_u64_s16(v493)); + svfloat32_t v499 = svadd_f32_x(svptrue_b32(), v287, v376); + svint16_t v504 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( 
+ pred_full, svmul_n_f32_x(pred_full, v287, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v526 = svadd_f32_x(svptrue_b32(), v289, v378); + svint16_t v531 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v289, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v553 = svadd_f32_x(svptrue_b32(), v290, v379); + svint16_t v558 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v290, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v580 = svadd_f32_x(svptrue_b32(), v291, v380); + svint16_t v585 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v291, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v607 = svadd_f32_x(svptrue_b32(), v288, v377); + svint16_t v612 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v288, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v634 = svadd_f32_x(svptrue_b32(), v286, v375); + svint16_t v639 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v286, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v500 = svadd_f32_x(svptrue_b32(), v499, v467); + svfloat32_t v501 = svsub_f32_x(svptrue_b32(), v499, v467); + svfloat32_t v527 = svadd_f32_x(svptrue_b32(), v526, v469); + svfloat32_t v528 = svsub_f32_x(svptrue_b32(), v526, v469); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v553, v470); + svfloat32_t v555 = svsub_f32_x(svptrue_b32(), v553, v470); + svfloat32_t v581 
= svadd_f32_x(svptrue_b32(), v580, v471); + svfloat32_t v582 = svsub_f32_x(svptrue_b32(), v580, v471); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v607, v468); + svfloat32_t v609 = svsub_f32_x(svptrue_b32(), v607, v468); + svfloat32_t v635 = svadd_f32_x(svptrue_b32(), v634, v466); + svfloat32_t v636 = svsub_f32_x(svptrue_b32(), v634, v466); + svst1w_u64(pred_full, (unsigned *)(v912), svreinterpret_u64_s16(v504)); + svst1w_u64(pred_full, (unsigned *)(v939), svreinterpret_u64_s16(v531)); + svst1w_u64(pred_full, (unsigned *)(v966), svreinterpret_u64_s16(v558)); + svst1w_u64(pred_full, (unsigned *)(v993), svreinterpret_u64_s16(v585)); + svst1w_u64(pred_full, (unsigned *)(v1020), svreinterpret_u64_s16(v612)); + svst1w_u64(pred_full, (unsigned *)(v1047), svreinterpret_u64_s16(v639)); + svint16_t v512 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v501, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v520 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v500, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v539 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v528, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v547 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v527, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v566 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v555, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v574 = svtbl_s16( + 
svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v554, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v593 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v582, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v601 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v581, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v620 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v609, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v628 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v608, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v647 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v636, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v655 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v635, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v921), svreinterpret_u64_s16(v512)); + svst1w_u64(pred_full, (unsigned *)(v930), svreinterpret_u64_s16(v520)); + svst1w_u64(pred_full, (unsigned *)(v948), svreinterpret_u64_s16(v539)); + svst1w_u64(pred_full, (unsigned *)(v957), svreinterpret_u64_s16(v547)); + svst1w_u64(pred_full, (unsigned *)(v975), svreinterpret_u64_s16(v566)); + svst1w_u64(pred_full, (unsigned *)(v984), 
svreinterpret_u64_s16(v574)); + svst1w_u64(pred_full, (unsigned *)(v1002), svreinterpret_u64_s16(v593)); + svst1w_u64(pred_full, (unsigned *)(v1011), svreinterpret_u64_s16(v601)); + svst1w_u64(pred_full, (unsigned *)(v1029), svreinterpret_u64_s16(v620)); + svst1w_u64(pred_full, (unsigned *)(v1038), svreinterpret_u64_s16(v628)); + svst1w_u64(pred_full, (unsigned *)(v1056), svreinterpret_u64_s16(v647)); + svst1w_u64(pred_full, (unsigned *)(v1065), svreinterpret_u64_s16(v655)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* Neon build of the kernel (compiled only when ARMRAL_ARCH_SVE is not defined). Loads 22 samples from x at element offsets 0, istride, 2*istride, ..., 21*istride, converts each from fixed point with 15 fractional bits to float, combines them with the constants defined below, converts the 22 results back to 15-fractional-bit fixed point with saturating narrowing to int16, and stores them to y at element offsets 0, ostride, ..., 21*ostride. x and y are accessed through int32_t pointers, so the strides count 32-bit elements (presumably one complex int16 sample each - confirm against armral_cmplx_int16_t). dir scales the sign-paired constants only. howmany is not referenced in this body. NOTE(review): the name suffix suggests a length-22 FFT; vld1s_s16 is declared outside this section and is presumably a single-sample load helper - confirm both. */ +void armral_fft_cs16_cf32_cs16_ac_n_uun22( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v403 = 1.1000000000000001e+00F; + float v406 = 3.3166247903554003e-01F; + float v407 = -3.3166247903554003e-01F; + float v414 = 5.1541501300188641e-01F; + float v418 = 9.4125353283118118e-01F; + float v422 = 1.4143537075597825e+00F; + float v426 = 8.5949297361449750e-01F; + float v430 = 4.2314838273285138e-02F; + float v434 = 3.8639279888589606e-01F; + float v438 = 5.1254589567200015e-01F; + float v442 = 1.0702757469471715e+00F; + float v446 = 5.5486073394528512e-01F; + float v449 = 1.2412944743900585e+00F; + float v450 = -1.2412944743900585e+00F; + float v456 = 2.0897833842005756e-01F; + float v457 = -2.0897833842005756e-01F; + float v463 = 3.7415717312460811e-01F; + float v464 = -3.7415717312460811e-01F; + float v470 = 4.9929922194110327e-02F; + float v471 = -4.9929922194110327e-02F; + float v477 = 6.5815896284539266e-01F; + float v478 = -6.5815896284539266e-01F; + float v484 = 6.3306543373877577e-01F; + float v485 = -6.3306543373877577e-01F; + float v491 = 1.0822460581641109e+00F; + float v492 = -1.0822460581641109e+00F; + float v498 = 8.1720737907134022e-01F; + float v499 = -8.1720737907134022e-01F; + float v505 = 4.2408709531871824e-01F; + float v506 = -4.2408709531871824e-01F; +
int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v103 = vld1s_s16(&v5[istride]); /* Pack the scalar constants into two-lane vectors: constants declared as a c / -c pair become {c, -c}, all others are duplicated into both lanes as {c, c}. */ + float32x2_t v404 = (float32x2_t){v403, v403}; + float32x2_t v408 = (float32x2_t){v406, v407}; + float32x2_t v415 = (float32x2_t){v414, v414}; + float32x2_t v419 = (float32x2_t){v418, v418}; + float32x2_t v423 = (float32x2_t){v422, v422}; + float32x2_t v427 = (float32x2_t){v426, v426}; + float32x2_t v431 = (float32x2_t){v430, v430}; + float32x2_t v435 = (float32x2_t){v434, v434}; + float32x2_t v439 = (float32x2_t){v438, v438}; + float32x2_t v443 = (float32x2_t){v442, v442}; + float32x2_t v447 = (float32x2_t){v446, v446}; + float32x2_t v451 = (float32x2_t){v449, v450}; + float32x2_t v458 = (float32x2_t){v456, v457}; + float32x2_t v465 = (float32x2_t){v463, v464}; + float32x2_t v472 = (float32x2_t){v470, v471}; + float32x2_t v479 = (float32x2_t){v477, v478}; + float32x2_t v486 = (float32x2_t){v484, v485}; + float32x2_t v493 = (float32x2_t){v491, v492}; + float32x2_t v500 = (float32x2_t){v498, v499}; + float32x2_t v507 = (float32x2_t){v505, v506}; + float32x2_t v508 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 11]); + int16x4_t v27 = vld1s_s16(&v5[istride * 2]); + int16x4_t v33 = vld1s_s16(&v5[istride * 13]); + int16x4_t v41 = vld1s_s16(&v5[istride * 4]); + int16x4_t v47 = vld1s_s16(&v5[istride * 15]); + int16x4_t v55 = vld1s_s16(&v5[istride * 6]); + int16x4_t v61 = vld1s_s16(&v5[istride * 17]); + int16x4_t v69 = vld1s_s16(&v5[istride * 8]); + int16x4_t v75 = vld1s_s16(&v5[istride * 19]); + int16x4_t v83 = vld1s_s16(&v5[istride * 10]); + int16x4_t v89 = vld1s_s16(&v5[istride * 21]); + int16x4_t v97 = vld1s_s16(&v5[istride * 12]); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + int16x4_t v111 = vld1s_s16(&v5[istride * 14]); + int16x4_t v117 = vld1s_s16(&v5[istride * 3]); + int16x4_t v125 = vld1s_s16(&v5[istride * 16]); + int16x4_t v131 = vld1s_s16(&v5[istride * 5]); +
int16x4_t v139 = vld1s_s16(&v5[istride * 18]); + int16x4_t v145 = vld1s_s16(&v5[istride * 7]); + int16x4_t v153 = vld1s_s16(&v5[istride * 20]); + int16x4_t v159 = vld1s_s16(&v5[istride * 9]); /* Scale every {c, -c} multiplier by dir. These products are only ever applied to lane-swapped (vrev64_f32) operands further down, so an operand (a, b) becomes (c*dir*b, -c*dir*a). */ + float32x2_t v410 = vmul_f32(v508, v408); + float32x2_t v453 = vmul_f32(v508, v451); + float32x2_t v460 = vmul_f32(v508, v458); + float32x2_t v467 = vmul_f32(v508, v465); + float32x2_t v474 = vmul_f32(v508, v472); + float32x2_t v481 = vmul_f32(v508, v479); + float32x2_t v488 = vmul_f32(v508, v486); + float32x2_t v495 = vmul_f32(v508, v493); + float32x2_t v502 = vmul_f32(v508, v500); + float32x2_t v509 = vmul_f32(v508, v507); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v42 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v41)), 15); + float32x2_t v48 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v47)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v76 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v75)), 15); + float32x2_t v84 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v83)), 15); + float32x2_t v90 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v89)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v118 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v117)), 15); + float32x2_t v126 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v125)), 15); + float32x2_t v132 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v131)), 15); + float32x2_t v140 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v146 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v145)), 15); + float32x2_t v154 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v153)), 15); + float32x2_t v160 =
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v159)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v49 = vadd_f32(v42, v48); + float32x2_t v50 = vsub_f32(v42, v48); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v77 = vadd_f32(v70, v76); + float32x2_t v78 = vsub_f32(v70, v76); + float32x2_t v91 = vadd_f32(v84, v90); + float32x2_t v92 = vsub_f32(v84, v90); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v119 = vadd_f32(v112, v118); + float32x2_t v120 = vsub_f32(v112, v118); + float32x2_t v133 = vadd_f32(v126, v132); + float32x2_t v134 = vsub_f32(v126, v132); + float32x2_t v147 = vadd_f32(v140, v146); + float32x2_t v148 = vsub_f32(v140, v146); + float32x2_t v161 = vadd_f32(v154, v160); + float32x2_t v162 = vsub_f32(v154, v160); + float32x2_t v163 = vadd_f32(v35, v161); + float32x2_t v164 = vadd_f32(v49, v147); + float32x2_t v165 = vadd_f32(v63, v133); + float32x2_t v166 = vadd_f32(v77, v119); + float32x2_t v167 = vadd_f32(v91, v105); + float32x2_t v168 = vsub_f32(v35, v161); + float32x2_t v169 = vsub_f32(v49, v147); + float32x2_t v170 = vsub_f32(v63, v133); + float32x2_t v171 = vsub_f32(v77, v119); + float32x2_t v172 = vsub_f32(v91, v105); + float32x2_t v361 = vadd_f32(v36, v162); + float32x2_t v362 = vadd_f32(v50, v148); + float32x2_t v363 = vadd_f32(v64, v134); + float32x2_t v364 = vadd_f32(v78, v120); + float32x2_t v365 = vadd_f32(v92, v106); + float32x2_t v366 = vsub_f32(v36, v162); + float32x2_t v367 = vsub_f32(v50, v148); + float32x2_t v368 = vsub_f32(v64, v134); + float32x2_t v369 = vsub_f32(v78, v120); + float32x2_t v370 = vsub_f32(v92, v106); + float32x2_t v173 = vadd_f32(v163, v164); + float32x2_t v174 = vadd_f32(v165, v167); + float32x2_t v176 = vsub_f32(v169, v170); + float32x2_t v177 = vadd_f32(v168, v172); + float32x2_t v182 =
vsub_f32(v164, v166); + float32x2_t v183 = vsub_f32(v163, v166); + float32x2_t v184 = vsub_f32(v164, v163); + float32x2_t v185 = vsub_f32(v167, v166); + float32x2_t v186 = vsub_f32(v165, v166); + float32x2_t v187 = vsub_f32(v167, v165); + float32x2_t v188 = vsub_f32(v164, v167); + float32x2_t v189 = vsub_f32(v163, v165); + float32x2_t v191 = vadd_f32(v169, v171); + float32x2_t v192 = vsub_f32(v168, v171); + float32x2_t v193 = vadd_f32(v168, v169); + float32x2_t v194 = vsub_f32(v171, v172); + float32x2_t v195 = vsub_f32(v170, v171); + float32x2_t v196 = vsub_f32(v170, v172); + float32x2_t v197 = vadd_f32(v169, v172); + float32x2_t v198 = vsub_f32(v168, v170); + float32x2_t v371 = vadd_f32(v361, v362); + float32x2_t v372 = vadd_f32(v363, v365); + float32x2_t v374 = vsub_f32(v367, v368); + float32x2_t v375 = vadd_f32(v366, v370); + float32x2_t v380 = vsub_f32(v362, v364); + float32x2_t v381 = vsub_f32(v361, v364); + float32x2_t v382 = vsub_f32(v362, v361); + float32x2_t v383 = vsub_f32(v365, v364); + float32x2_t v384 = vsub_f32(v363, v364); + float32x2_t v385 = vsub_f32(v365, v363); + float32x2_t v386 = vsub_f32(v362, v365); + float32x2_t v387 = vsub_f32(v361, v363); + float32x2_t v389 = vadd_f32(v367, v369); + float32x2_t v390 = vsub_f32(v366, v369); + float32x2_t v391 = vadd_f32(v366, v367); + float32x2_t v392 = vsub_f32(v369, v370); + float32x2_t v393 = vsub_f32(v368, v369); + float32x2_t v394 = vsub_f32(v368, v370); + float32x2_t v395 = vadd_f32(v367, v370); + float32x2_t v396 = vsub_f32(v366, v368); + float32x2_t v175 = vadd_f32(v166, v173); + float32x2_t v180 = vsub_f32(v176, v177); + float32x2_t v190 = vsub_f32(v174, v173); + float32x2_t v199 = vadd_f32(v176, v177); + float32x2_t v218 = vmul_f32(v182, v415); + float32x2_t v222 = vmul_f32(v183, v419); + float32x2_t v226 = vmul_f32(v184, v423); + float32x2_t v230 = vmul_f32(v185, v427); + float32x2_t v234 = vmul_f32(v186, v431); + float32x2_t v238 = vmul_f32(v187, v435); + float32x2_t v242 = vmul_f32(v188, v439);
+ float32x2_t v246 = vmul_f32(v189, v443); + float32x2_t v256 = vrev64_f32(v191); + float32x2_t v263 = vrev64_f32(v192); + float32x2_t v270 = vrev64_f32(v193); + float32x2_t v277 = vrev64_f32(v194); + float32x2_t v284 = vrev64_f32(v195); + float32x2_t v291 = vrev64_f32(v196); + float32x2_t v298 = vrev64_f32(v197); + float32x2_t v305 = vrev64_f32(v198); + float32x2_t v373 = vadd_f32(v364, v371); + float32x2_t v378 = vsub_f32(v374, v375); + float32x2_t v388 = vsub_f32(v372, v371); + float32x2_t v397 = vadd_f32(v374, v375); + float32x2_t v416 = vmul_f32(v380, v415); + float32x2_t v420 = vmul_f32(v381, v419); + float32x2_t v424 = vmul_f32(v382, v423); + float32x2_t v428 = vmul_f32(v383, v427); + float32x2_t v432 = vmul_f32(v384, v431); + float32x2_t v436 = vmul_f32(v385, v435); + float32x2_t v440 = vmul_f32(v386, v439); + float32x2_t v444 = vmul_f32(v387, v443); + float32x2_t v454 = vrev64_f32(v389); + float32x2_t v461 = vrev64_f32(v390); + float32x2_t v468 = vrev64_f32(v391); + float32x2_t v475 = vrev64_f32(v392); + float32x2_t v482 = vrev64_f32(v393); + float32x2_t v489 = vrev64_f32(v394); + float32x2_t v496 = vrev64_f32(v395); + float32x2_t v503 = vrev64_f32(v396); + float32x2_t v178 = vadd_f32(v175, v174); + float32x2_t v181 = vsub_f32(v180, v171); + float32x2_t v250 = vmul_f32(v190, v447); + float32x2_t v257 = vmul_f32(v256, v453); + float32x2_t v264 = vmul_f32(v263, v460); + float32x2_t v271 = vmul_f32(v270, v467); + float32x2_t v278 = vmul_f32(v277, v474); + float32x2_t v285 = vmul_f32(v284, v481); + float32x2_t v292 = vmul_f32(v291, v488); + float32x2_t v299 = vmul_f32(v298, v495); + float32x2_t v306 = vmul_f32(v305, v502); + float32x2_t v312 = vrev64_f32(v199); + float32x2_t v315 = vadd_f32(v218, v222); + float32x2_t v316 = vadd_f32(v222, v226); + float32x2_t v317 = vsub_f32(v218, v226); + float32x2_t v318 = vadd_f32(v230, v234); + float32x2_t v319 = vadd_f32(v234, v238); + float32x2_t v320 = vsub_f32(v230, v238); + float32x2_t v376 = vadd_f32(v373, v372); +
float32x2_t v379 = vsub_f32(v378, v369); + float32x2_t v448 = vmul_f32(v388, v447); + float32x2_t v455 = vmul_f32(v454, v453); + float32x2_t v462 = vmul_f32(v461, v460); + float32x2_t v469 = vmul_f32(v468, v467); + float32x2_t v476 = vmul_f32(v475, v474); + float32x2_t v483 = vmul_f32(v482, v481); + float32x2_t v490 = vmul_f32(v489, v488); + float32x2_t v497 = vmul_f32(v496, v495); + float32x2_t v504 = vmul_f32(v503, v502); + float32x2_t v510 = vrev64_f32(v397); + float32x2_t v513 = vadd_f32(v416, v420); + float32x2_t v514 = vadd_f32(v420, v424); + float32x2_t v515 = vsub_f32(v416, v424); + float32x2_t v516 = vadd_f32(v428, v432); + float32x2_t v517 = vadd_f32(v432, v436); + float32x2_t v518 = vsub_f32(v428, v436); + float32x2_t v179 = vadd_f32(v21, v178); + float32x2_t v207 = vmul_f32(v178, v404); + float32x2_t v213 = vrev64_f32(v181); + float32x2_t v313 = vmul_f32(v312, v509); + float32x2_t v321 = vadd_f32(v246, v250); + float32x2_t v322 = vadd_f32(v242, v250); + float32x2_t v323 = vadd_f32(v264, v271); + float32x2_t v324 = vsub_f32(v257, v271); + float32x2_t v325 = vadd_f32(v285, v292); + float32x2_t v326 = vsub_f32(v278, v292); + float32x2_t v377 = vadd_f32(v22, v376); + float32x2_t v405 = vmul_f32(v376, v404); + float32x2_t v411 = vrev64_f32(v379); + float32x2_t v511 = vmul_f32(v510, v509); + float32x2_t v519 = vadd_f32(v444, v448); + float32x2_t v520 = vadd_f32(v440, v448); + float32x2_t v521 = vadd_f32(v462, v469); + float32x2_t v522 = vsub_f32(v455, v469); + float32x2_t v523 = vadd_f32(v483, v490); + float32x2_t v524 = vsub_f32(v476, v490); + float32x2_t v214 = vmul_f32(v213, v410); + float32x2_t v314 = vsub_f32(v179, v207); + float32x2_t v327 = vadd_f32(v306, v313); + float32x2_t v328 = vsub_f32(v299, v313); + float32x2_t v329 = vadd_f32(v319, v321); + float32x2_t v347 = vadd_f32(v323, v324); + float32x2_t v412 = vmul_f32(v411, v410); + float32x2_t v512 = vsub_f32(v377, v405); + float32x2_t v525 = vadd_f32(v504, v511); + float32x2_t v526 = vsub_f32(v497,
v511); + float32x2_t v527 = vadd_f32(v517, v519); + float32x2_t v545 = vadd_f32(v521, v522); /* Output conversion used for every result below: vcvt_n_s32_f32(v, 15) produces fixed point with 15 fractional bits, vqmovn_s32 narrows to int16 with saturation (the upper two lanes are zero padding), and lane 0 of the int16x4_t viewed as int32 is written as one element of y. */ + int16x4_t v561 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v179, 15), (int32x2_t){0, 0})); + int16x4_t v567 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v377, 15), (int32x2_t){0, 0})); + float32x2_t v330 = vadd_f32(v329, v314); + float32x2_t v331 = vsub_f32(v314, v316); + float32x2_t v333 = vadd_f32(v314, v320); + float32x2_t v335 = vsub_f32(v314, v317); + float32x2_t v337 = vadd_f32(v314, v315); + float32x2_t v339 = vadd_f32(v214, v325); + float32x2_t v341 = vsub_f32(v327, v323); + float32x2_t v343 = vadd_f32(v214, v328); + float32x2_t v345 = vsub_f32(v328, v324); + float32x2_t v348 = vadd_f32(v347, v325); + float32x2_t v528 = vadd_f32(v527, v512); + float32x2_t v529 = vsub_f32(v512, v514); + float32x2_t v531 = vadd_f32(v512, v518); + float32x2_t v533 = vsub_f32(v512, v515); + float32x2_t v535 = vadd_f32(v512, v513); + float32x2_t v537 = vadd_f32(v412, v523); + float32x2_t v539 = vsub_f32(v525, v521); + float32x2_t v541 = vadd_f32(v412, v526); + float32x2_t v543 = vsub_f32(v526, v522); + float32x2_t v546 = vadd_f32(v545, v523); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v561), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v567), 0); + float32x2_t v332 = vsub_f32(v331, v321); + float32x2_t v334 = vadd_f32(v333, v322); + float32x2_t v336 = vsub_f32(v335, v322); + float32x2_t v338 = vsub_f32(v337, v318); + float32x2_t v340 = vadd_f32(v339, v327); + float32x2_t v342 = vsub_f32(v341, v214); + float32x2_t v344 = vadd_f32(v343, v326); + float32x2_t v346 = vsub_f32(v345, v214); + float32x2_t v349 = vadd_f32(v348, v326); + float32x2_t v530 = vsub_f32(v529, v519); + float32x2_t v532 = vadd_f32(v531, v520); + float32x2_t v534 = vsub_f32(v533, v520); + float32x2_t v536 = vsub_f32(v535, v516); + float32x2_t v538 = vadd_f32(v537, v525); + float32x2_t v540 = vsub_f32(v539, v412); + float32x2_t v542 = vadd_f32(v541, v524); + float32x2_t v544 = vsub_f32(v543, v412); +
float32x2_t v547 = vadd_f32(v546, v524); + float32x2_t v350 = vsub_f32(v349, v214); + float32x2_t v352 = vadd_f32(v330, v340); + float32x2_t v353 = vadd_f32(v332, v342); + float32x2_t v354 = vsub_f32(v334, v344); + float32x2_t v355 = vadd_f32(v336, v346); + float32x2_t v356 = vsub_f32(v336, v346); + float32x2_t v357 = vadd_f32(v334, v344); + float32x2_t v358 = vsub_f32(v332, v342); + float32x2_t v359 = vsub_f32(v330, v340); + float32x2_t v548 = vsub_f32(v547, v412); + float32x2_t v550 = vadd_f32(v528, v538); + float32x2_t v551 = vadd_f32(v530, v540); + float32x2_t v552 = vsub_f32(v532, v542); + float32x2_t v553 = vadd_f32(v534, v544); + float32x2_t v554 = vsub_f32(v534, v544); + float32x2_t v555 = vadd_f32(v532, v542); + float32x2_t v556 = vsub_f32(v530, v540); + float32x2_t v557 = vsub_f32(v528, v538); + float32x2_t v351 = vadd_f32(v338, v350); + float32x2_t v360 = vsub_f32(v338, v350); + float32x2_t v549 = vadd_f32(v536, v548); + float32x2_t v558 = vsub_f32(v536, v548); + int16x4_t v585 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v359, 15), (int32x2_t){0, 0})); + int16x4_t v591 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v557, 15), (int32x2_t){0, 0})); + int16x4_t v597 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v358, 15), (int32x2_t){0, 0})); + int16x4_t v603 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v556, 15), (int32x2_t){0, 0})); + int16x4_t v609 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v357, 15), (int32x2_t){0, 0})); + int16x4_t v615 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v555, 15), (int32x2_t){0, 0})); + int16x4_t v621 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v356, 15), (int32x2_t){0, 0})); + int16x4_t v627 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v554, 15), (int32x2_t){0, 0})); + int16x4_t v633 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v355, 15), (int32x2_t){0, 0})); + int16x4_t v639 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v553, 15), (int32x2_t){0, 0})); + int16x4_t v645 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v354, 15), (int32x2_t){0, 0}));
+ int16x4_t v651 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v552, 15), (int32x2_t){0, 0})); + int16x4_t v657 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v353, 15), (int32x2_t){0, 0})); + int16x4_t v663 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v551, 15), (int32x2_t){0, 0})); + int16x4_t v669 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v352, 15), (int32x2_t){0, 0})); + int16x4_t v675 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v550, 15), (int32x2_t){0, 0})); + int16x4_t v573 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v360, 15), (int32x2_t){0, 0})); + int16x4_t v579 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v558, 15), (int32x2_t){0, 0})); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v585), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v591), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v597), 0); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v603), 0); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v609), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v615), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v621), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v627), 0); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v633), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v639), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v645), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v651), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v657), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v663), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v669), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v675), 0); + int16x4_t v681 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v351, 15), (int32x2_t){0, 0})); + int16x4_t v687 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v549, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v573), 0); + v6[ostride] =
vget_lane_s32(vreinterpret_s32_s16(v579), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v681), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v687), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun22( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v464 = 1.1000000000000001e+00F; + float v469 = -3.3166247903554003e-01F; + float v476 = 5.1541501300188641e-01F; + float v481 = 9.4125353283118118e-01F; + float v486 = 1.4143537075597825e+00F; + float v491 = 8.5949297361449750e-01F; + float v496 = 4.2314838273285138e-02F; + float v501 = 3.8639279888589606e-01F; + float v506 = 5.1254589567200015e-01F; + float v511 = 1.0702757469471715e+00F; + float v516 = 5.5486073394528512e-01F; + float v521 = -1.2412944743900585e+00F; + float v528 = -2.0897833842005756e-01F; + float v535 = -3.7415717312460811e-01F; + float v542 = -4.9929922194110327e-02F; + float v549 = -6.5815896284539266e-01F; + float v556 = -6.3306543373877577e-01F; + float v563 = -1.0822460581641109e+00F; + float v570 = -8.1720737907134022e-01F; + float v577 = -4.2408709531871824e-01F; + const int32_t *v930 = &v5[v0]; + int32_t *v1081 = &v6[v2]; + int64_t v23 = v0 * 11; + int64_t v33 = v0 * 2; + int64_t v41 = v0 * 13; + int64_t v51 = v0 * 4; + int64_t v59 = v0 * 15; + int64_t v69 = v0 * 6; + int64_t v77 = v0 * 17; + int64_t v87 = v0 * 8; + int64_t v95 = v0 * 19; + int64_t v105 = v0 * 10; + int64_t v113 = v0 * 21; + int64_t v123 = v0 * 12; + int64_t v141 = v0 * 14; + int64_t v149 = v0 * 3; + int64_t v159 = v0 * 16; + int64_t v167 = v0 * 5; + int64_t v177 = v0 * 18; + int64_t v185 = v0 * 7; + int64_t v195 = v0 * 20; + int64_t v203 = v0 * 9; + float v472 = v4 * v469; + float v524 = v4 *
v521; + float v531 = v4 * v528; + float v538 = v4 * v535; + float v545 = v4 * v542; + float v552 = v4 * v549; + float v559 = v4 * v556; + float v566 = v4 * v563; + float v573 = v4 * v570; + float v580 = v4 * v577; + int64_t v639 = v2 * 11; + int64_t v647 = v2 * 12; + int64_t v663 = v2 * 2; + int64_t v671 = v2 * 13; + int64_t v679 = v2 * 14; + int64_t v687 = v2 * 3; + int64_t v695 = v2 * 4; + int64_t v703 = v2 * 15; + int64_t v711 = v2 * 16; + int64_t v719 = v2 * 5; + int64_t v727 = v2 * 6; + int64_t v735 = v2 * 17; + int64_t v743 = v2 * 18; + int64_t v751 = v2 * 7; + int64_t v759 = v2 * 8; + int64_t v767 = v2 * 19; + int64_t v775 = v2 * 20; + int64_t v783 = v2 * 9; + int64_t v791 = v2 * 10; + int64_t v799 = v2 * 21; + const int32_t *v813 = &v5[0]; + svfloat32_t v1027 = svdup_n_f32(v464); + svfloat32_t v1029 = svdup_n_f32(v476); + svfloat32_t v1030 = svdup_n_f32(v481); + svfloat32_t v1031 = svdup_n_f32(v486); + svfloat32_t v1032 = svdup_n_f32(v491); + svfloat32_t v1033 = svdup_n_f32(v496); + svfloat32_t v1034 = svdup_n_f32(v501); + svfloat32_t v1035 = svdup_n_f32(v506); + svfloat32_t v1036 = svdup_n_f32(v511); + svfloat32_t v1037 = svdup_n_f32(v516); + int32_t *v1054 = &v6[0]; + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v930[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v822 = &v5[v23]; + const int32_t *v831 = &v5[v33]; + const int32_t *v840 = &v5[v41]; + const int32_t *v849 = &v5[v51]; + const int32_t *v858 = &v5[v59]; + const int32_t *v867 = &v5[v69]; + const int32_t *v876 = &v5[v77]; + const int32_t *v885 = &v5[v87]; + const int32_t *v894 = &v5[v95]; + const int32_t *v903 = &v5[v105]; + const int32_t *v912 = &v5[v113]; + const int32_t *v921 = &v5[v123]; + const int32_t *v939 = &v5[v141]; + const int32_t *v948 = &v5[v149]; + const int32_t *v957 = &v5[v159]; + const int32_t *v966 = &v5[v167]; + const int32_t *v975 = &v5[v177]; + const int32_t *v984 = &v5[v185]; + const int32_t *v993 = 
&v5[v195]; + const int32_t *v1002 = &v5[v203]; + svfloat32_t v1028 = svdup_n_f32(v472); + svfloat32_t v1038 = svdup_n_f32(v524); + svfloat32_t v1039 = svdup_n_f32(v531); + svfloat32_t v1040 = svdup_n_f32(v538); + svfloat32_t v1041 = svdup_n_f32(v545); + svfloat32_t v1042 = svdup_n_f32(v552); + svfloat32_t v1043 = svdup_n_f32(v559); + svfloat32_t v1044 = svdup_n_f32(v566); + svfloat32_t v1045 = svdup_n_f32(v573); + svfloat32_t v1046 = svdup_n_f32(v580); + int32_t *v1063 = &v6[v639]; + int32_t *v1072 = &v6[v647]; + int32_t *v1090 = &v6[v663]; + int32_t *v1099 = &v6[v671]; + int32_t *v1108 = &v6[v679]; + int32_t *v1117 = &v6[v687]; + int32_t *v1126 = &v6[v695]; + int32_t *v1135 = &v6[v703]; + int32_t *v1144 = &v6[v711]; + int32_t *v1153 = &v6[v719]; + int32_t *v1162 = &v6[v727]; + int32_t *v1171 = &v6[v735]; + int32_t *v1180 = &v6[v743]; + int32_t *v1189 = &v6[v751]; + int32_t *v1198 = &v6[v759]; + int32_t *v1207 = &v6[v767]; + int32_t *v1216 = &v6[v775]; + int32_t *v1225 = &v6[v783]; + int32_t *v1234 = &v6[v791]; + int32_t *v1243 = &v6[v799]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v813[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v822[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v831[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v840[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v57 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v849[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v65 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v858[0])), + 1.F / (1ULL << 15ULL)); 
+ svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v867[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v876[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v885[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v101 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v894[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v111 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v903[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v119 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v912[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v921[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v939[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v155 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v948[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v165 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v957[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v173 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v966[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v183 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v975[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v191 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, 
(const int16_t *)&v984[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v201 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v993[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v209 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1002[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v66 = svadd_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v67 = svsub_f32_x(svptrue_b32(), v57, v65); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v102 = svadd_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v103 = svsub_f32_x(svptrue_b32(), v93, v101); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v111, v119); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v156 = svadd_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v157 = svsub_f32_x(svptrue_b32(), v147, v155); + svfloat32_t v174 = svadd_f32_x(svptrue_b32(), v165, v173); + svfloat32_t v175 = svsub_f32_x(svptrue_b32(), v165, v173); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v183, v191); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v183, v191); + svfloat32_t v210 = svadd_f32_x(svptrue_b32(), v201, v209); + svfloat32_t v211 = svsub_f32_x(svptrue_b32(), v201, v209); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v48, v210); + svfloat32_t v213 = svadd_f32_x(svptrue_b32(), v66, v192); + svfloat32_t v214 = svadd_f32_x(svptrue_b32(), v84, v174); + svfloat32_t v215 = svadd_f32_x(svptrue_b32(), v102, v156); + svfloat32_t v216 = svadd_f32_x(svptrue_b32(), v120, v138); + 
svfloat32_t v217 = svsub_f32_x(svptrue_b32(), v48, v210); + svfloat32_t v218 = svsub_f32_x(svptrue_b32(), v66, v192); + svfloat32_t v219 = svsub_f32_x(svptrue_b32(), v84, v174); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v102, v156); + svfloat32_t v221 = svsub_f32_x(svptrue_b32(), v120, v138); + svfloat32_t v421 = svadd_f32_x(svptrue_b32(), v49, v211); + svfloat32_t v422 = svadd_f32_x(svptrue_b32(), v67, v193); + svfloat32_t v423 = svadd_f32_x(svptrue_b32(), v85, v175); + svfloat32_t v424 = svadd_f32_x(svptrue_b32(), v103, v157); + svfloat32_t v425 = svadd_f32_x(svptrue_b32(), v121, v139); + svfloat32_t v426 = svsub_f32_x(svptrue_b32(), v49, v211); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v67, v193); + svfloat32_t v428 = svsub_f32_x(svptrue_b32(), v85, v175); + svfloat32_t v429 = svsub_f32_x(svptrue_b32(), v103, v157); + svfloat32_t v430 = svsub_f32_x(svptrue_b32(), v121, v139); + svfloat32_t v222 = svadd_f32_x(svptrue_b32(), v212, v213); + svfloat32_t v223 = svadd_f32_x(svptrue_b32(), v214, v216); + svfloat32_t v225 = svsub_f32_x(svptrue_b32(), v218, v219); + svfloat32_t v226 = svadd_f32_x(svptrue_b32(), v217, v221); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v213, v215); + svfloat32_t v232 = svsub_f32_x(svptrue_b32(), v212, v215); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v213, v212); + svfloat32_t v234 = svsub_f32_x(svptrue_b32(), v216, v215); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v214, v215); + svfloat32_t v236 = svsub_f32_x(svptrue_b32(), v216, v214); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v213, v216); + svfloat32_t v238 = svsub_f32_x(svptrue_b32(), v212, v214); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v218, v220); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v217, v220); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v217, v218); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v220, v221); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v219, v220); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v219, 
v221); + svfloat32_t v246 = svadd_f32_x(svptrue_b32(), v218, v221); + svfloat32_t v247 = svsub_f32_x(svptrue_b32(), v217, v219); + svfloat32_t v431 = svadd_f32_x(svptrue_b32(), v421, v422); + svfloat32_t v432 = svadd_f32_x(svptrue_b32(), v423, v425); + svfloat32_t v434 = svsub_f32_x(svptrue_b32(), v427, v428); + svfloat32_t v435 = svadd_f32_x(svptrue_b32(), v426, v430); + svfloat32_t v440 = svsub_f32_x(svptrue_b32(), v422, v424); + svfloat32_t v441 = svsub_f32_x(svptrue_b32(), v421, v424); + svfloat32_t v442 = svsub_f32_x(svptrue_b32(), v422, v421); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v425, v424); + svfloat32_t v444 = svsub_f32_x(svptrue_b32(), v423, v424); + svfloat32_t v445 = svsub_f32_x(svptrue_b32(), v425, v423); + svfloat32_t v446 = svsub_f32_x(svptrue_b32(), v422, v425); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v421, v423); + svfloat32_t v449 = svadd_f32_x(svptrue_b32(), v427, v429); + svfloat32_t v450 = svsub_f32_x(svptrue_b32(), v426, v429); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v426, v427); + svfloat32_t v452 = svsub_f32_x(svptrue_b32(), v429, v430); + svfloat32_t v453 = svsub_f32_x(svptrue_b32(), v428, v429); + svfloat32_t v454 = svsub_f32_x(svptrue_b32(), v428, v430); + svfloat32_t v455 = svadd_f32_x(svptrue_b32(), v427, v430); + svfloat32_t v456 = svsub_f32_x(svptrue_b32(), v426, v428); + svfloat32_t v224 = svadd_f32_x(svptrue_b32(), v215, v222); + svfloat32_t v229 = svsub_f32_x(svptrue_b32(), v225, v226); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v223, v222); + svfloat32_t v248 = svadd_f32_x(svptrue_b32(), v225, v226); + svfloat32_t v275 = svmul_f32_x(svptrue_b32(), v232, v1030); + svfloat32_t v280 = svmul_f32_x(svptrue_b32(), v233, v1031); + svfloat32_t v290 = svmul_f32_x(svptrue_b32(), v235, v1033); + svfloat32_t v295 = svmul_f32_x(svptrue_b32(), v236, v1034); + svfloat32_t zero317 = svdup_n_f32(0); + svfloat32_t v317 = svcmla_f32_x(pred_full, zero317, v1038, v240, 90); + svfloat32_t zero331 = svdup_n_f32(0); + 
svfloat32_t v331 = svcmla_f32_x(pred_full, zero331, v1040, v242, 90); + svfloat32_t zero338 = svdup_n_f32(0); + svfloat32_t v338 = svcmla_f32_x(pred_full, zero338, v1041, v243, 90); + svfloat32_t zero352 = svdup_n_f32(0); + svfloat32_t v352 = svcmla_f32_x(pred_full, zero352, v1043, v245, 90); + svfloat32_t zero359 = svdup_n_f32(0); + svfloat32_t v359 = svcmla_f32_x(pred_full, zero359, v1044, v246, 90); + svfloat32_t v433 = svadd_f32_x(svptrue_b32(), v424, v431); + svfloat32_t v438 = svsub_f32_x(svptrue_b32(), v434, v435); + svfloat32_t v448 = svsub_f32_x(svptrue_b32(), v432, v431); + svfloat32_t v457 = svadd_f32_x(svptrue_b32(), v434, v435); + svfloat32_t v484 = svmul_f32_x(svptrue_b32(), v441, v1030); + svfloat32_t v489 = svmul_f32_x(svptrue_b32(), v442, v1031); + svfloat32_t v499 = svmul_f32_x(svptrue_b32(), v444, v1033); + svfloat32_t v504 = svmul_f32_x(svptrue_b32(), v445, v1034); + svfloat32_t zero526 = svdup_n_f32(0); + svfloat32_t v526 = svcmla_f32_x(pred_full, zero526, v1038, v449, 90); + svfloat32_t zero540 = svdup_n_f32(0); + svfloat32_t v540 = svcmla_f32_x(pred_full, zero540, v1040, v451, 90); + svfloat32_t zero547 = svdup_n_f32(0); + svfloat32_t v547 = svcmla_f32_x(pred_full, zero547, v1041, v452, 90); + svfloat32_t zero561 = svdup_n_f32(0); + svfloat32_t v561 = svcmla_f32_x(pred_full, zero561, v1043, v454, 90); + svfloat32_t zero568 = svdup_n_f32(0); + svfloat32_t v568 = svcmla_f32_x(pred_full, zero568, v1044, v455, 90); + svfloat32_t v227 = svadd_f32_x(svptrue_b32(), v224, v223); + svfloat32_t v230 = svsub_f32_x(svptrue_b32(), v229, v220); + svfloat32_t v310 = svmul_f32_x(svptrue_b32(), v239, v1037); + svfloat32_t zero373 = svdup_n_f32(0); + svfloat32_t v373 = svcmla_f32_x(pred_full, zero373, v1046, v248, 90); + svfloat32_t v375 = svmla_f32_x(pred_full, v275, v231, v1029); + svfloat32_t v376 = svmla_f32_x(pred_full, v280, v232, v1030); + svfloat32_t v377 = svnmls_f32_x(pred_full, v280, v231, v1029); + svfloat32_t v378 = svmla_f32_x(pred_full, v290, 
v234, v1032); + svfloat32_t v379 = svmla_f32_x(pred_full, v295, v235, v1033); + svfloat32_t v380 = svnmls_f32_x(pred_full, v295, v234, v1032); + svfloat32_t v383 = svcmla_f32_x(pred_full, v331, v1039, v241, 90); + svfloat32_t v384 = svsub_f32_x(svptrue_b32(), v317, v331); + svfloat32_t v385 = svcmla_f32_x(pred_full, v352, v1042, v244, 90); + svfloat32_t v386 = svsub_f32_x(svptrue_b32(), v338, v352); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v433, v432); + svfloat32_t v439 = svsub_f32_x(svptrue_b32(), v438, v429); + svfloat32_t v519 = svmul_f32_x(svptrue_b32(), v448, v1037); + svfloat32_t zero582 = svdup_n_f32(0); + svfloat32_t v582 = svcmla_f32_x(pred_full, zero582, v1046, v457, 90); + svfloat32_t v584 = svmla_f32_x(pred_full, v484, v440, v1029); + svfloat32_t v585 = svmla_f32_x(pred_full, v489, v441, v1030); + svfloat32_t v586 = svnmls_f32_x(pred_full, v489, v440, v1029); + svfloat32_t v587 = svmla_f32_x(pred_full, v499, v443, v1032); + svfloat32_t v588 = svmla_f32_x(pred_full, v504, v444, v1033); + svfloat32_t v589 = svnmls_f32_x(pred_full, v504, v443, v1032); + svfloat32_t v592 = svcmla_f32_x(pred_full, v540, v1039, v450, 90); + svfloat32_t v593 = svsub_f32_x(svptrue_b32(), v526, v540); + svfloat32_t v594 = svcmla_f32_x(pred_full, v561, v1042, v453, 90); + svfloat32_t v595 = svsub_f32_x(svptrue_b32(), v547, v561); + svfloat32_t v228 = svadd_f32_x(svptrue_b32(), v30, v227); + svfloat32_t zero265 = svdup_n_f32(0); + svfloat32_t v265 = svcmla_f32_x(pred_full, zero265, v1028, v230, 90); + svfloat32_t v381 = svmla_f32_x(pred_full, v310, v238, v1036); + svfloat32_t v382 = svmla_f32_x(pred_full, v310, v237, v1035); + svfloat32_t v387 = svcmla_f32_x(pred_full, v373, v1045, v247, 90); + svfloat32_t v388 = svsub_f32_x(svptrue_b32(), v359, v373); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v383, v384); + svfloat32_t v437 = svadd_f32_x(svptrue_b32(), v31, v436); + svfloat32_t zero474 = svdup_n_f32(0); + svfloat32_t v474 = svcmla_f32_x(pred_full, zero474, v1028, 
v439, 90); + svfloat32_t v590 = svmla_f32_x(pred_full, v519, v447, v1036); + svfloat32_t v591 = svmla_f32_x(pred_full, v519, v446, v1035); + svfloat32_t v596 = svcmla_f32_x(pred_full, v582, v1045, v456, 90); + svfloat32_t v597 = svsub_f32_x(svptrue_b32(), v568, v582); + svfloat32_t v616 = svadd_f32_x(svptrue_b32(), v592, v593); + svfloat32_t v374 = svmls_f32_x(pred_full, v228, v227, v1027); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v379, v381); + svfloat32_t v399 = svadd_f32_x(svptrue_b32(), v265, v385); + svfloat32_t v401 = svsub_f32_x(svptrue_b32(), v387, v383); + svfloat32_t v403 = svadd_f32_x(svptrue_b32(), v265, v388); + svfloat32_t v405 = svsub_f32_x(svptrue_b32(), v388, v384); + svfloat32_t v408 = svadd_f32_x(svptrue_b32(), v407, v385); + svfloat32_t v583 = svmls_f32_x(pred_full, v437, v436, v1027); + svfloat32_t v598 = svadd_f32_x(svptrue_b32(), v588, v590); + svfloat32_t v608 = svadd_f32_x(svptrue_b32(), v474, v594); + svfloat32_t v610 = svsub_f32_x(svptrue_b32(), v596, v592); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v474, v597); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v597, v593); + svfloat32_t v617 = svadd_f32_x(svptrue_b32(), v616, v594); + svint16_t v632 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v228, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v640 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v437, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v390 = svadd_f32_x(svptrue_b32(), v389, v374); + svfloat32_t v391 = svsub_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v393 = svadd_f32_x(svptrue_b32(), v374, v380); + svfloat32_t v395 = svsub_f32_x(svptrue_b32(), v374, v377); + svfloat32_t v397 = svadd_f32_x(svptrue_b32(), v374, v375); + svfloat32_t v400 = 
svadd_f32_x(svptrue_b32(), v399, v387); + svfloat32_t v402 = svsub_f32_x(svptrue_b32(), v401, v265); + svfloat32_t v404 = svadd_f32_x(svptrue_b32(), v403, v386); + svfloat32_t v406 = svsub_f32_x(svptrue_b32(), v405, v265); + svfloat32_t v409 = svadd_f32_x(svptrue_b32(), v408, v386); + svfloat32_t v599 = svadd_f32_x(svptrue_b32(), v598, v583); + svfloat32_t v600 = svsub_f32_x(svptrue_b32(), v583, v585); + svfloat32_t v602 = svadd_f32_x(svptrue_b32(), v583, v589); + svfloat32_t v604 = svsub_f32_x(svptrue_b32(), v583, v586); + svfloat32_t v606 = svadd_f32_x(svptrue_b32(), v583, v584); + svfloat32_t v609 = svadd_f32_x(svptrue_b32(), v608, v596); + svfloat32_t v611 = svsub_f32_x(svptrue_b32(), v610, v474); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v612, v595); + svfloat32_t v615 = svsub_f32_x(svptrue_b32(), v614, v474); + svfloat32_t v618 = svadd_f32_x(svptrue_b32(), v617, v595); + svst1w_u64(pred_full, (unsigned *)(v1054), svreinterpret_u64_s16(v632)); + svst1w_u64(pred_full, (unsigned *)(v1063), svreinterpret_u64_s16(v640)); + svfloat32_t v392 = svsub_f32_x(svptrue_b32(), v391, v381); + svfloat32_t v394 = svadd_f32_x(svptrue_b32(), v393, v382); + svfloat32_t v396 = svsub_f32_x(svptrue_b32(), v395, v382); + svfloat32_t v398 = svsub_f32_x(svptrue_b32(), v397, v378); + svfloat32_t v410 = svsub_f32_x(svptrue_b32(), v409, v265); + svfloat32_t v412 = svadd_f32_x(svptrue_b32(), v390, v400); + svfloat32_t v419 = svsub_f32_x(svptrue_b32(), v390, v400); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v600, v590); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v602, v591); + svfloat32_t v605 = svsub_f32_x(svptrue_b32(), v604, v591); + svfloat32_t v607 = svsub_f32_x(svptrue_b32(), v606, v587); + svfloat32_t v619 = svsub_f32_x(svptrue_b32(), v618, v474); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v599, v609); + svfloat32_t v628 = svsub_f32_x(svptrue_b32(), v599, v609); + svfloat32_t v411 = svadd_f32_x(svptrue_b32(), v398, v410); + svfloat32_t v413 = 
svadd_f32_x(svptrue_b32(), v392, v402); + svfloat32_t v414 = svsub_f32_x(svptrue_b32(), v394, v404); + svfloat32_t v415 = svadd_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v416 = svsub_f32_x(svptrue_b32(), v396, v406); + svfloat32_t v417 = svadd_f32_x(svptrue_b32(), v394, v404); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v392, v402); + svfloat32_t v420 = svsub_f32_x(svptrue_b32(), v398, v410); + svfloat32_t v620 = svadd_f32_x(svptrue_b32(), v607, v619); + svfloat32_t v622 = svadd_f32_x(svptrue_b32(), v601, v611); + svfloat32_t v623 = svsub_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v624 = svadd_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v625 = svsub_f32_x(svptrue_b32(), v605, v615); + svfloat32_t v626 = svadd_f32_x(svptrue_b32(), v603, v613); + svfloat32_t v627 = svsub_f32_x(svptrue_b32(), v601, v611); + svfloat32_t v629 = svsub_f32_x(svptrue_b32(), v607, v619); + svint16_t v664 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v419, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v672 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v628, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v776 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v412, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v784 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v621, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v648 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v420, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v656 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v629, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v680 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v418, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v688 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v627, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v696 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v417, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v704 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v626, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v712 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v416, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v720 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v625, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v728 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v415, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v736 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, 
svmul_n_f32_x(pred_full, v624, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v744 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v414, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v752 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v623, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v760 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v413, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v768 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v622, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v792 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v411, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v800 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v620, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1090), svreinterpret_u64_s16(v664)); + svst1w_u64(pred_full, (unsigned *)(v1099), svreinterpret_u64_s16(v672)); + svst1w_u64(pred_full, (unsigned *)(v1216), svreinterpret_u64_s16(v776)); + svst1w_u64(pred_full, (unsigned *)(v1225), svreinterpret_u64_s16(v784)); + svst1w_u64(pred_full, (unsigned *)(v1072), svreinterpret_u64_s16(v648)); + svst1w_u64(pred_full, (unsigned *)(v1081), svreinterpret_u64_s16(v656)); + svst1w_u64(pred_full, 
(unsigned *)(v1108), svreinterpret_u64_s16(v680)); + svst1w_u64(pred_full, (unsigned *)(v1117), svreinterpret_u64_s16(v688)); + svst1w_u64(pred_full, (unsigned *)(v1126), svreinterpret_u64_s16(v696)); + svst1w_u64(pred_full, (unsigned *)(v1135), svreinterpret_u64_s16(v704)); + svst1w_u64(pred_full, (unsigned *)(v1144), svreinterpret_u64_s16(v712)); + svst1w_u64(pred_full, (unsigned *)(v1153), svreinterpret_u64_s16(v720)); + svst1w_u64(pred_full, (unsigned *)(v1162), svreinterpret_u64_s16(v728)); + svst1w_u64(pred_full, (unsigned *)(v1171), svreinterpret_u64_s16(v736)); + svst1w_u64(pred_full, (unsigned *)(v1180), svreinterpret_u64_s16(v744)); + svst1w_u64(pred_full, (unsigned *)(v1189), svreinterpret_u64_s16(v752)); + svst1w_u64(pred_full, (unsigned *)(v1198), svreinterpret_u64_s16(v760)); + svst1w_u64(pred_full, (unsigned *)(v1207), svreinterpret_u64_s16(v768)); + svst1w_u64(pred_full, (unsigned *)(v1234), svreinterpret_u64_s16(v792)); + svst1w_u64(pred_full, (unsigned *)(v1243), svreinterpret_u64_s16(v800)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE /* armral_fft_cs16_cf32_cs16_ac_n_uun24: Neon variant, built only when ARMRAL_ARCH_SVE is not defined. Reads 24 elements x[k * istride] (k = 0..23), each accessed as one 32-bit word, converts them to float, combines them through the straight-line add/sub/multiply network below, and writes 24 results to y[k * ostride] (k = 0..23) after converting back to saturated 16-bit fixed point. dir is multiplied into the constant pairs that scale the lane-swapped (vrev64_f32) values. howmany is not referenced in this body and there is no loop. NOTE(review): the name and the 24 loads/stores suggest a single 24-point complex FFT on Q15 samples - confirm against the kernel generator and the definition of vld1s_s16, which is not visible here. */ +void armral_fft_cs16_cf32_cs16_ac_n_uun24( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; /* x and y are indexed through int32_t pointers, so istride/ostride step in 32-bit units; presumably one armral_cmplx_int16_t each - TODO confirm the struct layout */ + float v216 = 1.0000000000000000e+00F; + float v217 = -1.0000000000000000e+00F; + float v224 = -7.0710678118654746e-01F; + float v231 = 7.0710678118654757e-01F; + float v283 = -1.4999999999999998e+00F; + float v284 = 1.4999999999999998e+00F; + float v291 = 1.0606601717798210e+00F; + float v298 = -1.0606601717798212e+00F; + float v352 = 8.6602540378443871e-01F; + float v360 = -8.6602540378443871e-01F; + float v367 = 6.1237243569579458e-01F; + float v368 = -6.1237243569579458e-01F; /* the constants above are +/-1, +/-sqrt(2)/2, +/-3/2, +/-3*sqrt(2)/4, +/-sqrt(3)/2 and +/-sqrt(6)/4 */ + int16x4_t v27 = vld1s_s16(&v5[0]); + int16x4_t v82 = vld1s_s16(&v5[istride]); + float32x2_t v218 = (float32x2_t){v216, v217}; + float32x2_t v225 = (float32x2_t){v231, v224}; +
float32x2_t v232 = (float32x2_t){v231, v231}; + float32x2_t v281 = (float32x2_t){v283, v283}; + float32x2_t v285 = (float32x2_t){v283, v284}; + float32x2_t v292 = (float32x2_t){v298, v291}; + float32x2_t v299 = (float32x2_t){v298, v298}; + float32x2_t v354 = (float32x2_t){v352, v360}; + float32x2_t v361 = (float32x2_t){v360, v360}; + float32x2_t v365 = (float32x2_t){v368, v368}; + float32x2_t v369 = (float32x2_t){v367, v368}; + float32x2_t v370 = (float32x2_t){v4, v4}; /* dir duplicated into both lanes; multiplied into the mixed-sign constant pairs further down */ + int16x4_t v13 = vld1s_s16(&v5[istride * 8]); + int16x4_t v19 = vld1s_s16(&v5[istride * 16]); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); /* widen int16 to int32, keep the low two lanes, convert from fixed point with 15 fractional bits to float */ + int16x4_t v34 = vld1s_s16(&v5[istride * 11]); + int16x4_t v40 = vld1s_s16(&v5[istride * 19]); + int16x4_t v48 = vld1s_s16(&v5[istride * 3]); + int16x4_t v55 = vld1s_s16(&v5[istride * 14]); + int16x4_t v61 = vld1s_s16(&v5[istride * 22]); + int16x4_t v69 = vld1s_s16(&v5[istride * 6]); + int16x4_t v76 = vld1s_s16(&v5[istride * 17]); + float32x2_t v83 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v82)), 15); + int16x4_t v90 = vld1s_s16(&v5[istride * 9]); + int16x4_t v97 = vld1s_s16(&v5[istride * 20]); + int16x4_t v103 = vld1s_s16(&v5[istride * 4]); + int16x4_t v111 = vld1s_s16(&v5[istride * 12]); + int16x4_t v118 = vld1s_s16(&v5[istride * 23]); + int16x4_t v124 = vld1s_s16(&v5[istride * 7]); + int16x4_t v132 = vld1s_s16(&v5[istride * 15]); + int16x4_t v139 = vld1s_s16(&v5[istride * 2]); + int16x4_t v145 = vld1s_s16(&v5[istride * 10]); + int16x4_t v153 = vld1s_s16(&v5[istride * 18]); + int16x4_t v160 = vld1s_s16(&v5[istride * 5]); + int16x4_t v166 = vld1s_s16(&v5[istride * 13]); + int16x4_t v174 = vld1s_s16(&v5[istride * 21]); + float32x2_t v220 = vmul_f32(v370, v218); + float32x2_t v227 = vmul_f32(v370, v225); + float32x2_t v287 = vmul_f32(v370, v285); + float32x2_t v294 = vmul_f32(v370, v292); + float32x2_t v356 = vmul_f32(v370, v354); + float32x2_t v371 = vmul_f32(v370, v369); + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 
15); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v35 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v34)), 15); + float32x2_t v41 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v40)), 15); + float32x2_t v49 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v48)), 15); + float32x2_t v56 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v55)), 15); + float32x2_t v62 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v61)), 15); + float32x2_t v70 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v69)), 15); + float32x2_t v77 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v76)), 15); + float32x2_t v91 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v90)), 15); + float32x2_t v98 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v97)), 15); + float32x2_t v104 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v103)), 15); + float32x2_t v112 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v111)), 15); + float32x2_t v119 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v118)), 15); + float32x2_t v125 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v124)), 15); + float32x2_t v133 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v132)), 15); + float32x2_t v140 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v139)), 15); + float32x2_t v146 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v145)), 15); + float32x2_t v154 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v153)), 15); + float32x2_t v161 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v160)), 15); + float32x2_t v167 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v166)), 15); + float32x2_t v175 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v174)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v42 = vadd_f32(v35, v41); + float32x2_t v43 = vsub_f32(v35, v41); + float32x2_t v63 = vadd_f32(v56, v62); + float32x2_t v64 = vsub_f32(v56, v62); + float32x2_t v84 = vadd_f32(v77, v83); + float32x2_t v85 = vsub_f32(v77, v83); + float32x2_t v105 = vadd_f32(v98, v104); + float32x2_t v106 = vsub_f32(v98, v104); + float32x2_t v126 = vadd_f32(v119, v125); + float32x2_t v127 = vsub_f32(v119, v125); + 
float32x2_t v147 = vadd_f32(v140, v146); + float32x2_t v148 = vsub_f32(v140, v146); + float32x2_t v168 = vadd_f32(v161, v167); + float32x2_t v169 = vsub_f32(v161, v167); + float32x2_t v29 = vadd_f32(v21, v28); + float32x2_t v50 = vadd_f32(v42, v49); + float32x2_t v71 = vadd_f32(v63, v70); + float32x2_t v92 = vadd_f32(v84, v91); + float32x2_t v113 = vadd_f32(v105, v112); + float32x2_t v134 = vadd_f32(v126, v133); + float32x2_t v155 = vadd_f32(v147, v154); + float32x2_t v176 = vadd_f32(v168, v175); + float32x2_t v244 = vadd_f32(v21, v105); + float32x2_t v245 = vsub_f32(v21, v105); + float32x2_t v246 = vadd_f32(v63, v147); + float32x2_t v247 = vsub_f32(v63, v147); + float32x2_t v248 = vadd_f32(v42, v126); + float32x2_t v249 = vsub_f32(v42, v126); + float32x2_t v250 = vadd_f32(v84, v168); + float32x2_t v251 = vsub_f32(v84, v168); + float32x2_t v311 = vadd_f32(v22, v106); + float32x2_t v312 = vsub_f32(v22, v106); + float32x2_t v313 = vadd_f32(v64, v148); + float32x2_t v314 = vsub_f32(v64, v148); + float32x2_t v315 = vadd_f32(v43, v127); + float32x2_t v316 = vsub_f32(v43, v127); + float32x2_t v317 = vadd_f32(v85, v169); + float32x2_t v318 = vsub_f32(v85, v169); + float32x2_t v177 = vadd_f32(v29, v113); + float32x2_t v178 = vsub_f32(v29, v113); + float32x2_t v179 = vadd_f32(v71, v155); + float32x2_t v180 = vsub_f32(v71, v155); + float32x2_t v181 = vadd_f32(v50, v134); + float32x2_t v182 = vsub_f32(v50, v134); + float32x2_t v183 = vadd_f32(v92, v176); + float32x2_t v184 = vsub_f32(v92, v176); + float32x2_t v252 = vadd_f32(v244, v246); + float32x2_t v253 = vsub_f32(v244, v246); + float32x2_t v254 = vadd_f32(v248, v250); + float32x2_t v255 = vsub_f32(v248, v250); + float32x2_t v258 = vadd_f32(v249, v251); + float32x2_t v259 = vsub_f32(v249, v251); + float32x2_t v282 = vmul_f32(v245, v281); + float32x2_t v288 = vrev64_f32(v247); + float32x2_t v319 = vadd_f32(v311, v313); + float32x2_t v320 = vsub_f32(v311, v313); + float32x2_t v321 = vadd_f32(v315, v317); + float32x2_t v322 = 
vsub_f32(v315, v317); + float32x2_t v325 = vadd_f32(v316, v318); + float32x2_t v326 = vsub_f32(v316, v318); + float32x2_t v357 = vrev64_f32(v312); + float32x2_t v362 = vmul_f32(v314, v361); + float32x2_t v185 = vadd_f32(v177, v179); + float32x2_t v186 = vsub_f32(v177, v179); + float32x2_t v187 = vadd_f32(v181, v183); + float32x2_t v188 = vsub_f32(v181, v183); + float32x2_t v191 = vadd_f32(v182, v184); + float32x2_t v192 = vsub_f32(v182, v184); + float32x2_t v221 = vrev64_f32(v180); + float32x2_t v256 = vadd_f32(v252, v254); + float32x2_t v257 = vsub_f32(v252, v254); + float32x2_t v271 = vmul_f32(v253, v281); + float32x2_t v277 = vrev64_f32(v255); + float32x2_t v289 = vmul_f32(v288, v287); + float32x2_t v295 = vrev64_f32(v258); + float32x2_t v300 = vmul_f32(v259, v299); + float32x2_t v323 = vadd_f32(v319, v321); + float32x2_t v324 = vsub_f32(v319, v321); + float32x2_t v346 = vrev64_f32(v320); + float32x2_t v351 = vmul_f32(v322, v361); + float32x2_t v358 = vmul_f32(v357, v356); + float32x2_t v366 = vmul_f32(v325, v365); + float32x2_t v372 = vrev64_f32(v326); + float32x2_t v189 = vadd_f32(v185, v187); + float32x2_t v190 = vsub_f32(v185, v187); + float32x2_t v210 = vrev64_f32(v188); + float32x2_t v222 = vmul_f32(v221, v220); + float32x2_t v228 = vrev64_f32(v191); + float32x2_t v233 = vmul_f32(v192, v232); + float32x2_t v263 = vmul_f32(v256, v281); + float32x2_t v267 = vmul_f32(v257, v281); + float32x2_t v278 = vmul_f32(v277, v287); + float32x2_t v296 = vmul_f32(v295, v294); + float32x2_t v303 = vadd_f32(v282, v300); + float32x2_t v304 = vsub_f32(v282, v300); + float32x2_t v332 = vrev64_f32(v323); + float32x2_t v339 = vrev64_f32(v324); + float32x2_t v347 = vmul_f32(v346, v356); + float32x2_t v373 = vmul_f32(v372, v371); + float32x2_t v378 = vadd_f32(v362, v366); + float32x2_t v379 = vsub_f32(v362, v366); + float32x2_t v211 = vmul_f32(v210, v220); + float32x2_t v229 = vmul_f32(v228, v227); + float32x2_t v236 = vadd_f32(v178, v233); + float32x2_t v237 = vsub_f32(v178, 
v233); + float32x2_t v301 = vadd_f32(v271, v278); + float32x2_t v302 = vsub_f32(v271, v278); + float32x2_t v305 = vadd_f32(v289, v296); + float32x2_t v306 = vsub_f32(v289, v296); + float32x2_t v333 = vmul_f32(v332, v356); + float32x2_t v340 = vmul_f32(v339, v356); + float32x2_t v374 = vadd_f32(v347, v351); + float32x2_t v375 = vsub_f32(v347, v351); + float32x2_t v376 = vadd_f32(v358, v373); + float32x2_t v377 = vsub_f32(v358, v373); + float32x2_t v384 = vadd_f32(v189, v263); + int16x4_t v389 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v189, 15), (int32x2_t){0, 0})); /* float to fixed point with 15 fractional bits, padded with zeros to four lanes, then saturating narrow to int16 */ + float32x2_t v468 = vadd_f32(v190, v267); + int16x4_t v473 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v190, 15), (int32x2_t){0, 0})); + float32x2_t v234 = vadd_f32(v186, v211); + float32x2_t v235 = vsub_f32(v186, v211); + float32x2_t v238 = vadd_f32(v222, v229); + float32x2_t v239 = vsub_f32(v222, v229); + float32x2_t v307 = vadd_f32(v303, v305); + float32x2_t v308 = vsub_f32(v303, v305); + float32x2_t v309 = vadd_f32(v304, v306); + float32x2_t v310 = vsub_f32(v304, v306); + float32x2_t v380 = vadd_f32(v376, v378); + float32x2_t v381 = vsub_f32(v376, v378); + float32x2_t v382 = vadd_f32(v377, v379); + float32x2_t v383 = vsub_f32(v377, v379); + float32x2_t v385 = vadd_f32(v384, v333); + float32x2_t v386 = vsub_f32(v384, v333); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v389), 0); /* 32-bit lane 0 carries the two narrowed int16 values; it is stored as one 32-bit element of y */ + float32x2_t v469 = vadd_f32(v468, v340); + float32x2_t v470 = vsub_f32(v468, v340); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v473), 0); + float32x2_t v240 = vadd_f32(v236, v238); + float32x2_t v241 = vsub_f32(v236, v238); + float32x2_t v242 = vadd_f32(v237, v239); + float32x2_t v243 = vsub_f32(v237, v239); + int16x4_t v395 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v386, 15), (int32x2_t){0, 0})); + int16x4_t v401 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v385, 15), (int32x2_t){0, 0})); + float32x2_t v426 = vadd_f32(v235, v302); + int16x4_t v431 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v235, 15), 
(int32x2_t){0, 0})); + int16x4_t v479 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v470, 15), (int32x2_t){0, 0})); + int16x4_t v485 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v469, 15), (int32x2_t){0, 0})); + float32x2_t v510 = vadd_f32(v234, v301); + int16x4_t v515 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v234, 15), (int32x2_t){0, 0})); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v395), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v401), 0); + float32x2_t v405 = vadd_f32(v241, v308); + int16x4_t v410 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v241, 15), (int32x2_t){0, 0})); + float32x2_t v427 = vadd_f32(v426, v375); + float32x2_t v428 = vsub_f32(v426, v375); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v431), 0); + float32x2_t v447 = vadd_f32(v242, v309); + int16x4_t v452 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v242, 15), (int32x2_t){0, 0})); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v479), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v485), 0); + float32x2_t v489 = vadd_f32(v243, v310); + int16x4_t v494 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v243, 15), (int32x2_t){0, 0})); + float32x2_t v511 = vadd_f32(v510, v374); + float32x2_t v512 = vsub_f32(v510, v374); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v515), 0); + float32x2_t v531 = vadd_f32(v240, v307); + int16x4_t v536 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v240, 15), (int32x2_t){0, 0})); + float32x2_t v406 = vadd_f32(v405, v381); + float32x2_t v407 = vsub_f32(v405, v381); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v410), 0); + int16x4_t v437 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v428, 15), (int32x2_t){0, 0})); + int16x4_t v443 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v427, 15), (int32x2_t){0, 0})); + float32x2_t v448 = vadd_f32(v447, v382); + float32x2_t v449 = vsub_f32(v447, v382); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v452), 0); + float32x2_t v490 = vadd_f32(v489, v383); + float32x2_t 
v491 = vsub_f32(v489, v383); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v494), 0); + int16x4_t v521 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v512, 15), (int32x2_t){0, 0})); + int16x4_t v527 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v511, 15), (int32x2_t){0, 0})); + float32x2_t v532 = vadd_f32(v531, v380); + float32x2_t v533 = vsub_f32(v531, v380); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v536), 0); + int16x4_t v416 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v407, 15), (int32x2_t){0, 0})); + int16x4_t v422 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v406, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v437), 0); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v443), 0); + int16x4_t v458 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v449, 15), (int32x2_t){0, 0})); + int16x4_t v464 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v448, 15), (int32x2_t){0, 0})); + int16x4_t v500 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v491, 15), (int32x2_t){0, 0})); + int16x4_t v506 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v490, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v521), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v527), 0); + int16x4_t v542 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v533, 15), (int32x2_t){0, 0})); + int16x4_t v548 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v532, 15), (int32x2_t){0, 0})); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v416), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v422), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v458), 0); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v464), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v500), 0); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v506), 0); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v542), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v548), 0); +} +#endif + +#ifdef 
ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun24( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v274 = -1.0000000000000000e+00F; + float v281 = -7.0710678118654746e-01F; + float v288 = 7.0710678118654757e-01F; + float v341 = -1.4999999999999998e+00F; + float v346 = 1.4999999999999998e+00F; + float v353 = 1.0606601717798210e+00F; + float v360 = -1.0606601717798212e+00F; + float v424 = -8.6602540378443871e-01F; + float v434 = -6.1237243569579458e-01F; + const int32_t *v763 = &v5[v0]; + int32_t *v950 = &v6[v2]; + int64_t v15 = v0 * 8; + int64_t v23 = v0 * 16; + int64_t v42 = v0 * 11; + int64_t v50 = v0 * 19; + int64_t v60 = v0 * 3; + int64_t v69 = v0 * 14; + int64_t v77 = v0 * 22; + int64_t v87 = v0 * 6; + int64_t v96 = v0 * 17; + int64_t v114 = v0 * 9; + int64_t v123 = v0 * 20; + int64_t v131 = v0 * 4; + int64_t v141 = v0 * 12; + int64_t v150 = v0 * 23; + int64_t v158 = v0 * 7; + int64_t v168 = v0 * 15; + int64_t v177 = v0 * 2; + int64_t v185 = v0 * 10; + int64_t v195 = v0 * 18; + int64_t v204 = v0 * 5; + int64_t v212 = v0 * 13; + int64_t v222 = v0 * 21; + float v277 = v4 * v274; + float v284 = v4 * v281; + float v349 = v4 * v346; + float v356 = v4 * v353; + float v420 = v4 * v424; + float v437 = v4 * v434; + int64_t v462 = v2 * 16; + int64_t v470 = v2 * 8; + int64_t v481 = v2 * 9; + int64_t v497 = v2 * 17; + int64_t v508 = v2 * 18; + int64_t v516 = v2 * 10; + int64_t v524 = v2 * 2; + int64_t v535 = v2 * 3; + int64_t v543 = v2 * 19; + int64_t v551 = v2 * 11; + int64_t v562 = v2 * 12; + int64_t v570 = v2 * 4; + int64_t v578 = v2 * 20; + int64_t v589 = v2 * 21; + int64_t v597 = v2 * 13; + int64_t v605 = v2 * 5; + int64_t v616 = v2 * 6; + int64_t v624 = v2 * 22; + int64_t v632 = v2 * 14; 
+ int64_t v643 = v2 * 15; + int64_t v651 = v2 * 7; + int64_t v659 = v2 * 23; + const int32_t *v691 = &v5[0]; + svfloat32_t v890 = svdup_n_f32(v288); + svfloat32_t v895 = svdup_n_f32(v341); + svfloat32_t v898 = svdup_n_f32(v360); + svfloat32_t v904 = svdup_n_f32(v424); + svfloat32_t v905 = svdup_n_f32(v434); + int32_t *v914 = &v6[0]; + svfloat32_t v110 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v763[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v672 = &v5[v15]; + const int32_t *v681 = &v5[v23]; + const int32_t *v700 = &v5[v42]; + const int32_t *v709 = &v5[v50]; + const int32_t *v718 = &v5[v60]; + const int32_t *v727 = &v5[v69]; + const int32_t *v736 = &v5[v77]; + const int32_t *v745 = &v5[v87]; + const int32_t *v754 = &v5[v96]; + const int32_t *v772 = &v5[v114]; + const int32_t *v781 = &v5[v123]; + const int32_t *v790 = &v5[v131]; + const int32_t *v799 = &v5[v141]; + const int32_t *v808 = &v5[v150]; + const int32_t *v817 = &v5[v158]; + const int32_t *v826 = &v5[v168]; + const int32_t *v835 = &v5[v177]; + const int32_t *v844 = &v5[v185]; + const int32_t *v853 = &v5[v195]; + const int32_t *v862 = &v5[v204]; + const int32_t *v871 = &v5[v212]; + const int32_t *v880 = &v5[v222]; + svfloat32_t v888 = svdup_n_f32(v277); + svfloat32_t v889 = svdup_n_f32(v284); + svfloat32_t v896 = svdup_n_f32(v349); + svfloat32_t v897 = svdup_n_f32(v356); + svfloat32_t v903 = svdup_n_f32(v420); + svfloat32_t v906 = svdup_n_f32(v437); + int32_t *v923 = &v6[v462]; + int32_t *v932 = &v6[v470]; + int32_t *v941 = &v6[v481]; + int32_t *v959 = &v6[v497]; + int32_t *v968 = &v6[v508]; + int32_t *v977 = &v6[v516]; + int32_t *v986 = &v6[v524]; + int32_t *v995 = &v6[v535]; + int32_t *v1004 = &v6[v543]; + int32_t *v1013 = &v6[v551]; + int32_t *v1022 = &v6[v562]; + int32_t *v1031 = &v6[v570]; + int32_t *v1040 = &v6[v578]; + int32_t *v1049 = &v6[v589]; + int32_t *v1058 = &v6[v597]; + int32_t *v1067 = &v6[v605]; + int32_t *v1076 = &v6[v616]; + 
int32_t *v1085 = &v6[v624]; + int32_t *v1094 = &v6[v632]; + int32_t *v1103 = &v6[v643]; + int32_t *v1112 = &v6[v651]; + int32_t *v1121 = &v6[v659]; + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v691[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v672[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v681[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v48 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v700[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v56 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v709[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v66 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v718[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v75 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v727[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v83 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v736[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v93 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v745[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v102 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v754[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v120 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v772[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v129 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t 
*)&v781[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v137 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v790[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v147 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v799[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v156 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v808[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v164 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v817[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v174 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v826[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v183 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v835[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v191 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v844[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v201 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v853[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v210 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v862[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v218 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v871[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v228 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v880[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v48, v56); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), 
v48, v56); + svfloat32_t v84 = svadd_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v85 = svsub_f32_x(svptrue_b32(), v75, v83); + svfloat32_t v111 = svadd_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v112 = svsub_f32_x(svptrue_b32(), v102, v110); + svfloat32_t v138 = svadd_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v139 = svsub_f32_x(svptrue_b32(), v129, v137); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v156, v164); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v156, v164); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v183, v191); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v183, v191); + svfloat32_t v219 = svadd_f32_x(svptrue_b32(), v210, v218); + svfloat32_t v220 = svsub_f32_x(svptrue_b32(), v210, v218); + svfloat32_t v40 = svadd_f32_x(svptrue_b32(), v30, v39); + svfloat32_t v67 = svadd_f32_x(svptrue_b32(), v57, v66); + svfloat32_t v94 = svadd_f32_x(svptrue_b32(), v84, v93); + svfloat32_t v121 = svadd_f32_x(svptrue_b32(), v111, v120); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v138, v147); + svfloat32_t v175 = svadd_f32_x(svptrue_b32(), v165, v174); + svfloat32_t v202 = svadd_f32_x(svptrue_b32(), v192, v201); + svfloat32_t v229 = svadd_f32_x(svptrue_b32(), v219, v228); + svfloat32_t v302 = svadd_f32_x(svptrue_b32(), v30, v138); + svfloat32_t v303 = svsub_f32_x(svptrue_b32(), v30, v138); + svfloat32_t v304 = svadd_f32_x(svptrue_b32(), v84, v192); + svfloat32_t v305 = svsub_f32_x(svptrue_b32(), v84, v192); + svfloat32_t v306 = svadd_f32_x(svptrue_b32(), v57, v165); + svfloat32_t v307 = svsub_f32_x(svptrue_b32(), v57, v165); + svfloat32_t v308 = svadd_f32_x(svptrue_b32(), v111, v219); + svfloat32_t v309 = svsub_f32_x(svptrue_b32(), v111, v219); + svfloat32_t v374 = svadd_f32_x(svptrue_b32(), v31, v139); + svfloat32_t v375 = svsub_f32_x(svptrue_b32(), v31, v139); + svfloat32_t v376 = svadd_f32_x(svptrue_b32(), v85, v193); + svfloat32_t v377 = svsub_f32_x(svptrue_b32(), v85, v193); + svfloat32_t v378 = svadd_f32_x(svptrue_b32(), v58, v166); + 
svfloat32_t v379 = svsub_f32_x(svptrue_b32(), v58, v166); + svfloat32_t v380 = svadd_f32_x(svptrue_b32(), v112, v220); + svfloat32_t v381 = svsub_f32_x(svptrue_b32(), v112, v220); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v40, v148); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v40, v148); + svfloat32_t v232 = svadd_f32_x(svptrue_b32(), v94, v202); + svfloat32_t v233 = svsub_f32_x(svptrue_b32(), v94, v202); + svfloat32_t v234 = svadd_f32_x(svptrue_b32(), v67, v175); + svfloat32_t v235 = svsub_f32_x(svptrue_b32(), v67, v175); + svfloat32_t v236 = svadd_f32_x(svptrue_b32(), v121, v229); + svfloat32_t v237 = svsub_f32_x(svptrue_b32(), v121, v229); + svfloat32_t v310 = svadd_f32_x(svptrue_b32(), v302, v304); + svfloat32_t v311 = svsub_f32_x(svptrue_b32(), v302, v304); + svfloat32_t v312 = svadd_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v313 = svsub_f32_x(svptrue_b32(), v306, v308); + svfloat32_t v316 = svadd_f32_x(svptrue_b32(), v307, v309); + svfloat32_t v317 = svsub_f32_x(svptrue_b32(), v307, v309); + svfloat32_t zero351 = svdup_n_f32(0); + svfloat32_t v351 = svcmla_f32_x(pred_full, zero351, v896, v305, 90); + svfloat32_t v382 = svadd_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v383 = svsub_f32_x(svptrue_b32(), v374, v376); + svfloat32_t v384 = svadd_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v385 = svsub_f32_x(svptrue_b32(), v378, v380); + svfloat32_t v388 = svadd_f32_x(svptrue_b32(), v379, v381); + svfloat32_t v389 = svsub_f32_x(svptrue_b32(), v379, v381); + svfloat32_t zero422 = svdup_n_f32(0); + svfloat32_t v422 = svcmla_f32_x(pred_full, zero422, v903, v375, 90); + svfloat32_t v238 = svadd_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v239 = svsub_f32_x(svptrue_b32(), v230, v232); + svfloat32_t v240 = svadd_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v234, v236); + svfloat32_t v244 = svadd_f32_x(svptrue_b32(), v235, v237); + svfloat32_t v245 = svsub_f32_x(svptrue_b32(), v235, v237); + svfloat32_t 
zero279 = svdup_n_f32(0); + svfloat32_t v279 = svcmla_f32_x(pred_full, zero279, v888, v233, 90); + svfloat32_t v314 = svadd_f32_x(svptrue_b32(), v310, v312); + svfloat32_t v315 = svsub_f32_x(svptrue_b32(), v310, v312); + svfloat32_t zero339 = svdup_n_f32(0); + svfloat32_t v339 = svcmla_f32_x(pred_full, zero339, v896, v313, 90); + svfloat32_t zero358 = svdup_n_f32(0); + svfloat32_t v358 = svcmla_f32_x(pred_full, zero358, v897, v316, 90); + svfloat32_t v363 = svmul_f32_x(svptrue_b32(), v317, v898); + svfloat32_t v386 = svadd_f32_x(svptrue_b32(), v382, v384); + svfloat32_t v387 = svsub_f32_x(svptrue_b32(), v382, v384); + svfloat32_t zero410 = svdup_n_f32(0); + svfloat32_t v410 = svcmla_f32_x(pred_full, zero410, v903, v383, 90); + svfloat32_t v432 = svmul_f32_x(svptrue_b32(), v388, v905); + svfloat32_t zero439 = svdup_n_f32(0); + svfloat32_t v439 = svcmla_f32_x(pred_full, zero439, v906, v389, 90); + svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v238, v240); + svfloat32_t v243 = svsub_f32_x(svptrue_b32(), v238, v240); + svfloat32_t zero267 = svdup_n_f32(0); + svfloat32_t v267 = svcmla_f32_x(pred_full, zero267, v888, v241, 90); + svfloat32_t zero286 = svdup_n_f32(0); + svfloat32_t v286 = svcmla_f32_x(pred_full, zero286, v889, v244, 90); + svfloat32_t v364 = svmla_f32_x(pred_full, v339, v311, v895); + svfloat32_t v365 = svnmls_f32_x(pred_full, v339, v311, v895); + svfloat32_t v366 = svmla_f32_x(pred_full, v363, v303, v895); + svfloat32_t v367 = svnmls_f32_x(pred_full, v363, v303, v895); + svfloat32_t v368 = svadd_f32_x(svptrue_b32(), v351, v358); + svfloat32_t v369 = svsub_f32_x(svptrue_b32(), v351, v358); + svfloat32_t zero396 = svdup_n_f32(0); + svfloat32_t v396 = svcmla_f32_x(pred_full, zero396, v903, v386, 90); + svfloat32_t zero403 = svdup_n_f32(0); + svfloat32_t v403 = svcmla_f32_x(pred_full, zero403, v903, v387, 90); + svfloat32_t v440 = svmla_f32_x(pred_full, v410, v385, v904); + svfloat32_t v441 = svmls_f32_x(pred_full, v410, v385, v904); + svfloat32_t v442 = 
svadd_f32_x(svptrue_b32(), v422, v439); + svfloat32_t v443 = svsub_f32_x(svptrue_b32(), v422, v439); + svfloat32_t v444 = svmla_f32_x(pred_full, v432, v377, v904); + svfloat32_t v445 = svnmls_f32_x(pred_full, v432, v377, v904); + svfloat32_t v292 = svadd_f32_x(svptrue_b32(), v239, v267); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v239, v267); + svfloat32_t v294 = svmla_f32_x(pred_full, v231, v245, v890); + svfloat32_t v295 = svmls_f32_x(pred_full, v231, v245, v890); + svfloat32_t v296 = svadd_f32_x(svptrue_b32(), v279, v286); + svfloat32_t v297 = svsub_f32_x(svptrue_b32(), v279, v286); + svfloat32_t v370 = svadd_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v366, v368); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v373 = svsub_f32_x(svptrue_b32(), v367, v369); + svfloat32_t v446 = svadd_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v442, v444); + svfloat32_t v448 = svadd_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v449 = svsub_f32_x(svptrue_b32(), v443, v445); + svfloat32_t v450 = svmla_f32_x(pred_full, v242, v314, v895); + svint16_t v455 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v242, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v558 = svmla_f32_x(pred_full, v243, v315, v895); + svint16_t v563 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v243, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v298 = svadd_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v299 = svsub_f32_x(svptrue_b32(), v294, v296); + svfloat32_t v300 = svadd_f32_x(svptrue_b32(), v295, v297); + svfloat32_t v301 = svsub_f32_x(svptrue_b32(), v295, v297); + svfloat32_t v451 = svadd_f32_x(svptrue_b32(), v450, v396); + svfloat32_t v452 = 
svsub_f32_x(svptrue_b32(), v450, v396); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v293, v365); + svint16_t v509 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v293, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v559 = svadd_f32_x(svptrue_b32(), v558, v403); + svfloat32_t v560 = svsub_f32_x(svptrue_b32(), v558, v403); + svfloat32_t v612 = svadd_f32_x(svptrue_b32(), v292, v364); + svint16_t v617 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v292, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v914), svreinterpret_u64_s16(v455)); + svst1w_u64(pred_full, (unsigned *)(v1022), svreinterpret_u64_s16(v563)); + svint16_t v463 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v452, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v471 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v451, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v477 = svadd_f32_x(svptrue_b32(), v299, v371); + svint16_t v482 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v299, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v505 = svadd_f32_x(svptrue_b32(), v504, v441); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v504, v441); + svfloat32_t v531 = svadd_f32_x(svptrue_b32(), v300, v372); + svint16_t v536 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v300, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v571 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v560, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v579 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v559, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v585 = svadd_f32_x(svptrue_b32(), v301, v373); + svint16_t v590 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v301, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v613 = svadd_f32_x(svptrue_b32(), v612, v440); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v612, v440); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v298, v370); + svint16_t v644 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v298, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v968), svreinterpret_u64_s16(v509)); + svst1w_u64(pred_full, (unsigned *)(v1076), svreinterpret_u64_s16(v617)); + svfloat32_t v478 = svadd_f32_x(svptrue_b32(), v477, v447); + svfloat32_t v479 = svsub_f32_x(svptrue_b32(), v477, v447); + svint16_t v517 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v506, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v525 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v505, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v532 = 
svadd_f32_x(svptrue_b32(), v531, v448); + svfloat32_t v533 = svsub_f32_x(svptrue_b32(), v531, v448); + svfloat32_t v586 = svadd_f32_x(svptrue_b32(), v585, v449); + svfloat32_t v587 = svsub_f32_x(svptrue_b32(), v585, v449); + svint16_t v625 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v614, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v633 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v613, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v640 = svadd_f32_x(svptrue_b32(), v639, v446); + svfloat32_t v641 = svsub_f32_x(svptrue_b32(), v639, v446); + svst1w_u64(pred_full, (unsigned *)(v923), svreinterpret_u64_s16(v463)); + svst1w_u64(pred_full, (unsigned *)(v932), svreinterpret_u64_s16(v471)); + svst1w_u64(pred_full, (unsigned *)(v941), svreinterpret_u64_s16(v482)); + svst1w_u64(pred_full, (unsigned *)(v995), svreinterpret_u64_s16(v536)); + svst1w_u64(pred_full, (unsigned *)(v1031), svreinterpret_u64_s16(v571)); + svst1w_u64(pred_full, (unsigned *)(v1040), svreinterpret_u64_s16(v579)); + svst1w_u64(pred_full, (unsigned *)(v1049), svreinterpret_u64_s16(v590)); + svst1w_u64(pred_full, (unsigned *)(v1103), svreinterpret_u64_s16(v644)); + svint16_t v490 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v479, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v498 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v478, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v544 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v533, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v552 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v532, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v598 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v587, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v606 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v586, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v652 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v641, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v660 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v640, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v977), svreinterpret_u64_s16(v517)); + svst1w_u64(pred_full, (unsigned *)(v986), svreinterpret_u64_s16(v525)); + svst1w_u64(pred_full, (unsigned *)(v1085), svreinterpret_u64_s16(v625)); + svst1w_u64(pred_full, (unsigned *)(v1094), svreinterpret_u64_s16(v633)); + svst1w_u64(pred_full, (unsigned *)(v950), svreinterpret_u64_s16(v490)); + svst1w_u64(pred_full, (unsigned *)(v959), svreinterpret_u64_s16(v498)); + svst1w_u64(pred_full, (unsigned *)(v1004), svreinterpret_u64_s16(v544)); + svst1w_u64(pred_full, (unsigned *)(v1013), svreinterpret_u64_s16(v552)); + svst1w_u64(pred_full, (unsigned *)(v1058), svreinterpret_u64_s16(v598)); + svst1w_u64(pred_full, (unsigned *)(v1067), svreinterpret_u64_s16(v606)); + 
svst1w_u64(pred_full, (unsigned *)(v1112), svreinterpret_u64_s16(v652)); + svst1w_u64(pred_full, (unsigned *)(v1121), svreinterpret_u64_s16(v660)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun25( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v874 = 9.6858316112863108e-01F; + float v877 = -2.4868988716485479e-01F; + float v878 = 2.4868988716485479e-01F; + float v1018 = 8.7630668004386358e-01F; + float v1021 = -4.8175367410171532e-01F; + float v1022 = 4.8175367410171532e-01F; + float v1162 = 7.2896862742141155e-01F; + float v1165 = -6.8454710592868862e-01F; + float v1166 = 6.8454710592868862e-01F; + float v1174 = 6.2790519529313527e-02F; + float v1177 = -9.9802672842827156e-01F; + float v1178 = 9.9802672842827156e-01F; + float v1306 = 5.3582679497899655e-01F; + float v1309 = -8.4432792550201508e-01F; + float v1310 = 8.4432792550201508e-01F; + float v1318 = -4.2577929156507272e-01F; + float v1321 = -9.0482705246601947e-01F; + float v1322 = 9.0482705246601947e-01F; + float v1330 = -6.3742398974868952e-01F; + float v1333 = 7.7051324277578936e-01F; + float v1334 = -7.7051324277578936e-01F; + float v1348 = -9.9211470131447776e-01F; + float v1351 = -1.2533323356430454e-01F; + float v1352 = 1.2533323356430454e-01F; + float v1368 = 2.5000000000000000e-01F; + float v1378 = 5.5901699437494745e-01F; + float v1388 = 6.1803398874989490e-01F; + float v1413 = 9.5105651629515353e-01F; + float v1414 = -9.5105651629515353e-01F; + float v1439 = 2.0000000000000000e+00F; + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v157 = vld1s_s16(&v5[istride]); + float32x2_t v875 = (float32x2_t){v874, v874}; + float32x2_t v879 = (float32x2_t){v877, v878}; + float32x2_t v1019 = (float32x2_t){v1018, v1018}; + float32x2_t v1023 = (float32x2_t){v1021, v1022}; + float32x2_t v1163 = 
(float32x2_t){v1162, v1162}; + float32x2_t v1167 = (float32x2_t){v1165, v1166}; + float32x2_t v1175 = (float32x2_t){v1174, v1174}; + float32x2_t v1179 = (float32x2_t){v1177, v1178}; + float32x2_t v1209 = (float32x2_t){v1334, v1333}; + float32x2_t v1307 = (float32x2_t){v1306, v1306}; + float32x2_t v1311 = (float32x2_t){v1309, v1310}; + float32x2_t v1319 = (float32x2_t){v1318, v1318}; + float32x2_t v1323 = (float32x2_t){v1321, v1322}; + float32x2_t v1331 = (float32x2_t){v1330, v1330}; + float32x2_t v1335 = (float32x2_t){v1333, v1334}; + float32x2_t v1349 = (float32x2_t){v1348, v1348}; + float32x2_t v1353 = (float32x2_t){v1351, v1352}; + float32x2_t v1369 = (float32x2_t){v1368, v1368}; + float32x2_t v1379 = (float32x2_t){v1378, v1378}; + float32x2_t v1389 = (float32x2_t){v1388, v1388}; + float32x2_t v1415 = (float32x2_t){v1413, v1414}; + float32x2_t v1416 = (float32x2_t){v4, v4}; + float32x2_t v1440 = (float32x2_t){v1439, v1439}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 5]); + int16x4_t v25 = vld1s_s16(&v5[istride * 10]); + int16x4_t v31 = vld1s_s16(&v5[istride * 15]); + int16x4_t v37 = vld1s_s16(&v5[istride * 20]); + float32x2_t v158 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v157)), 15); + int16x4_t v163 = vld1s_s16(&v5[istride * 6]); + int16x4_t v169 = vld1s_s16(&v5[istride * 11]); + int16x4_t v175 = vld1s_s16(&v5[istride * 16]); + int16x4_t v181 = vld1s_s16(&v5[istride * 21]); + int16x4_t v301 = vld1s_s16(&v5[istride * 2]); + int16x4_t v307 = vld1s_s16(&v5[istride * 7]); + int16x4_t v313 = vld1s_s16(&v5[istride * 12]); + int16x4_t v319 = vld1s_s16(&v5[istride * 17]); + int16x4_t v325 = vld1s_s16(&v5[istride * 22]); + int16x4_t v445 = vld1s_s16(&v5[istride * 3]); + int16x4_t v451 = vld1s_s16(&v5[istride * 8]); + int16x4_t v457 = vld1s_s16(&v5[istride * 13]); + int16x4_t v463 = vld1s_s16(&v5[istride * 18]); + int16x4_t v469 = vld1s_s16(&v5[istride * 23]); + int16x4_t v589 = vld1s_s16(&v5[istride * 
4]); + int16x4_t v595 = vld1s_s16(&v5[istride * 9]); + int16x4_t v601 = vld1s_s16(&v5[istride * 14]); + int16x4_t v607 = vld1s_s16(&v5[istride * 19]); + int16x4_t v613 = vld1s_s16(&v5[istride * 24]); + float32x2_t v881 = vmul_f32(v1416, v879); + float32x2_t v1025 = vmul_f32(v1416, v1023); + float32x2_t v1169 = vmul_f32(v1416, v1167); + float32x2_t v1181 = vmul_f32(v1416, v1179); + float32x2_t v1211 = vmul_f32(v1416, v1209); + float32x2_t v1313 = vmul_f32(v1416, v1311); + float32x2_t v1325 = vmul_f32(v1416, v1323); + float32x2_t v1337 = vmul_f32(v1416, v1335); + float32x2_t v1355 = vmul_f32(v1416, v1353); + float32x2_t v1417 = vmul_f32(v1416, v1415); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v26 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v25)), 15); + float32x2_t v32 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v31)), 15); + float32x2_t v38 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v37)), 15); + float32x2_t v164 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v163)), 15); + float32x2_t v170 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v169)), 15); + float32x2_t v176 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v175)), 15); + float32x2_t v182 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v181)), 15); + float32x2_t v302 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v301)), 15); + float32x2_t v308 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v307)), 15); + float32x2_t v314 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v313)), 15); + float32x2_t v320 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v319)), 15); + float32x2_t v326 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v325)), 15); + float32x2_t v446 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v445)), 15); + float32x2_t v452 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v451)), 15); + float32x2_t v458 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v457)), 15); + float32x2_t v464 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v463)), 15); + float32x2_t v470 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v469)), 15); + float32x2_t v590 = 
vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v589)), 15); + float32x2_t v596 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v595)), 15); + float32x2_t v602 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v601)), 15); + float32x2_t v608 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v607)), 15); + float32x2_t v614 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v613)), 15); + float32x2_t v75 = vsub_f32(v20, v38); + float32x2_t v79 = vmul_f32(v20, v1440); + float32x2_t v93 = vsub_f32(v26, v32); + float32x2_t v97 = vmul_f32(v26, v1440); + float32x2_t v219 = vsub_f32(v164, v182); + float32x2_t v223 = vmul_f32(v164, v1440); + float32x2_t v237 = vsub_f32(v170, v176); + float32x2_t v241 = vmul_f32(v170, v1440); + float32x2_t v363 = vsub_f32(v308, v326); + float32x2_t v367 = vmul_f32(v308, v1440); + float32x2_t v381 = vsub_f32(v314, v320); + float32x2_t v385 = vmul_f32(v314, v1440); + float32x2_t v507 = vsub_f32(v452, v470); + float32x2_t v511 = vmul_f32(v452, v1440); + float32x2_t v525 = vsub_f32(v458, v464); + float32x2_t v529 = vmul_f32(v458, v1440); + float32x2_t v651 = vsub_f32(v596, v614); + float32x2_t v655 = vmul_f32(v596, v1440); + float32x2_t v669 = vsub_f32(v602, v608); + float32x2_t v673 = vmul_f32(v602, v1440); + float32x2_t v80 = vsub_f32(v79, v75); + float32x2_t v98 = vsub_f32(v97, v93); + float32x2_t v109 = vmul_f32(v93, v1389); + float32x2_t v124 = vmul_f32(v75, v1389); + float32x2_t v224 = vsub_f32(v223, v219); + float32x2_t v242 = vsub_f32(v241, v237); + float32x2_t v253 = vmul_f32(v237, v1389); + float32x2_t v268 = vmul_f32(v219, v1389); + float32x2_t v368 = vsub_f32(v367, v363); + float32x2_t v386 = vsub_f32(v385, v381); + float32x2_t v397 = vmul_f32(v381, v1389); + float32x2_t v412 = vmul_f32(v363, v1389); + float32x2_t v512 = vsub_f32(v511, v507); + float32x2_t v530 = vsub_f32(v529, v525); + float32x2_t v541 = vmul_f32(v525, v1389); + float32x2_t v556 = vmul_f32(v507, v1389); + float32x2_t v656 = vsub_f32(v655, v651); + float32x2_t v674 = vsub_f32(v673, v669); + float32x2_t v685 = 
vmul_f32(v669, v1389); + float32x2_t v700 = vmul_f32(v651, v1389); + float32x2_t v99 = vadd_f32(v80, v98); + float32x2_t v100 = vsub_f32(v80, v98); + float32x2_t v110 = vadd_f32(v75, v109); + float32x2_t v125 = vsub_f32(v124, v93); + float32x2_t v243 = vadd_f32(v224, v242); + float32x2_t v244 = vsub_f32(v224, v242); + float32x2_t v254 = vadd_f32(v219, v253); + float32x2_t v269 = vsub_f32(v268, v237); + float32x2_t v387 = vadd_f32(v368, v386); + float32x2_t v388 = vsub_f32(v368, v386); + float32x2_t v398 = vadd_f32(v363, v397); + float32x2_t v413 = vsub_f32(v412, v381); + float32x2_t v531 = vadd_f32(v512, v530); + float32x2_t v532 = vsub_f32(v512, v530); + float32x2_t v542 = vadd_f32(v507, v541); + float32x2_t v557 = vsub_f32(v556, v525); + float32x2_t v675 = vadd_f32(v656, v674); + float32x2_t v676 = vsub_f32(v656, v674); + float32x2_t v686 = vadd_f32(v651, v685); + float32x2_t v701 = vsub_f32(v700, v669); + float32x2_t v104 = vmul_f32(v99, v1369); + float32x2_t v114 = vmul_f32(v100, v1379); + float32x2_t v126 = vadd_f32(v14, v99); + float32x2_t v132 = vrev64_f32(v110); + float32x2_t v140 = vrev64_f32(v125); + float32x2_t v248 = vmul_f32(v243, v1369); + float32x2_t v258 = vmul_f32(v244, v1379); + float32x2_t v270 = vadd_f32(v158, v243); + float32x2_t v276 = vrev64_f32(v254); + float32x2_t v284 = vrev64_f32(v269); + float32x2_t v392 = vmul_f32(v387, v1369); + float32x2_t v402 = vmul_f32(v388, v1379); + float32x2_t v414 = vadd_f32(v302, v387); + float32x2_t v420 = vrev64_f32(v398); + float32x2_t v428 = vrev64_f32(v413); + float32x2_t v536 = vmul_f32(v531, v1369); + float32x2_t v546 = vmul_f32(v532, v1379); + float32x2_t v558 = vadd_f32(v446, v531); + float32x2_t v564 = vrev64_f32(v542); + float32x2_t v572 = vrev64_f32(v557); + float32x2_t v680 = vmul_f32(v675, v1369); + float32x2_t v690 = vmul_f32(v676, v1379); + float32x2_t v702 = vadd_f32(v590, v675); + float32x2_t v708 = vrev64_f32(v686); + float32x2_t v716 = vrev64_f32(v701); + float32x2_t v105 = vsub_f32(v14, 
v104); + float32x2_t v133 = vmul_f32(v132, v1417); + float32x2_t v141 = vmul_f32(v140, v1417); + float32x2_t v249 = vsub_f32(v158, v248); + float32x2_t v277 = vmul_f32(v276, v1417); + float32x2_t v285 = vmul_f32(v284, v1417); + float32x2_t v393 = vsub_f32(v302, v392); + float32x2_t v421 = vmul_f32(v420, v1417); + float32x2_t v429 = vmul_f32(v428, v1417); + float32x2_t v537 = vsub_f32(v446, v536); + float32x2_t v565 = vmul_f32(v564, v1417); + float32x2_t v573 = vmul_f32(v572, v1417); + float32x2_t v681 = vsub_f32(v590, v680); + float32x2_t v709 = vmul_f32(v708, v1417); + float32x2_t v717 = vmul_f32(v716, v1417); + float32x2_t v765 = vsub_f32(v270, v702); + float32x2_t v769 = vmul_f32(v270, v1440); + float32x2_t v783 = vsub_f32(v414, v558); + float32x2_t v787 = vmul_f32(v414, v1440); + float32x2_t v115 = vsub_f32(v105, v114); + float32x2_t v119 = vmul_f32(v105, v1440); + float32x2_t v259 = vsub_f32(v249, v258); + float32x2_t v263 = vmul_f32(v249, v1440); + float32x2_t v403 = vsub_f32(v393, v402); + float32x2_t v407 = vmul_f32(v393, v1440); + float32x2_t v547 = vsub_f32(v537, v546); + float32x2_t v551 = vmul_f32(v537, v1440); + float32x2_t v691 = vsub_f32(v681, v690); + float32x2_t v695 = vmul_f32(v681, v1440); + float32x2_t v770 = vsub_f32(v769, v765); + float32x2_t v788 = vsub_f32(v787, v783); + float32x2_t v799 = vmul_f32(v783, v1389); + float32x2_t v814 = vmul_f32(v765, v1389); + float32x2_t v120 = vsub_f32(v119, v115); + float32x2_t v142 = vsub_f32(v115, v141); + float32x2_t v146 = vmul_f32(v115, v1440); + float32x2_t v264 = vsub_f32(v263, v259); + float32x2_t v286 = vsub_f32(v259, v285); + float32x2_t v290 = vmul_f32(v259, v1440); + float32x2_t v408 = vsub_f32(v407, v403); + float32x2_t v430 = vsub_f32(v403, v429); + float32x2_t v434 = vmul_f32(v403, v1440); + float32x2_t v552 = vsub_f32(v551, v547); + float32x2_t v574 = vsub_f32(v547, v573); + float32x2_t v578 = vmul_f32(v547, v1440); + float32x2_t v696 = vsub_f32(v695, v691); + float32x2_t v718 = 
vsub_f32(v691, v717); + float32x2_t v722 = vmul_f32(v691, v1440); + float32x2_t v789 = vadd_f32(v770, v788); + float32x2_t v790 = vsub_f32(v770, v788); + float32x2_t v800 = vadd_f32(v765, v799); + float32x2_t v815 = vsub_f32(v814, v783); + float32x2_t v134 = vsub_f32(v120, v133); + float32x2_t v147 = vsub_f32(v146, v142); + float32x2_t v151 = vmul_f32(v120, v1440); + float32x2_t v278 = vsub_f32(v264, v277); + float32x2_t v291 = vsub_f32(v290, v286); + float32x2_t v295 = vmul_f32(v264, v1440); + float32x2_t v422 = vsub_f32(v408, v421); + float32x2_t v435 = vsub_f32(v434, v430); + float32x2_t v439 = vmul_f32(v408, v1440); + float32x2_t v566 = vsub_f32(v552, v565); + float32x2_t v579 = vsub_f32(v578, v574); + float32x2_t v583 = vmul_f32(v552, v1440); + float32x2_t v710 = vsub_f32(v696, v709); + float32x2_t v723 = vsub_f32(v722, v718); + float32x2_t v727 = vmul_f32(v696, v1440); + float32x2_t v794 = vmul_f32(v789, v1369); + float32x2_t v804 = vmul_f32(v790, v1379); + float32x2_t v816 = vadd_f32(v126, v789); + float32x2_t v828 = vrev64_f32(v800); + float32x2_t v842 = vrev64_f32(v815); + float32x2_t v1026 = vrev64_f32(v286); + float32x2_t v1038 = vrev64_f32(v430); + float32x2_t v1050 = vrev64_f32(v718); + float32x2_t v1068 = vrev64_f32(v574); + float32x2_t v152 = vsub_f32(v151, v134); + float32x2_t v296 = vsub_f32(v295, v278); + float32x2_t v440 = vsub_f32(v439, v422); + float32x2_t v584 = vsub_f32(v583, v566); + float32x2_t v728 = vsub_f32(v727, v710); + float32x2_t v795 = vsub_f32(v126, v794); + int16x4_t v819 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v816, 15), (int32x2_t){0, 0})); + float32x2_t v829 = vmul_f32(v828, v1417); + float32x2_t v843 = vmul_f32(v842, v1417); + float32x2_t v882 = vrev64_f32(v278); + float32x2_t v894 = vrev64_f32(v422); + float32x2_t v906 = vrev64_f32(v710); + float32x2_t v924 = vrev64_f32(v566); + float32x2_t v1027 = vmul_f32(v1026, v1025); + float32x2_t v1039 = vmul_f32(v1038, v1313); + float32x2_t v1051 = vmul_f32(v1050, v1325); + 
float32x2_t v1069 = vmul_f32(v1068, v1181); + float32x2_t v1170 = vrev64_f32(v291); + float32x2_t v1182 = vrev64_f32(v435); + float32x2_t v1194 = vrev64_f32(v723); + float32x2_t v1212 = vrev64_f32(v579); + float32x2_t v805 = vsub_f32(v795, v804); + float32x2_t v809 = vmul_f32(v795, v1440); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v819), 0); + float32x2_t v883 = vmul_f32(v882, v881); + float32x2_t v895 = vmul_f32(v894, v1025); + float32x2_t v907 = vmul_f32(v906, v1313); + float32x2_t v925 = vmul_f32(v924, v1169); + float32x2_t v1028 = vfma_f32(v1027, v286, v1019); + float32x2_t v1040 = vfma_f32(v1039, v430, v1307); + float32x2_t v1052 = vfma_f32(v1051, v718, v1319); + float32x2_t v1070 = vfma_f32(v1069, v574, v1175); + float32x2_t v1171 = vmul_f32(v1170, v1169); + float32x2_t v1183 = vmul_f32(v1182, v1181); + float32x2_t v1195 = vmul_f32(v1194, v1355); + float32x2_t v1213 = vmul_f32(v1212, v1211); + float32x2_t v1314 = vrev64_f32(v296); + float32x2_t v1326 = vrev64_f32(v440); + float32x2_t v1338 = vrev64_f32(v728); + float32x2_t v1356 = vrev64_f32(v584); + float32x2_t v810 = vsub_f32(v809, v805); + float32x2_t v844 = vsub_f32(v805, v843); + float32x2_t v854 = vmul_f32(v805, v1440); + float32x2_t v884 = vfma_f32(v883, v278, v875); + float32x2_t v896 = vfma_f32(v895, v422, v1019); + float32x2_t v908 = vfma_f32(v907, v710, v1307); + float32x2_t v926 = vfma_f32(v925, v566, v1163); + float32x2_t v1053 = vsub_f32(v1028, v1052); + float32x2_t v1057 = vmul_f32(v1028, v1440); + float32x2_t v1071 = vsub_f32(v1040, v1070); + float32x2_t v1075 = vmul_f32(v1040, v1440); + float32x2_t v1172 = vfma_f32(v1171, v291, v1163); + float32x2_t v1184 = vfma_f32(v1183, v435, v1175); + float32x2_t v1196 = vfma_f32(v1195, v723, v1349); + float32x2_t v1214 = vfma_f32(v1213, v579, v1331); + float32x2_t v1315 = vmul_f32(v1314, v1313); + float32x2_t v1327 = vmul_f32(v1326, v1325); + float32x2_t v1339 = vmul_f32(v1338, v1337); + float32x2_t v1357 = vmul_f32(v1356, v1355); + float32x2_t v830 = 
vsub_f32(v810, v829); + int16x4_t v847 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v844, 15), (int32x2_t){0, 0})); + float32x2_t v855 = vsub_f32(v854, v844); + float32x2_t v865 = vmul_f32(v810, v1440); + float32x2_t v909 = vsub_f32(v884, v908); + float32x2_t v913 = vmul_f32(v884, v1440); + float32x2_t v927 = vsub_f32(v896, v926); + float32x2_t v931 = vmul_f32(v896, v1440); + float32x2_t v1058 = vsub_f32(v1057, v1053); + float32x2_t v1076 = vsub_f32(v1075, v1071); + float32x2_t v1087 = vmul_f32(v1071, v1389); + float32x2_t v1102 = vmul_f32(v1053, v1389); + float32x2_t v1197 = vsub_f32(v1172, v1196); + float32x2_t v1201 = vmul_f32(v1172, v1440); + float32x2_t v1215 = vsub_f32(v1184, v1214); + float32x2_t v1219 = vmul_f32(v1184, v1440); + float32x2_t v1316 = vfma_f32(v1315, v296, v1307); + float32x2_t v1328 = vfma_f32(v1327, v440, v1319); + float32x2_t v1340 = vfma_f32(v1339, v728, v1331); + float32x2_t v1358 = vfma_f32(v1357, v584, v1349); + int16x4_t v833 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v830, 15), (int32x2_t){0, 0})); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v847), 0); + int16x4_t v858 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v855, 15), (int32x2_t){0, 0})); + float32x2_t v866 = vsub_f32(v865, v830); + float32x2_t v914 = vsub_f32(v913, v909); + float32x2_t v932 = vsub_f32(v931, v927); + float32x2_t v943 = vmul_f32(v927, v1389); + float32x2_t v958 = vmul_f32(v909, v1389); + float32x2_t v1077 = vadd_f32(v1058, v1076); + float32x2_t v1078 = vsub_f32(v1058, v1076); + float32x2_t v1088 = vadd_f32(v1053, v1087); + float32x2_t v1103 = vsub_f32(v1102, v1071); + float32x2_t v1202 = vsub_f32(v1201, v1197); + float32x2_t v1220 = vsub_f32(v1219, v1215); + float32x2_t v1231 = vmul_f32(v1215, v1389); + float32x2_t v1246 = vmul_f32(v1197, v1389); + float32x2_t v1341 = vsub_f32(v1316, v1340); + float32x2_t v1345 = vmul_f32(v1316, v1440); + float32x2_t v1359 = vsub_f32(v1328, v1358); + float32x2_t v1363 = vmul_f32(v1328, v1440); + v6[ostride * 5] = 
vget_lane_s32(vreinterpret_s32_s16(v833), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v858), 0); + int16x4_t v869 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v866, 15), (int32x2_t){0, 0})); + float32x2_t v933 = vadd_f32(v914, v932); + float32x2_t v934 = vsub_f32(v914, v932); + float32x2_t v944 = vadd_f32(v909, v943); + float32x2_t v959 = vsub_f32(v958, v927); + float32x2_t v1082 = vmul_f32(v1077, v1369); + float32x2_t v1092 = vmul_f32(v1078, v1379); + float32x2_t v1104 = vadd_f32(v142, v1077); + float32x2_t v1116 = vrev64_f32(v1088); + float32x2_t v1130 = vrev64_f32(v1103); + float32x2_t v1221 = vadd_f32(v1202, v1220); + float32x2_t v1222 = vsub_f32(v1202, v1220); + float32x2_t v1232 = vadd_f32(v1197, v1231); + float32x2_t v1247 = vsub_f32(v1246, v1215); + float32x2_t v1346 = vsub_f32(v1345, v1341); + float32x2_t v1364 = vsub_f32(v1363, v1359); + float32x2_t v1375 = vmul_f32(v1359, v1389); + float32x2_t v1390 = vmul_f32(v1341, v1389); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v869), 0); + float32x2_t v938 = vmul_f32(v933, v1369); + float32x2_t v948 = vmul_f32(v934, v1379); + float32x2_t v960 = vadd_f32(v134, v933); + float32x2_t v972 = vrev64_f32(v944); + float32x2_t v986 = vrev64_f32(v959); + float32x2_t v1083 = vsub_f32(v142, v1082); + int16x4_t v1107 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1104, 15), (int32x2_t){0, 0})); + float32x2_t v1117 = vmul_f32(v1116, v1417); + float32x2_t v1131 = vmul_f32(v1130, v1417); + float32x2_t v1226 = vmul_f32(v1221, v1369); + float32x2_t v1236 = vmul_f32(v1222, v1379); + float32x2_t v1248 = vadd_f32(v147, v1221); + float32x2_t v1260 = vrev64_f32(v1232); + float32x2_t v1274 = vrev64_f32(v1247); + float32x2_t v1365 = vadd_f32(v1346, v1364); + float32x2_t v1366 = vsub_f32(v1346, v1364); + float32x2_t v1376 = vadd_f32(v1341, v1375); + float32x2_t v1391 = vsub_f32(v1390, v1359); + float32x2_t v939 = vsub_f32(v134, v938); + int16x4_t v963 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v960, 15), 
(int32x2_t){0, 0})); + float32x2_t v973 = vmul_f32(v972, v1417); + float32x2_t v987 = vmul_f32(v986, v1417); + float32x2_t v1093 = vsub_f32(v1083, v1092); + float32x2_t v1097 = vmul_f32(v1083, v1440); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v1107), 0); + float32x2_t v1227 = vsub_f32(v147, v1226); + int16x4_t v1251 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1248, 15), (int32x2_t){0, 0})); + float32x2_t v1261 = vmul_f32(v1260, v1417); + float32x2_t v1275 = vmul_f32(v1274, v1417); + float32x2_t v1370 = vmul_f32(v1365, v1369); + float32x2_t v1380 = vmul_f32(v1366, v1379); + float32x2_t v1392 = vadd_f32(v152, v1365); + float32x2_t v1404 = vrev64_f32(v1376); + float32x2_t v1418 = vrev64_f32(v1391); + float32x2_t v949 = vsub_f32(v939, v948); + float32x2_t v953 = vmul_f32(v939, v1440); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v963), 0); + float32x2_t v1098 = vsub_f32(v1097, v1093); + float32x2_t v1132 = vsub_f32(v1093, v1131); + float32x2_t v1142 = vmul_f32(v1093, v1440); + float32x2_t v1237 = vsub_f32(v1227, v1236); + float32x2_t v1241 = vmul_f32(v1227, v1440); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v1251), 0); + float32x2_t v1371 = vsub_f32(v152, v1370); + int16x4_t v1395 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1392, 15), (int32x2_t){0, 0})); + float32x2_t v1405 = vmul_f32(v1404, v1417); + float32x2_t v1419 = vmul_f32(v1418, v1417); + float32x2_t v954 = vsub_f32(v953, v949); + float32x2_t v988 = vsub_f32(v949, v987); + float32x2_t v998 = vmul_f32(v949, v1440); + float32x2_t v1118 = vsub_f32(v1098, v1117); + int16x4_t v1135 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1132, 15), (int32x2_t){0, 0})); + float32x2_t v1143 = vsub_f32(v1142, v1132); + float32x2_t v1153 = vmul_f32(v1098, v1440); + float32x2_t v1242 = vsub_f32(v1241, v1237); + float32x2_t v1276 = vsub_f32(v1237, v1275); + float32x2_t v1286 = vmul_f32(v1237, v1440); + float32x2_t v1381 = vsub_f32(v1371, v1380); + float32x2_t v1385 = vmul_f32(v1371, v1440); + v6[ostride 
* 4] = vget_lane_s32(vreinterpret_s32_s16(v1395), 0); + float32x2_t v974 = vsub_f32(v954, v973); + int16x4_t v991 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v988, 15), (int32x2_t){0, 0})); + float32x2_t v999 = vsub_f32(v998, v988); + float32x2_t v1009 = vmul_f32(v954, v1440); + int16x4_t v1121 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1118, 15), (int32x2_t){0, 0})); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v1135), 0); + int16x4_t v1146 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1143, 15), (int32x2_t){0, 0})); + float32x2_t v1154 = vsub_f32(v1153, v1118); + float32x2_t v1262 = vsub_f32(v1242, v1261); + int16x4_t v1279 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1276, 15), (int32x2_t){0, 0})); + float32x2_t v1287 = vsub_f32(v1286, v1276); + float32x2_t v1297 = vmul_f32(v1242, v1440); + float32x2_t v1386 = vsub_f32(v1385, v1381); + float32x2_t v1420 = vsub_f32(v1381, v1419); + float32x2_t v1430 = vmul_f32(v1381, v1440); + int16x4_t v977 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v974, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v991), 0); + int16x4_t v1002 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v999, 15), (int32x2_t){0, 0})); + float32x2_t v1010 = vsub_f32(v1009, v974); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1121), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v1146), 0); + int16x4_t v1157 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1154, 15), (int32x2_t){0, 0})); + int16x4_t v1265 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1262, 15), (int32x2_t){0, 0})); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v1279), 0); + int16x4_t v1290 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1287, 15), (int32x2_t){0, 0})); + float32x2_t v1298 = vsub_f32(v1297, v1262); + float32x2_t v1406 = vsub_f32(v1386, v1405); + int16x4_t v1423 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1420, 15), (int32x2_t){0, 0})); + float32x2_t v1431 = vsub_f32(v1430, v1420); + float32x2_t v1441 = 
vmul_f32(v1386, v1440); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v977), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v1002), 0); + int16x4_t v1013 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1010, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v1157), 0); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v1265), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v1290), 0); + int16x4_t v1301 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1298, 15), (int32x2_t){0, 0})); + int16x4_t v1409 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1406, 15), (int32x2_t){0, 0})); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v1423), 0); + int16x4_t v1434 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1431, 15), (int32x2_t){0, 0})); + float32x2_t v1442 = vsub_f32(v1441, v1406); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v1013), 0); + v6[ostride * 23] = vget_lane_s32(vreinterpret_s32_s16(v1301), 0); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v1409), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v1434), 0); + int16x4_t v1445 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1442, 15), (int32x2_t){0, 0})); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v1445), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun25( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v1017 = 9.6858316112863108e-01F; + float v1022 = 2.4868988716485479e-01F; + float v1184 = 8.7630668004386358e-01F; + float v1189 = 4.8175367410171532e-01F; + float v1351 = 7.2896862742141155e-01F; + float v1356 = 6.8454710592868862e-01F; + float v1364 = 6.2790519529313527e-02F; + float v1369 = 
9.9802672842827156e-01F; + float v1402 = 7.7051324277578925e-01F; + float v1518 = 5.3582679497899655e-01F; + float v1523 = 8.4432792550201508e-01F; + float v1531 = -4.2577929156507272e-01F; + float v1536 = 9.0482705246601947e-01F; + float v1544 = -6.3742398974868952e-01F; + float v1549 = -7.7051324277578936e-01F; + float v1564 = -9.9211470131447776e-01F; + float v1569 = 1.2533323356430454e-01F; + float v1586 = 2.5000000000000000e-01F; + float v1598 = 5.5901699437494745e-01F; + float v1610 = 6.1803398874989490e-01F; + float v1641 = -9.5105651629515353e-01F; + float v1671 = 2.0000000000000000e+00F; + const int32_t *v1755 = &v5[v0]; + int32_t *v2091 = &v6[v2]; + int64_t v23 = v0 * 5; + int64_t v31 = v0 * 10; + int64_t v39 = v0 * 15; + int64_t v47 = v0 * 20; + int64_t v190 = v0 * 6; + int64_t v198 = v0 * 11; + int64_t v206 = v0 * 16; + int64_t v214 = v0 * 21; + int64_t v349 = v0 * 2; + int64_t v357 = v0 * 7; + int64_t v365 = v0 * 12; + int64_t v373 = v0 * 17; + int64_t v381 = v0 * 22; + int64_t v516 = v0 * 3; + int64_t v524 = v0 * 8; + int64_t v532 = v0 * 13; + int64_t v540 = v0 * 18; + int64_t v548 = v0 * 23; + int64_t v683 = v0 * 4; + int64_t v691 = v0 * 9; + int64_t v699 = v0 * 14; + int64_t v707 = v0 * 19; + int64_t v715 = v0 * 24; + int64_t v965 = v2 * 5; + int64_t v981 = v2 * 10; + int64_t v995 = v2 * 15; + int64_t v1009 = v2 * 20; + float v1025 = v4 * v1022; + int64_t v1132 = v2 * 6; + int64_t v1148 = v2 * 11; + int64_t v1162 = v2 * 16; + int64_t v1176 = v2 * 21; + float v1192 = v4 * v1189; + int64_t v1283 = v2 * 2; + int64_t v1299 = v2 * 7; + int64_t v1315 = v2 * 12; + int64_t v1329 = v2 * 17; + int64_t v1343 = v2 * 22; + float v1359 = v4 * v1356; + float v1372 = v4 * v1369; + float v1405 = v4 * v1402; + int64_t v1450 = v2 * 3; + int64_t v1466 = v2 * 8; + int64_t v1482 = v2 * 13; + int64_t v1496 = v2 * 18; + int64_t v1510 = v2 * 23; + float v1526 = v4 * v1523; + float v1539 = v4 * v1536; + float v1552 = v4 * v1549; + float v1572 = v4 * v1569; + int64_t v1617 = 
v2 * 4; + int64_t v1633 = v2 * 9; + float v1644 = v4 * v1641; + int64_t v1649 = v2 * 14; + int64_t v1663 = v2 * 19; + int64_t v1677 = v2 * 24; + const int32_t *v1691 = &v5[0]; + svfloat32_t v2013 = svdup_n_f32(0); + int32_t *v2027 = &v6[0]; + svfloat32_t v2070 = svdup_n_f32(v1017); + svfloat32_t v2134 = svdup_n_f32(v1184); + svfloat32_t v2198 = svdup_n_f32(v1351); + svfloat32_t v2200 = svdup_n_f32(v1364); + svfloat32_t v2262 = svdup_n_f32(v1518); + svfloat32_t v2264 = svdup_n_f32(v1531); + svfloat32_t v2266 = svdup_n_f32(v1544); + svfloat32_t v2269 = svdup_n_f32(v1564); + svfloat32_t v2272 = svdup_n_f32(v1586); + svfloat32_t v2274 = svdup_n_f32(v1598); + svfloat32_t v2276 = svdup_n_f32(v1610); + svfloat32_t v2316 = svdup_n_f32(v1671); + svfloat32_t v188 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1755[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v1700 = &v5[v23]; + const int32_t *v1709 = &v5[v31]; + const int32_t *v1718 = &v5[v39]; + const int32_t *v1727 = &v5[v47]; + const int32_t *v1764 = &v5[v190]; + const int32_t *v1773 = &v5[v198]; + const int32_t *v1782 = &v5[v206]; + const int32_t *v1791 = &v5[v214]; + const int32_t *v1819 = &v5[v349]; + const int32_t *v1828 = &v5[v357]; + const int32_t *v1837 = &v5[v365]; + const int32_t *v1846 = &v5[v373]; + const int32_t *v1855 = &v5[v381]; + const int32_t *v1883 = &v5[v516]; + const int32_t *v1892 = &v5[v524]; + const int32_t *v1901 = &v5[v532]; + const int32_t *v1910 = &v5[v540]; + const int32_t *v1919 = &v5[v548]; + const int32_t *v1947 = &v5[v683]; + const int32_t *v1956 = &v5[v691]; + const int32_t *v1965 = &v5[v699]; + const int32_t *v1974 = &v5[v707]; + const int32_t *v1983 = &v5[v715]; + int32_t *v2037 = &v6[v965]; + int32_t *v2047 = &v6[v981]; + int32_t *v2057 = &v6[v995]; + int32_t *v2067 = &v6[v1009]; + svfloat32_t v2071 = svdup_n_f32(v1025); + int32_t *v2101 = &v6[v1132]; + int32_t *v2111 = &v6[v1148]; + int32_t *v2121 = &v6[v1162]; + int32_t 
*v2131 = &v6[v1176]; + svfloat32_t v2135 = svdup_n_f32(v1192); + int32_t *v2155 = &v6[v1283]; + int32_t *v2165 = &v6[v1299]; + int32_t *v2175 = &v6[v1315]; + int32_t *v2185 = &v6[v1329]; + int32_t *v2195 = &v6[v1343]; + svfloat32_t v2199 = svdup_n_f32(v1359); + svfloat32_t v2201 = svdup_n_f32(v1372); + svfloat32_t v2206 = svdup_n_f32(v1405); + int32_t *v2219 = &v6[v1450]; + int32_t *v2229 = &v6[v1466]; + int32_t *v2239 = &v6[v1482]; + int32_t *v2249 = &v6[v1496]; + int32_t *v2259 = &v6[v1510]; + svfloat32_t v2263 = svdup_n_f32(v1526); + svfloat32_t v2265 = svdup_n_f32(v1539); + svfloat32_t v2267 = svdup_n_f32(v1552); + svfloat32_t v2270 = svdup_n_f32(v1572); + int32_t *v2283 = &v6[v1617]; + int32_t *v2293 = &v6[v1633]; + svfloat32_t v2296 = svdup_n_f32(v1644); + int32_t *v2303 = &v6[v1649]; + int32_t *v2313 = &v6[v1663]; + int32_t *v2323 = &v6[v1677]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1691[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1700[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v37 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1709[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v45 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1718[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v53 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1727[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v196 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1764[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v204 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1773[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v212 = 
svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1782[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v220 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1791[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v355 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1819[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v363 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1828[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v371 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1837[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v379 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1846[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v387 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1855[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v522 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1883[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v530 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1892[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v538 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1901[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v546 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1910[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v554 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1919[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v689 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const 
int16_t *)&v1947[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v697 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1956[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v705 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1965[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v713 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1974[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v721 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1983[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v66 = svcmla_f32_x(pred_full, v29, v2013, v29, 90); + svfloat32_t v79 = svcmla_f32_x(pred_full, v37, v2013, v37, 90); + svfloat32_t v92 = svcmla_f32_x(pred_full, v53, v2013, v53, 90); + svfloat32_t v112 = svcmla_f32_x(pred_full, v45, v2013, v45, 90); + svfloat32_t v233 = svcmla_f32_x(pred_full, v196, v2013, v196, 90); + svfloat32_t v246 = svcmla_f32_x(pred_full, v204, v2013, v204, 90); + svfloat32_t v259 = svcmla_f32_x(pred_full, v220, v2013, v220, 90); + svfloat32_t v279 = svcmla_f32_x(pred_full, v212, v2013, v212, 90); + svfloat32_t v400 = svcmla_f32_x(pred_full, v363, v2013, v363, 90); + svfloat32_t v413 = svcmla_f32_x(pred_full, v371, v2013, v371, 90); + svfloat32_t v426 = svcmla_f32_x(pred_full, v387, v2013, v387, 90); + svfloat32_t v446 = svcmla_f32_x(pred_full, v379, v2013, v379, 90); + svfloat32_t v567 = svcmla_f32_x(pred_full, v530, v2013, v530, 90); + svfloat32_t v580 = svcmla_f32_x(pred_full, v538, v2013, v538, 90); + svfloat32_t v593 = svcmla_f32_x(pred_full, v554, v2013, v554, 90); + svfloat32_t v613 = svcmla_f32_x(pred_full, v546, v2013, v546, 90); + svfloat32_t v734 = svcmla_f32_x(pred_full, v697, v2013, v697, 90); + svfloat32_t v747 = svcmla_f32_x(pred_full, v705, v2013, v705, 90); + svfloat32_t v760 = svcmla_f32_x(pred_full, v721, v2013, v721, 90); + svfloat32_t 
v780 = svcmla_f32_x(pred_full, v713, v2013, v713, 90); + svfloat32_t v93 = svsub_f32_x(svptrue_b32(), v66, v92); + svfloat32_t v113 = svsub_f32_x(svptrue_b32(), v79, v112); + svfloat32_t v260 = svsub_f32_x(svptrue_b32(), v233, v259); + svfloat32_t v280 = svsub_f32_x(svptrue_b32(), v246, v279); + svfloat32_t v427 = svsub_f32_x(svptrue_b32(), v400, v426); + svfloat32_t v447 = svsub_f32_x(svptrue_b32(), v413, v446); + svfloat32_t v594 = svsub_f32_x(svptrue_b32(), v567, v593); + svfloat32_t v614 = svsub_f32_x(svptrue_b32(), v580, v613); + svfloat32_t v761 = svsub_f32_x(svptrue_b32(), v734, v760); + svfloat32_t v781 = svsub_f32_x(svptrue_b32(), v747, v780); + svfloat32_t v99 = svnmls_f32_x(pred_full, v93, v66, v2316); + svfloat32_t v119 = svnmls_f32_x(pred_full, v113, v79, v2316); + svfloat32_t v266 = svnmls_f32_x(pred_full, v260, v233, v2316); + svfloat32_t v286 = svnmls_f32_x(pred_full, v280, v246, v2316); + svfloat32_t v433 = svnmls_f32_x(pred_full, v427, v400, v2316); + svfloat32_t v453 = svnmls_f32_x(pred_full, v447, v413, v2316); + svfloat32_t v600 = svnmls_f32_x(pred_full, v594, v567, v2316); + svfloat32_t v620 = svnmls_f32_x(pred_full, v614, v580, v2316); + svfloat32_t v767 = svnmls_f32_x(pred_full, v761, v734, v2316); + svfloat32_t v787 = svnmls_f32_x(pred_full, v781, v747, v2316); + svfloat32_t v120 = svadd_f32_x(svptrue_b32(), v99, v119); + svfloat32_t v121 = svsub_f32_x(svptrue_b32(), v99, v119); + svfloat32_t v133 = svmla_f32_x(pred_full, v93, v113, v2276); + svfloat32_t v151 = svnmls_f32_x(pred_full, v113, v93, v2276); + svfloat32_t v287 = svadd_f32_x(svptrue_b32(), v266, v286); + svfloat32_t v288 = svsub_f32_x(svptrue_b32(), v266, v286); + svfloat32_t v300 = svmla_f32_x(pred_full, v260, v280, v2276); + svfloat32_t v318 = svnmls_f32_x(pred_full, v280, v260, v2276); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v433, v453); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v433, v453); + svfloat32_t v467 = svmla_f32_x(pred_full, v427, v447, v2276); + 
svfloat32_t v485 = svnmls_f32_x(pred_full, v447, v427, v2276); + svfloat32_t v621 = svadd_f32_x(svptrue_b32(), v600, v620); + svfloat32_t v622 = svsub_f32_x(svptrue_b32(), v600, v620); + svfloat32_t v634 = svmla_f32_x(pred_full, v594, v614, v2276); + svfloat32_t v652 = svnmls_f32_x(pred_full, v614, v594, v2276); + svfloat32_t v788 = svadd_f32_x(svptrue_b32(), v767, v787); + svfloat32_t v789 = svsub_f32_x(svptrue_b32(), v767, v787); + svfloat32_t v801 = svmla_f32_x(pred_full, v761, v781, v2276); + svfloat32_t v819 = svnmls_f32_x(pred_full, v781, v761, v2276); + svfloat32_t v152 = svadd_f32_x(svptrue_b32(), v21, v120); + svfloat32_t zero159 = svdup_n_f32(0); + svfloat32_t v159 = svcmla_f32_x(pred_full, zero159, v2296, v133, 90); + svfloat32_t zero167 = svdup_n_f32(0); + svfloat32_t v167 = svcmla_f32_x(pred_full, zero167, v2296, v151, 90); + svfloat32_t v319 = svadd_f32_x(svptrue_b32(), v188, v287); + svfloat32_t zero326 = svdup_n_f32(0); + svfloat32_t v326 = svcmla_f32_x(pred_full, zero326, v2296, v300, 90); + svfloat32_t zero334 = svdup_n_f32(0); + svfloat32_t v334 = svcmla_f32_x(pred_full, zero334, v2296, v318, 90); + svfloat32_t v486 = svadd_f32_x(svptrue_b32(), v355, v454); + svfloat32_t zero493 = svdup_n_f32(0); + svfloat32_t v493 = svcmla_f32_x(pred_full, zero493, v2296, v467, 90); + svfloat32_t zero501 = svdup_n_f32(0); + svfloat32_t v501 = svcmla_f32_x(pred_full, zero501, v2296, v485, 90); + svfloat32_t v653 = svadd_f32_x(svptrue_b32(), v522, v621); + svfloat32_t zero660 = svdup_n_f32(0); + svfloat32_t v660 = svcmla_f32_x(pred_full, zero660, v2296, v634, 90); + svfloat32_t zero668 = svdup_n_f32(0); + svfloat32_t v668 = svcmla_f32_x(pred_full, zero668, v2296, v652, 90); + svfloat32_t v820 = svadd_f32_x(svptrue_b32(), v689, v788); + svfloat32_t zero827 = svdup_n_f32(0); + svfloat32_t v827 = svcmla_f32_x(pred_full, zero827, v2296, v801, 90); + svfloat32_t zero835 = svdup_n_f32(0); + svfloat32_t v835 = svcmla_f32_x(pred_full, zero835, v2296, v819, 90); + 
svfloat32_t v127 = svmls_f32_x(pred_full, v21, v120, v2272); + svfloat32_t v294 = svmls_f32_x(pred_full, v188, v287, v2272); + svfloat32_t v461 = svmls_f32_x(pred_full, v355, v454, v2272); + svfloat32_t v628 = svmls_f32_x(pred_full, v522, v621, v2272); + svfloat32_t v795 = svmls_f32_x(pred_full, v689, v788, v2272); + svfloat32_t v139 = svmls_f32_x(pred_full, v127, v121, v2274); + svfloat32_t v306 = svmls_f32_x(pred_full, v294, v288, v2274); + svfloat32_t v473 = svmls_f32_x(pred_full, v461, v455, v2274); + svfloat32_t v640 = svmls_f32_x(pred_full, v628, v622, v2274); + svfloat32_t v807 = svmls_f32_x(pred_full, v795, v789, v2274); + svfloat32_t v861 = svcmla_f32_x(pred_full, v319, v2013, v319, 90); + svfloat32_t v874 = svcmla_f32_x(pred_full, v486, v2013, v486, 90); + svfloat32_t v887 = svcmla_f32_x(pred_full, v820, v2013, v820, 90); + svfloat32_t v907 = svcmla_f32_x(pred_full, v653, v2013, v653, 90); + svfloat32_t v145 = svnmls_f32_x(pred_full, v139, v127, v2316); + svfloat32_t v168 = svsub_f32_x(svptrue_b32(), v139, v167); + svfloat32_t v312 = svnmls_f32_x(pred_full, v306, v294, v2316); + svfloat32_t v335 = svsub_f32_x(svptrue_b32(), v306, v334); + svfloat32_t v479 = svnmls_f32_x(pred_full, v473, v461, v2316); + svfloat32_t v502 = svsub_f32_x(svptrue_b32(), v473, v501); + svfloat32_t v646 = svnmls_f32_x(pred_full, v640, v628, v2316); + svfloat32_t v669 = svsub_f32_x(svptrue_b32(), v640, v668); + svfloat32_t v813 = svnmls_f32_x(pred_full, v807, v795, v2316); + svfloat32_t v836 = svsub_f32_x(svptrue_b32(), v807, v835); + svfloat32_t v888 = svsub_f32_x(svptrue_b32(), v861, v887); + svfloat32_t v908 = svsub_f32_x(svptrue_b32(), v874, v907); + svfloat32_t v160 = svsub_f32_x(svptrue_b32(), v145, v159); + svfloat32_t v174 = svnmls_f32_x(pred_full, v168, v139, v2316); + svfloat32_t v327 = svsub_f32_x(svptrue_b32(), v312, v326); + svfloat32_t v341 = svnmls_f32_x(pred_full, v335, v306, v2316); + svfloat32_t v494 = svsub_f32_x(svptrue_b32(), v479, v493); + svfloat32_t v508 = 
svnmls_f32_x(pred_full, v502, v473, v2316); + svfloat32_t v661 = svsub_f32_x(svptrue_b32(), v646, v660); + svfloat32_t v675 = svnmls_f32_x(pred_full, v669, v640, v2316); + svfloat32_t v828 = svsub_f32_x(svptrue_b32(), v813, v827); + svfloat32_t v842 = svnmls_f32_x(pred_full, v836, v807, v2316); + svfloat32_t v894 = svnmls_f32_x(pred_full, v888, v861, v2316); + svfloat32_t v914 = svnmls_f32_x(pred_full, v908, v874, v2316); + svfloat32_t v1187 = svmul_f32_x(svptrue_b32(), v335, v2134); + svfloat32_t v1200 = svmul_f32_x(svptrue_b32(), v502, v2262); + svfloat32_t v1213 = svmul_f32_x(svptrue_b32(), v836, v2264); + svfloat32_t v1233 = svmul_f32_x(svptrue_b32(), v669, v2200); + svfloat32_t v180 = svnmls_f32_x(pred_full, v160, v145, v2316); + svfloat32_t v347 = svnmls_f32_x(pred_full, v327, v312, v2316); + svfloat32_t v514 = svnmls_f32_x(pred_full, v494, v479, v2316); + svfloat32_t v681 = svnmls_f32_x(pred_full, v661, v646, v2316); + svfloat32_t v848 = svnmls_f32_x(pred_full, v828, v813, v2316); + svfloat32_t v915 = svadd_f32_x(svptrue_b32(), v894, v914); + svfloat32_t v916 = svsub_f32_x(svptrue_b32(), v894, v914); + svfloat32_t v928 = svmla_f32_x(pred_full, v888, v908, v2276); + svfloat32_t v946 = svnmls_f32_x(pred_full, v908, v888, v2276); + svfloat32_t v1020 = svmul_f32_x(svptrue_b32(), v327, v2070); + svfloat32_t v1033 = svmul_f32_x(svptrue_b32(), v494, v2134); + svfloat32_t v1046 = svmul_f32_x(svptrue_b32(), v828, v2262); + svfloat32_t v1066 = svmul_f32_x(svptrue_b32(), v661, v2198); + svfloat32_t v1195 = svcmla_f32_x(pred_full, v1187, v2135, v335, 90); + svfloat32_t v1208 = svcmla_f32_x(pred_full, v1200, v2263, v502, 90); + svfloat32_t v1221 = svcmla_f32_x(pred_full, v1213, v2265, v836, 90); + svfloat32_t v1241 = svcmla_f32_x(pred_full, v1233, v2201, v669, 90); + svfloat32_t v1354 = svmul_f32_x(svptrue_b32(), v341, v2198); + svfloat32_t v1367 = svmul_f32_x(svptrue_b32(), v508, v2200); + svfloat32_t v1380 = svmul_f32_x(svptrue_b32(), v842, v2269); + svfloat32_t v1400 
= svmul_f32_x(svptrue_b32(), v675, v2266); + svfloat32_t v947 = svadd_f32_x(svptrue_b32(), v152, v915); + svfloat32_t zero962 = svdup_n_f32(0); + svfloat32_t v962 = svcmla_f32_x(pred_full, zero962, v2296, v928, 90); + svfloat32_t zero978 = svdup_n_f32(0); + svfloat32_t v978 = svcmla_f32_x(pred_full, zero978, v2296, v946, 90); + svfloat32_t v1028 = svcmla_f32_x(pred_full, v1020, v2071, v327, 90); + svfloat32_t v1041 = svcmla_f32_x(pred_full, v1033, v2135, v494, 90); + svfloat32_t v1054 = svcmla_f32_x(pred_full, v1046, v2263, v828, 90); + svfloat32_t v1074 = svcmla_f32_x(pred_full, v1066, v2199, v661, 90); + svfloat32_t v1222 = svsub_f32_x(svptrue_b32(), v1195, v1221); + svfloat32_t v1242 = svsub_f32_x(svptrue_b32(), v1208, v1241); + svfloat32_t v1362 = svcmla_f32_x(pred_full, v1354, v2199, v341, 90); + svfloat32_t v1375 = svcmla_f32_x(pred_full, v1367, v2201, v508, 90); + svfloat32_t v1388 = svcmla_f32_x(pred_full, v1380, v2270, v842, 90); + svfloat32_t v1408 = svcmla_f32_x(pred_full, v1400, v2206, v675, 90); + svfloat32_t v1521 = svmul_f32_x(svptrue_b32(), v347, v2262); + svfloat32_t v1534 = svmul_f32_x(svptrue_b32(), v514, v2264); + svfloat32_t v1547 = svmul_f32_x(svptrue_b32(), v848, v2266); + svfloat32_t v1567 = svmul_f32_x(svptrue_b32(), v681, v2269); + svfloat32_t v922 = svmls_f32_x(pred_full, v152, v915, v2272); + svint16_t v950 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v947, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1055 = svsub_f32_x(svptrue_b32(), v1028, v1054); + svfloat32_t v1075 = svsub_f32_x(svptrue_b32(), v1041, v1074); + svfloat32_t v1228 = svnmls_f32_x(pred_full, v1222, v1195, v2316); + svfloat32_t v1248 = svnmls_f32_x(pred_full, v1242, v1208, v2316); + svfloat32_t v1389 = svsub_f32_x(svptrue_b32(), v1362, v1388); + svfloat32_t v1409 = svsub_f32_x(svptrue_b32(), v1375, v1408); + svfloat32_t v1529 = 
svcmla_f32_x(pred_full, v1521, v2263, v347, 90); + svfloat32_t v1542 = svcmla_f32_x(pred_full, v1534, v2265, v514, 90); + svfloat32_t v1555 = svcmla_f32_x(pred_full, v1547, v2267, v848, 90); + svfloat32_t v1575 = svcmla_f32_x(pred_full, v1567, v2270, v681, 90); + svfloat32_t v934 = svmls_f32_x(pred_full, v922, v916, v2274); + svfloat32_t v1061 = svnmls_f32_x(pred_full, v1055, v1028, v2316); + svfloat32_t v1081 = svnmls_f32_x(pred_full, v1075, v1041, v2316); + svfloat32_t v1249 = svadd_f32_x(svptrue_b32(), v1228, v1248); + svfloat32_t v1250 = svsub_f32_x(svptrue_b32(), v1228, v1248); + svfloat32_t v1262 = svmla_f32_x(pred_full, v1222, v1242, v2276); + svfloat32_t v1280 = svnmls_f32_x(pred_full, v1242, v1222, v2276); + svfloat32_t v1395 = svnmls_f32_x(pred_full, v1389, v1362, v2316); + svfloat32_t v1415 = svnmls_f32_x(pred_full, v1409, v1375, v2316); + svfloat32_t v1556 = svsub_f32_x(svptrue_b32(), v1529, v1555); + svfloat32_t v1576 = svsub_f32_x(svptrue_b32(), v1542, v1575); + svst1w_u64(pred_full, (unsigned *)(v2027), svreinterpret_u64_s16(v950)); + svfloat32_t v940 = svnmls_f32_x(pred_full, v934, v922, v2316); + svfloat32_t v979 = svsub_f32_x(svptrue_b32(), v934, v978); + svfloat32_t v1082 = svadd_f32_x(svptrue_b32(), v1061, v1081); + svfloat32_t v1083 = svsub_f32_x(svptrue_b32(), v1061, v1081); + svfloat32_t v1095 = svmla_f32_x(pred_full, v1055, v1075, v2276); + svfloat32_t v1113 = svnmls_f32_x(pred_full, v1075, v1055, v2276); + svfloat32_t v1281 = svadd_f32_x(svptrue_b32(), v168, v1249); + svfloat32_t zero1296 = svdup_n_f32(0); + svfloat32_t v1296 = svcmla_f32_x(pred_full, zero1296, v2296, v1262, 90); + svfloat32_t zero1312 = svdup_n_f32(0); + svfloat32_t v1312 = svcmla_f32_x(pred_full, zero1312, v2296, v1280, 90); + svfloat32_t v1416 = svadd_f32_x(svptrue_b32(), v1395, v1415); + svfloat32_t v1417 = svsub_f32_x(svptrue_b32(), v1395, v1415); + svfloat32_t v1429 = svmla_f32_x(pred_full, v1389, v1409, v2276); + svfloat32_t v1447 = svnmls_f32_x(pred_full, v1409, 
v1389, v2276); + svfloat32_t v1562 = svnmls_f32_x(pred_full, v1556, v1529, v2316); + svfloat32_t v1582 = svnmls_f32_x(pred_full, v1576, v1542, v2316); + svfloat32_t v963 = svsub_f32_x(svptrue_b32(), v940, v962); + svint16_t v982 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v979, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v993 = svnmls_f32_x(pred_full, v979, v934, v2316); + svfloat32_t v1114 = svadd_f32_x(svptrue_b32(), v160, v1082); + svfloat32_t zero1129 = svdup_n_f32(0); + svfloat32_t v1129 = svcmla_f32_x(pred_full, zero1129, v2296, v1095, 90); + svfloat32_t zero1145 = svdup_n_f32(0); + svfloat32_t v1145 = svcmla_f32_x(pred_full, zero1145, v2296, v1113, 90); + svfloat32_t v1256 = svmls_f32_x(pred_full, v168, v1249, v2272); + svint16_t v1284 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1281, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1448 = svadd_f32_x(svptrue_b32(), v174, v1416); + svfloat32_t zero1463 = svdup_n_f32(0); + svfloat32_t v1463 = svcmla_f32_x(pred_full, zero1463, v2296, v1429, 90); + svfloat32_t zero1479 = svdup_n_f32(0); + svfloat32_t v1479 = svcmla_f32_x(pred_full, zero1479, v2296, v1447, 90); + svfloat32_t v1583 = svadd_f32_x(svptrue_b32(), v1562, v1582); + svfloat32_t v1584 = svsub_f32_x(svptrue_b32(), v1562, v1582); + svfloat32_t v1596 = svmla_f32_x(pred_full, v1556, v1576, v2276); + svfloat32_t v1614 = svnmls_f32_x(pred_full, v1576, v1556, v2276); + svint16_t v966 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v963, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v996 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, 
v993, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1007 = svnmls_f32_x(pred_full, v963, v940, v2316); + svfloat32_t v1089 = svmls_f32_x(pred_full, v160, v1082, v2272); + svint16_t v1117 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1114, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1268 = svmls_f32_x(pred_full, v1256, v1250, v2274); + svfloat32_t v1423 = svmls_f32_x(pred_full, v174, v1416, v2272); + svint16_t v1451 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1448, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1615 = svadd_f32_x(svptrue_b32(), v180, v1583); + svfloat32_t zero1630 = svdup_n_f32(0); + svfloat32_t v1630 = svcmla_f32_x(pred_full, zero1630, v2296, v1596, 90); + svfloat32_t zero1646 = svdup_n_f32(0); + svfloat32_t v1646 = svcmla_f32_x(pred_full, zero1646, v2296, v1614, 90); + svst1w_u64(pred_full, (unsigned *)(v2047), svreinterpret_u64_s16(v982)); + svst1w_u64(pred_full, (unsigned *)(v2155), svreinterpret_u64_s16(v1284)); + svint16_t v1010 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1007, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1101 = svmls_f32_x(pred_full, v1089, v1083, v2274); + svfloat32_t v1274 = svnmls_f32_x(pred_full, v1268, v1256, v2316); + svfloat32_t v1313 = svsub_f32_x(svptrue_b32(), v1268, v1312); + svfloat32_t v1435 = svmls_f32_x(pred_full, v1423, v1417, v2274); + svfloat32_t v1590 = svmls_f32_x(pred_full, v180, v1583, v2272); + svint16_t v1618 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1615, (float)(1ULL << 
31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v2037), svreinterpret_u64_s16(v966)); + svst1w_u64(pred_full, (unsigned *)(v2057), svreinterpret_u64_s16(v996)); + svst1w_u64(pred_full, (unsigned *)(v2091), svreinterpret_u64_s16(v1117)); + svst1w_u64(pred_full, (unsigned *)(v2219), svreinterpret_u64_s16(v1451)); + svfloat32_t v1107 = svnmls_f32_x(pred_full, v1101, v1089, v2316); + svfloat32_t v1146 = svsub_f32_x(svptrue_b32(), v1101, v1145); + svfloat32_t v1297 = svsub_f32_x(svptrue_b32(), v1274, v1296); + svint16_t v1316 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1313, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1327 = svnmls_f32_x(pred_full, v1313, v1268, v2316); + svfloat32_t v1441 = svnmls_f32_x(pred_full, v1435, v1423, v2316); + svfloat32_t v1480 = svsub_f32_x(svptrue_b32(), v1435, v1479); + svfloat32_t v1602 = svmls_f32_x(pred_full, v1590, v1584, v2274); + svst1w_u64(pred_full, (unsigned *)(v2067), svreinterpret_u64_s16(v1010)); + svst1w_u64(pred_full, (unsigned *)(v2283), svreinterpret_u64_s16(v1618)); + svfloat32_t v1130 = svsub_f32_x(svptrue_b32(), v1107, v1129); + svint16_t v1149 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1146, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1160 = svnmls_f32_x(pred_full, v1146, v1101, v2316); + svint16_t v1300 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1297, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1330 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1327, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1341 = svnmls_f32_x(pred_full, v1297, v1274, v2316); + svfloat32_t v1464 = svsub_f32_x(svptrue_b32(), v1441, v1463); + svint16_t v1483 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1480, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1494 = svnmls_f32_x(pred_full, v1480, v1435, v2316); + svfloat32_t v1608 = svnmls_f32_x(pred_full, v1602, v1590, v2316); + svfloat32_t v1647 = svsub_f32_x(svptrue_b32(), v1602, v1646); + svst1w_u64(pred_full, (unsigned *)(v2175), svreinterpret_u64_s16(v1316)); + svint16_t v1133 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1130, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1163 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1160, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1174 = svnmls_f32_x(pred_full, v1130, v1107, v2316); + svint16_t v1344 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1341, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1467 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1464, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1497 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1494, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1508 = svnmls_f32_x(pred_full, 
v1464, v1441, v2316); + svfloat32_t v1631 = svsub_f32_x(svptrue_b32(), v1608, v1630); + svint16_t v1650 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1647, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1661 = svnmls_f32_x(pred_full, v1647, v1602, v2316); + svst1w_u64(pred_full, (unsigned *)(v2111), svreinterpret_u64_s16(v1149)); + svst1w_u64(pred_full, (unsigned *)(v2165), svreinterpret_u64_s16(v1300)); + svst1w_u64(pred_full, (unsigned *)(v2185), svreinterpret_u64_s16(v1330)); + svst1w_u64(pred_full, (unsigned *)(v2239), svreinterpret_u64_s16(v1483)); + svint16_t v1177 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1174, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1511 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1508, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1634 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1631, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1664 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1661, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1675 = svnmls_f32_x(pred_full, v1631, v1608, v2316); + svst1w_u64(pred_full, (unsigned *)(v2101), svreinterpret_u64_s16(v1133)); + svst1w_u64(pred_full, (unsigned *)(v2121), svreinterpret_u64_s16(v1163)); + svst1w_u64(pred_full, (unsigned *)(v2195), svreinterpret_u64_s16(v1344)); + svst1w_u64(pred_full, (unsigned *)(v2229), svreinterpret_u64_s16(v1467)); + 
svst1w_u64(pred_full, (unsigned *)(v2249), svreinterpret_u64_s16(v1497)); + svst1w_u64(pred_full, (unsigned *)(v2303), svreinterpret_u64_s16(v1650)); + svint16_t v1678 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1675, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v2131), svreinterpret_u64_s16(v1177)); + svst1w_u64(pred_full, (unsigned *)(v2259), svreinterpret_u64_s16(v1511)); + svst1w_u64(pred_full, (unsigned *)(v2293), svreinterpret_u64_s16(v1634)); + svst1w_u64(pred_full, (unsigned *)(v2313), svreinterpret_u64_s16(v1664)); + svst1w_u64(pred_full, (unsigned *)(v2323), svreinterpret_u64_s16(v1678)); +} +#endif + +#ifndef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun32( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + float v783 = 7.0710678118654757e-01F; + float v794 = -7.0710678118654746e-01F; + float v844 = 5.5557023301960229e-01F; + float v858 = -1.9509032201612861e-01F; + float v909 = 9.2387953251128674e-01F; + float v916 = -9.2387953251128685e-01F; + float v919 = 3.8268343236508967e-01F; + float v920 = -3.8268343236508967e-01F; + float v966 = 1.9509032201612833e-01F; + float v969 = -9.8078528040323043e-01F; + float v970 = 9.8078528040323043e-01F; + float v977 = -5.5557023301960218e-01F; + float v980 = 8.3146961230254524e-01F; + float v981 = -8.3146961230254524e-01F; + float v991 = -1.0000000000000000e+00F; + float v992 = 1.0000000000000000e+00F; + int16x4_t v13 = vld1s_s16(&v5[0]); + int16x4_t v332 = vld1s_s16(&v5[istride]); + float32x2_t v601 = (float32x2_t){v970, v970}; + float32x2_t v662 = (float32x2_t){v909, v909}; + float32x2_t v666 = (float32x2_t){v920, v919}; + float32x2_t v723 = (float32x2_t){v980, v980}; + float32x2_t 
v727 = (float32x2_t){v977, v844}; + float32x2_t v734 = (float32x2_t){v858, v858}; + float32x2_t v784 = (float32x2_t){v783, v783}; + float32x2_t v795 = (float32x2_t){v794, v794}; + float32x2_t v799 = (float32x2_t){v992, v991}; + float32x2_t v845 = (float32x2_t){v844, v844}; + float32x2_t v849 = (float32x2_t){v981, v980}; + float32x2_t v856 = (float32x2_t){v969, v969}; + float32x2_t v860 = (float32x2_t){v858, v966}; + float32x2_t v906 = (float32x2_t){v919, v919}; + float32x2_t v910 = (float32x2_t){v916, v909}; + float32x2_t v917 = (float32x2_t){v916, v916}; + float32x2_t v921 = (float32x2_t){v919, v920}; + float32x2_t v967 = (float32x2_t){v966, v966}; + float32x2_t v971 = (float32x2_t){v969, v970}; + float32x2_t v978 = (float32x2_t){v977, v977}; + float32x2_t v982 = (float32x2_t){v980, v981}; + float32x2_t v993 = (float32x2_t){v991, v992}; + float32x2_t v994 = (float32x2_t){v4, v4}; + float32x2_t v14 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v13)), 15); + int16x4_t v19 = vld1s_s16(&v5[istride * 16]); + int16x4_t v27 = vld1s_s16(&v5[istride * 8]); + int16x4_t v33 = vld1s_s16(&v5[istride * 24]); + int16x4_t v52 = vld1s_s16(&v5[istride * 4]); + int16x4_t v58 = vld1s_s16(&v5[istride * 20]); + int16x4_t v66 = vld1s_s16(&v5[istride * 12]); + int16x4_t v72 = vld1s_s16(&v5[istride * 28]); + int16x4_t v130 = vld1s_s16(&v5[istride * 2]); + int16x4_t v136 = vld1s_s16(&v5[istride * 18]); + int16x4_t v144 = vld1s_s16(&v5[istride * 10]); + int16x4_t v150 = vld1s_s16(&v5[istride * 26]); + int16x4_t v169 = vld1s_s16(&v5[istride * 6]); + int16x4_t v175 = vld1s_s16(&v5[istride * 22]); + int16x4_t v183 = vld1s_s16(&v5[istride * 14]); + int16x4_t v189 = vld1s_s16(&v5[istride * 30]); + float32x2_t v333 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v332)), 15); + int16x4_t v338 = vld1s_s16(&v5[istride * 17]); + int16x4_t v346 = vld1s_s16(&v5[istride * 9]); + int16x4_t v352 = vld1s_s16(&v5[istride * 25]); + int16x4_t v371 = vld1s_s16(&v5[istride * 5]); + int16x4_t v377 = vld1s_s16(&v5[istride * 
21]); + int16x4_t v385 = vld1s_s16(&v5[istride * 13]); + int16x4_t v391 = vld1s_s16(&v5[istride * 29]); + int16x4_t v449 = vld1s_s16(&v5[istride * 3]); + int16x4_t v455 = vld1s_s16(&v5[istride * 19]); + int16x4_t v463 = vld1s_s16(&v5[istride * 11]); + int16x4_t v469 = vld1s_s16(&v5[istride * 27]); + int16x4_t v488 = vld1s_s16(&v5[istride * 7]); + int16x4_t v494 = vld1s_s16(&v5[istride * 23]); + int16x4_t v502 = vld1s_s16(&v5[istride * 15]); + int16x4_t v508 = vld1s_s16(&v5[istride * 31]); + float32x2_t v668 = vmul_f32(v994, v666); + float32x2_t v729 = vmul_f32(v994, v727); + float32x2_t v801 = vmul_f32(v994, v799); + float32x2_t v851 = vmul_f32(v994, v849); + float32x2_t v862 = vmul_f32(v994, v860); + float32x2_t v912 = vmul_f32(v994, v910); + float32x2_t v923 = vmul_f32(v994, v921); + float32x2_t v973 = vmul_f32(v994, v971); + float32x2_t v984 = vmul_f32(v994, v982); + float32x2_t v995 = vmul_f32(v994, v993); + float32x2_t v20 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v19)), 15); + float32x2_t v28 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v27)), 15); + float32x2_t v34 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v33)), 15); + float32x2_t v53 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v52)), 15); + float32x2_t v59 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v58)), 15); + float32x2_t v67 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v66)), 15); + float32x2_t v73 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v72)), 15); + float32x2_t v131 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v130)), 15); + float32x2_t v137 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v136)), 15); + float32x2_t v145 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v144)), 15); + float32x2_t v151 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v150)), 15); + float32x2_t v170 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v169)), 15); + float32x2_t v176 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v175)), 15); + float32x2_t v184 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v183)), 15); + float32x2_t v190 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v189)), 15); + 
float32x2_t v339 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v338)), 15); + float32x2_t v347 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v346)), 15); + float32x2_t v353 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v352)), 15); + float32x2_t v372 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v371)), 15); + float32x2_t v378 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v377)), 15); + float32x2_t v386 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v385)), 15); + float32x2_t v392 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v391)), 15); + float32x2_t v450 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v449)), 15); + float32x2_t v456 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v455)), 15); + float32x2_t v464 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v463)), 15); + float32x2_t v470 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v469)), 15); + float32x2_t v489 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v488)), 15); + float32x2_t v495 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v494)), 15); + float32x2_t v503 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v502)), 15); + float32x2_t v509 = vcvt_n_f32_s32(vget_low_s32(vmovl_s16(v508)), 15); + float32x2_t v21 = vadd_f32(v14, v20); + float32x2_t v22 = vsub_f32(v14, v20); + float32x2_t v35 = vadd_f32(v28, v34); + float32x2_t v36 = vsub_f32(v28, v34); + float32x2_t v60 = vadd_f32(v53, v59); + float32x2_t v61 = vsub_f32(v53, v59); + float32x2_t v74 = vadd_f32(v67, v73); + float32x2_t v75 = vsub_f32(v67, v73); + float32x2_t v138 = vadd_f32(v131, v137); + float32x2_t v139 = vsub_f32(v131, v137); + float32x2_t v152 = vadd_f32(v145, v151); + float32x2_t v153 = vsub_f32(v145, v151); + float32x2_t v177 = vadd_f32(v170, v176); + float32x2_t v178 = vsub_f32(v170, v176); + float32x2_t v191 = vadd_f32(v184, v190); + float32x2_t v192 = vsub_f32(v184, v190); + float32x2_t v340 = vadd_f32(v333, v339); + float32x2_t v341 = vsub_f32(v333, v339); + float32x2_t v354 = vadd_f32(v347, v353); + float32x2_t v355 = vsub_f32(v347, v353); + float32x2_t v379 = vadd_f32(v372, v378); + float32x2_t v380 = vsub_f32(v372, v378); 
+ float32x2_t v393 = vadd_f32(v386, v392); + float32x2_t v394 = vsub_f32(v386, v392); + float32x2_t v457 = vadd_f32(v450, v456); + float32x2_t v458 = vsub_f32(v450, v456); + float32x2_t v471 = vadd_f32(v464, v470); + float32x2_t v472 = vsub_f32(v464, v470); + float32x2_t v496 = vadd_f32(v489, v495); + float32x2_t v497 = vsub_f32(v489, v495); + float32x2_t v510 = vadd_f32(v503, v509); + float32x2_t v511 = vsub_f32(v503, v509); + float32x2_t v42 = vrev64_f32(v36); + float32x2_t v44 = vadd_f32(v21, v35); + float32x2_t v45 = vsub_f32(v21, v35); + float32x2_t v76 = vadd_f32(v60, v74); + float32x2_t v77 = vsub_f32(v60, v74); + float32x2_t v92 = vmul_f32(v61, v784); + float32x2_t v103 = vmul_f32(v75, v795); + float32x2_t v159 = vrev64_f32(v153); + float32x2_t v161 = vadd_f32(v138, v152); + float32x2_t v162 = vsub_f32(v138, v152); + float32x2_t v198 = vrev64_f32(v192); + float32x2_t v200 = vadd_f32(v177, v191); + float32x2_t v201 = vsub_f32(v177, v191); + float32x2_t v361 = vrev64_f32(v355); + float32x2_t v363 = vadd_f32(v340, v354); + float32x2_t v364 = vsub_f32(v340, v354); + float32x2_t v395 = vadd_f32(v379, v393); + float32x2_t v396 = vsub_f32(v379, v393); + float32x2_t v411 = vmul_f32(v380, v784); + float32x2_t v422 = vmul_f32(v394, v795); + float32x2_t v478 = vrev64_f32(v472); + float32x2_t v480 = vadd_f32(v457, v471); + float32x2_t v481 = vsub_f32(v457, v471); + float32x2_t v512 = vadd_f32(v496, v510); + float32x2_t v513 = vsub_f32(v496, v510); + float32x2_t v528 = vmul_f32(v497, v784); + float32x2_t v539 = vmul_f32(v511, v795); + float32x2_t v43 = vmul_f32(v42, v801); + float32x2_t v83 = vrev64_f32(v77); + float32x2_t v85 = vadd_f32(v44, v76); + float32x2_t v86 = vsub_f32(v44, v76); + float32x2_t v98 = vrev64_f32(v92); + float32x2_t v109 = vrev64_f32(v103); + float32x2_t v160 = vmul_f32(v159, v801); + float32x2_t v199 = vmul_f32(v198, v801); + float32x2_t v204 = vadd_f32(v161, v200); + float32x2_t v205 = vsub_f32(v161, v200); + float32x2_t v257 = vmul_f32(v162, 
v784); + float32x2_t v268 = vmul_f32(v201, v795); + float32x2_t v362 = vmul_f32(v361, v801); + float32x2_t v402 = vrev64_f32(v396); + float32x2_t v404 = vadd_f32(v363, v395); + float32x2_t v405 = vsub_f32(v363, v395); + float32x2_t v417 = vrev64_f32(v411); + float32x2_t v428 = vrev64_f32(v422); + float32x2_t v479 = vmul_f32(v478, v801); + float32x2_t v519 = vrev64_f32(v513); + float32x2_t v521 = vadd_f32(v480, v512); + float32x2_t v522 = vsub_f32(v480, v512); + float32x2_t v534 = vrev64_f32(v528); + float32x2_t v545 = vrev64_f32(v539); + float32x2_t v46 = vsub_f32(v22, v43); + float32x2_t v47 = vadd_f32(v22, v43); + float32x2_t v84 = vmul_f32(v83, v801); + float32x2_t v99 = vmul_f32(v98, v995); + float32x2_t v110 = vmul_f32(v109, v801); + float32x2_t v163 = vsub_f32(v139, v160); + float32x2_t v164 = vadd_f32(v139, v160); + float32x2_t v202 = vsub_f32(v178, v199); + float32x2_t v203 = vadd_f32(v178, v199); + float32x2_t v211 = vrev64_f32(v205); + float32x2_t v213 = vadd_f32(v85, v204); + float32x2_t v214 = vsub_f32(v85, v204); + float32x2_t v263 = vrev64_f32(v257); + float32x2_t v274 = vrev64_f32(v268); + float32x2_t v365 = vsub_f32(v341, v362); + float32x2_t v366 = vadd_f32(v341, v362); + float32x2_t v403 = vmul_f32(v402, v801); + float32x2_t v418 = vmul_f32(v417, v995); + float32x2_t v429 = vmul_f32(v428, v801); + float32x2_t v482 = vsub_f32(v458, v479); + float32x2_t v483 = vadd_f32(v458, v479); + float32x2_t v520 = vmul_f32(v519, v801); + float32x2_t v535 = vmul_f32(v534, v995); + float32x2_t v546 = vmul_f32(v545, v801); + float32x2_t v562 = vadd_f32(v404, v521); + float32x2_t v563 = vsub_f32(v404, v521); + float32x2_t v785 = vmul_f32(v405, v784); + float32x2_t v796 = vmul_f32(v522, v795); + float32x2_t v87 = vsub_f32(v45, v84); + float32x2_t v88 = vadd_f32(v45, v84); + float32x2_t v111 = vadd_f32(v92, v99); + float32x2_t v112 = vadd_f32(v103, v110); + float32x2_t v212 = vmul_f32(v211, v801); + float32x2_t v220 = vmul_f32(v163, v662); + float32x2_t v226 = 
vrev64_f32(v163); + float32x2_t v231 = vmul_f32(v202, v906); + float32x2_t v237 = vrev64_f32(v202); + float32x2_t v264 = vmul_f32(v263, v995); + float32x2_t v275 = vmul_f32(v274, v801); + float32x2_t v294 = vmul_f32(v164, v906); + float32x2_t v300 = vrev64_f32(v164); + float32x2_t v305 = vmul_f32(v203, v917); + float32x2_t v311 = vrev64_f32(v203); + float32x2_t v406 = vsub_f32(v364, v403); + float32x2_t v407 = vadd_f32(v364, v403); + float32x2_t v430 = vadd_f32(v411, v418); + float32x2_t v431 = vadd_f32(v422, v429); + float32x2_t v523 = vsub_f32(v481, v520); + float32x2_t v524 = vadd_f32(v481, v520); + float32x2_t v547 = vadd_f32(v528, v535); + float32x2_t v548 = vadd_f32(v539, v546); + float32x2_t v569 = vrev64_f32(v563); + float32x2_t v571 = vadd_f32(v213, v562); + float32x2_t v572 = vsub_f32(v213, v562); + float32x2_t v791 = vrev64_f32(v785); + float32x2_t v802 = vrev64_f32(v796); + float32x2_t v113 = vadd_f32(v111, v112); + float32x2_t v114 = vsub_f32(v112, v111); + float32x2_t v215 = vsub_f32(v86, v212); + float32x2_t v216 = vadd_f32(v86, v212); + float32x2_t v276 = vadd_f32(v257, v264); + float32x2_t v277 = vadd_f32(v268, v275); + float32x2_t v432 = vadd_f32(v430, v431); + float32x2_t v433 = vsub_f32(v431, v430); + float32x2_t v549 = vadd_f32(v547, v548); + float32x2_t v550 = vsub_f32(v548, v547); + float32x2_t v570 = vmul_f32(v569, v801); + int16x4_t v577 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v571, 15), (int32x2_t){0, 0})); + int16x4_t v589 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v572, 15), (int32x2_t){0, 0})); + float32x2_t v663 = vmul_f32(v406, v662); + float32x2_t v669 = vrev64_f32(v406); + float32x2_t v674 = vmul_f32(v523, v906); + float32x2_t v680 = vrev64_f32(v523); + float32x2_t v792 = vmul_f32(v791, v995); + float32x2_t v803 = vmul_f32(v802, v801); + float32x2_t v907 = vmul_f32(v407, v906); + float32x2_t v913 = vrev64_f32(v407); + float32x2_t v918 = vmul_f32(v524, v917); + float32x2_t v924 = vrev64_f32(v524); + float32x2_t v120 = 
vrev64_f32(v114); + float32x2_t v122 = vadd_f32(v46, v113); + float32x2_t v123 = vsub_f32(v46, v113); + float32x2_t v239 = vfma_f32(v220, v226, v668); + float32x2_t v240 = vfma_f32(v231, v237, v912); + float32x2_t v278 = vadd_f32(v276, v277); + float32x2_t v279 = vsub_f32(v277, v276); + float32x2_t v313 = vfma_f32(v294, v300, v912); + float32x2_t v314 = vfma_f32(v305, v311, v923); + float32x2_t v439 = vrev64_f32(v433); + float32x2_t v441 = vadd_f32(v365, v432); + float32x2_t v442 = vsub_f32(v365, v432); + float32x2_t v556 = vrev64_f32(v550); + float32x2_t v558 = vadd_f32(v482, v549); + float32x2_t v559 = vsub_f32(v482, v549); + float32x2_t v573 = vsub_f32(v214, v570); + float32x2_t v574 = vadd_f32(v214, v570); + v6[0] = vget_lane_s32(vreinterpret_s32_s16(v577), 0); + v6[ostride * 16] = vget_lane_s32(vreinterpret_s32_s16(v589), 0); + float32x2_t v804 = vadd_f32(v785, v792); + float32x2_t v805 = vadd_f32(v796, v803); + float32x2_t v121 = vmul_f32(v120, v995); + float32x2_t v241 = vadd_f32(v239, v240); + float32x2_t v242 = vsub_f32(v240, v239); + float32x2_t v285 = vrev64_f32(v279); + float32x2_t v287 = vadd_f32(v87, v278); + float32x2_t v288 = vsub_f32(v87, v278); + float32x2_t v315 = vadd_f32(v313, v314); + float32x2_t v316 = vsub_f32(v314, v313); + float32x2_t v440 = vmul_f32(v439, v995); + float32x2_t v557 = vmul_f32(v556, v995); + int16x4_t v583 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v573, 15), (int32x2_t){0, 0})); + int16x4_t v595 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v574, 15), (int32x2_t){0, 0})); + float32x2_t v602 = vmul_f32(v441, v601); + float32x2_t v608 = vrev64_f32(v441); + float32x2_t v613 = vmul_f32(v558, v723); + float32x2_t v619 = vrev64_f32(v558); + float32x2_t v682 = vfma_f32(v663, v669, v668); + float32x2_t v683 = vfma_f32(v674, v680, v912); + float32x2_t v806 = vadd_f32(v804, v805); + float32x2_t v807 = vsub_f32(v805, v804); + float32x2_t v846 = vmul_f32(v442, v845); + float32x2_t v852 = vrev64_f32(v442); + float32x2_t v857 = 
vmul_f32(v559, v856); + float32x2_t v863 = vrev64_f32(v559); + float32x2_t v926 = vfma_f32(v907, v913, v912); + float32x2_t v927 = vfma_f32(v918, v924, v923); + float32x2_t v124 = vsub_f32(v47, v121); + float32x2_t v125 = vadd_f32(v47, v121); + float32x2_t v248 = vrev64_f32(v242); + float32x2_t v250 = vadd_f32(v122, v241); + float32x2_t v251 = vsub_f32(v122, v241); + float32x2_t v286 = vmul_f32(v285, v995); + float32x2_t v322 = vrev64_f32(v316); + float32x2_t v443 = vsub_f32(v366, v440); + float32x2_t v444 = vadd_f32(v366, v440); + float32x2_t v560 = vsub_f32(v483, v557); + float32x2_t v561 = vadd_f32(v483, v557); + v6[ostride * 8] = vget_lane_s32(vreinterpret_s32_s16(v583), 0); + v6[ostride * 24] = vget_lane_s32(vreinterpret_s32_s16(v595), 0); + float32x2_t v684 = vadd_f32(v682, v683); + float32x2_t v685 = vsub_f32(v683, v682); + float32x2_t v813 = vrev64_f32(v807); + float32x2_t v815 = vadd_f32(v215, v806); + float32x2_t v816 = vsub_f32(v215, v806); + float32x2_t v928 = vadd_f32(v926, v927); + float32x2_t v929 = vsub_f32(v927, v926); + float32x2_t v249 = vmul_f32(v248, v995); + float32x2_t v289 = vsub_f32(v88, v286); + float32x2_t v290 = vadd_f32(v88, v286); + float32x2_t v323 = vmul_f32(v322, v995); + float32x2_t v324 = vadd_f32(v124, v315); + float32x2_t v325 = vsub_f32(v124, v315); + float32x2_t v621 = vfma_f32(v602, v608, v862); + float32x2_t v622 = vfma_f32(v613, v619, v729); + float32x2_t v691 = vrev64_f32(v685); + float32x2_t v693 = vadd_f32(v287, v684); + float32x2_t v694 = vsub_f32(v287, v684); + float32x2_t v724 = vmul_f32(v443, v723); + float32x2_t v730 = vrev64_f32(v443); + float32x2_t v735 = vmul_f32(v560, v734); + float32x2_t v741 = vrev64_f32(v560); + float32x2_t v814 = vmul_f32(v813, v995); + int16x4_t v821 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v815, 15), (int32x2_t){0, 0})); + int16x4_t v833 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v816, 15), (int32x2_t){0, 0})); + float32x2_t v865 = vfma_f32(v846, v852, v851); + float32x2_t v866 = 
vfma_f32(v857, v863, v862); + float32x2_t v935 = vrev64_f32(v929); + float32x2_t v968 = vmul_f32(v444, v967); + float32x2_t v974 = vrev64_f32(v444); + float32x2_t v979 = vmul_f32(v561, v978); + float32x2_t v985 = vrev64_f32(v561); + float32x2_t v252 = vsub_f32(v123, v249); + float32x2_t v253 = vadd_f32(v123, v249); + float32x2_t v326 = vsub_f32(v125, v323); + float32x2_t v327 = vadd_f32(v125, v323); + float32x2_t v623 = vadd_f32(v621, v622); + float32x2_t v624 = vsub_f32(v622, v621); + float32x2_t v692 = vmul_f32(v691, v995); + int16x4_t v699 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v693, 15), (int32x2_t){0, 0})); + int16x4_t v711 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v694, 15), (int32x2_t){0, 0})); + float32x2_t v817 = vsub_f32(v216, v814); + float32x2_t v818 = vadd_f32(v216, v814); + v6[ostride * 4] = vget_lane_s32(vreinterpret_s32_s16(v821), 0); + v6[ostride * 20] = vget_lane_s32(vreinterpret_s32_s16(v833), 0); + float32x2_t v867 = vadd_f32(v865, v866); + float32x2_t v868 = vsub_f32(v866, v865); + float32x2_t v936 = vmul_f32(v935, v995); + float32x2_t v937 = vadd_f32(v289, v928); + float32x2_t v938 = vsub_f32(v289, v928); + float32x2_t v630 = vrev64_f32(v624); + float32x2_t v632 = vadd_f32(v250, v623); + float32x2_t v633 = vsub_f32(v250, v623); + float32x2_t v695 = vsub_f32(v288, v692); + float32x2_t v696 = vadd_f32(v288, v692); + v6[ostride * 2] = vget_lane_s32(vreinterpret_s32_s16(v699), 0); + v6[ostride * 18] = vget_lane_s32(vreinterpret_s32_s16(v711), 0); + float32x2_t v743 = vfma_f32(v724, v730, v729); + float32x2_t v744 = vfma_f32(v735, v741, v973); + int16x4_t v827 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v817, 15), (int32x2_t){0, 0})); + int16x4_t v839 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v818, 15), (int32x2_t){0, 0})); + float32x2_t v874 = vrev64_f32(v868); + float32x2_t v876 = vadd_f32(v252, v867); + float32x2_t v877 = vsub_f32(v252, v867); + float32x2_t v939 = vsub_f32(v290, v936); + float32x2_t v940 = vadd_f32(v290, v936); + int16x4_t 
v943 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v937, 15), (int32x2_t){0, 0})); + int16x4_t v955 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v938, 15), (int32x2_t){0, 0})); + float32x2_t v987 = vfma_f32(v968, v974, v973); + float32x2_t v988 = vfma_f32(v979, v985, v984); + float32x2_t v631 = vmul_f32(v630, v995); + int16x4_t v638 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v632, 15), (int32x2_t){0, 0})); + int16x4_t v650 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v633, 15), (int32x2_t){0, 0})); + int16x4_t v705 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v695, 15), (int32x2_t){0, 0})); + int16x4_t v717 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v696, 15), (int32x2_t){0, 0})); + float32x2_t v745 = vadd_f32(v743, v744); + float32x2_t v746 = vsub_f32(v744, v743); + v6[ostride * 12] = vget_lane_s32(vreinterpret_s32_s16(v827), 0); + v6[ostride * 28] = vget_lane_s32(vreinterpret_s32_s16(v839), 0); + float32x2_t v875 = vmul_f32(v874, v995); + int16x4_t v882 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v876, 15), (int32x2_t){0, 0})); + int16x4_t v894 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v877, 15), (int32x2_t){0, 0})); + v6[ostride * 6] = vget_lane_s32(vreinterpret_s32_s16(v943), 0); + int16x4_t v949 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v939, 15), (int32x2_t){0, 0})); + v6[ostride * 22] = vget_lane_s32(vreinterpret_s32_s16(v955), 0); + int16x4_t v961 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v940, 15), (int32x2_t){0, 0})); + float32x2_t v989 = vadd_f32(v987, v988); + float32x2_t v990 = vsub_f32(v988, v987); + float32x2_t v634 = vsub_f32(v251, v631); + float32x2_t v635 = vadd_f32(v251, v631); + v6[ostride] = vget_lane_s32(vreinterpret_s32_s16(v638), 0); + v6[ostride * 17] = vget_lane_s32(vreinterpret_s32_s16(v650), 0); + v6[ostride * 10] = vget_lane_s32(vreinterpret_s32_s16(v705), 0); + v6[ostride * 26] = vget_lane_s32(vreinterpret_s32_s16(v717), 0); + float32x2_t v752 = vrev64_f32(v746); + float32x2_t v754 = vadd_f32(v324, v745); + float32x2_t v755 = vsub_f32(v324, 
v745); + float32x2_t v878 = vsub_f32(v253, v875); + float32x2_t v879 = vadd_f32(v253, v875); + v6[ostride * 5] = vget_lane_s32(vreinterpret_s32_s16(v882), 0); + v6[ostride * 21] = vget_lane_s32(vreinterpret_s32_s16(v894), 0); + v6[ostride * 14] = vget_lane_s32(vreinterpret_s32_s16(v949), 0); + v6[ostride * 30] = vget_lane_s32(vreinterpret_s32_s16(v961), 0); + float32x2_t v996 = vrev64_f32(v990); + float32x2_t v998 = vadd_f32(v326, v989); + float32x2_t v999 = vsub_f32(v326, v989); + int16x4_t v644 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v634, 15), (int32x2_t){0, 0})); + int16x4_t v656 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v635, 15), (int32x2_t){0, 0})); + float32x2_t v753 = vmul_f32(v752, v995); + int16x4_t v760 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v754, 15), (int32x2_t){0, 0})); + int16x4_t v772 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v755, 15), (int32x2_t){0, 0})); + int16x4_t v888 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v878, 15), (int32x2_t){0, 0})); + int16x4_t v900 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v879, 15), (int32x2_t){0, 0})); + float32x2_t v997 = vmul_f32(v996, v995); + int16x4_t v1004 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v998, 15), (int32x2_t){0, 0})); + int16x4_t v1016 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v999, 15), (int32x2_t){0, 0})); + v6[ostride * 9] = vget_lane_s32(vreinterpret_s32_s16(v644), 0); + v6[ostride * 25] = vget_lane_s32(vreinterpret_s32_s16(v656), 0); + float32x2_t v756 = vsub_f32(v325, v753); + float32x2_t v757 = vadd_f32(v325, v753); + v6[ostride * 3] = vget_lane_s32(vreinterpret_s32_s16(v760), 0); + v6[ostride * 19] = vget_lane_s32(vreinterpret_s32_s16(v772), 0); + v6[ostride * 13] = vget_lane_s32(vreinterpret_s32_s16(v888), 0); + v6[ostride * 29] = vget_lane_s32(vreinterpret_s32_s16(v900), 0); + float32x2_t v1000 = vsub_f32(v327, v997); + float32x2_t v1001 = vadd_f32(v327, v997); + v6[ostride * 7] = vget_lane_s32(vreinterpret_s32_s16(v1004), 0); + v6[ostride * 23] = 
vget_lane_s32(vreinterpret_s32_s16(v1016), 0); + int16x4_t v766 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v756, 15), (int32x2_t){0, 0})); + int16x4_t v778 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v757, 15), (int32x2_t){0, 0})); + int16x4_t v1010 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1000, 15), (int32x2_t){0, 0})); + int16x4_t v1022 = + vqmovn_s32(vcombine_s32(vcvt_n_s32_f32(v1001, 15), (int32x2_t){0, 0})); + v6[ostride * 11] = vget_lane_s32(vreinterpret_s32_s16(v766), 0); + v6[ostride * 27] = vget_lane_s32(vreinterpret_s32_s16(v778), 0); + v6[ostride * 15] = vget_lane_s32(vreinterpret_s32_s16(v1010), 0); + v6[ostride * 31] = vget_lane_s32(vreinterpret_s32_s16(v1022), 0); +} +#endif + +#ifdef ARMRAL_ARCH_SVE +void armral_fft_cs16_cf32_cs16_ac_n_uun32( + const armral_cmplx_int16_t *restrict x, armral_cmplx_int16_t *restrict y, + int istride, int ostride, int howmany, float dir) { + int64_t v0 = istride; + int64_t v2 = ostride; + float v4 = dir; + const int32_t *v5 = (const int32_t *)x; + int32_t *v6 = (int32_t *)y; + svbool_t pred_full = svptrue_pat_b32(SV_VL2); + float v843 = -1.9509032201612819e-01F; + float v902 = 7.0710678118654757e-01F; + float v914 = -7.0710678118654746e-01F; + float v919 = -1.0000000000000000e+00F; + float v973 = 5.5557023301960229e-01F; + float v978 = 8.3146961230254524e-01F; + float v985 = -9.8078528040323043e-01F; + float v1044 = 3.8268343236508984e-01F; + float v1049 = 9.2387953251128674e-01F; + float v1056 = -9.2387953251128685e-01F; + float v1061 = -3.8268343236508967e-01F; + float v1115 = 1.9509032201612833e-01F; + float v1120 = 9.8078528040323043e-01F; + float v1127 = -5.5557023301960218e-01F; + float v1132 = -8.3146961230254524e-01F; + const int32_t *v1361 = &v5[v0]; + int32_t *v1562 = &v6[v2]; + int64_t v23 = v0 * 16; + int64_t v33 = v0 * 8; + int64_t v41 = v0 * 24; + int64_t v62 = v0 * 4; + int64_t v70 = v0 * 20; + int64_t v80 = v0 * 12; + int64_t v88 = v0 * 28; + int64_t v150 = v0 * 2; + int64_t v158 = v0 * 18; + int64_t 
v168 = v0 * 10; + int64_t v176 = v0 * 26; + int64_t v197 = v0 * 6; + int64_t v205 = v0 * 22; + int64_t v215 = v0 * 14; + int64_t v223 = v0 * 30; + int64_t v382 = v0 * 17; + int64_t v392 = v0 * 9; + int64_t v400 = v0 * 25; + int64_t v421 = v0 * 5; + int64_t v429 = v0 * 21; + int64_t v439 = v0 * 13; + int64_t v447 = v0 * 29; + int64_t v509 = v0 * 3; + int64_t v517 = v0 * 19; + int64_t v527 = v0 * 11; + int64_t v535 = v0 * 27; + int64_t v556 = v0 * 7; + int64_t v564 = v0 * 23; + int64_t v574 = v0 * 15; + int64_t v582 = v0 * 31; + int64_t v665 = v2 * 8; + int64_t v673 = v2 * 16; + int64_t v681 = v2 * 24; + int64_t v736 = v2 * 9; + int64_t v744 = v2 * 17; + int64_t v752 = v2 * 25; + float v768 = v4 * v1044; + int64_t v799 = v2 * 2; + int64_t v807 = v2 * 10; + int64_t v815 = v2 * 18; + int64_t v823 = v2 * 26; + float v839 = v4 * v973; + int64_t v870 = v2 * 3; + int64_t v878 = v2 * 11; + int64_t v886 = v2 * 19; + int64_t v894 = v2 * 27; + float v922 = v4 * v919; + int64_t v941 = v2 * 4; + int64_t v949 = v2 * 12; + int64_t v957 = v2 * 20; + int64_t v965 = v2 * 28; + float v981 = v4 * v978; + float v993 = v4 * v1115; + int64_t v1012 = v2 * 5; + int64_t v1020 = v2 * 13; + int64_t v1028 = v2 * 21; + int64_t v1036 = v2 * 29; + float v1052 = v4 * v1049; + float v1064 = v4 * v1061; + int64_t v1083 = v2 * 6; + int64_t v1091 = v2 * 14; + int64_t v1099 = v2 * 22; + int64_t v1107 = v2 * 30; + float v1123 = v4 * v1120; + float v1135 = v4 * v1132; + int64_t v1154 = v2 * 7; + int64_t v1162 = v2 * 15; + int64_t v1170 = v2 * 23; + int64_t v1178 = v2 * 31; + const int32_t *v1192 = &v5[0]; + int32_t *v1521 = &v6[0]; + svfloat32_t v1551 = svdup_n_f32(v1120); + svfloat32_t v1592 = svdup_n_f32(v1049); + svfloat32_t v1633 = svdup_n_f32(v978); + svfloat32_t v1635 = svdup_n_f32(v843); + svfloat32_t v1674 = svdup_n_f32(v902); + svfloat32_t v1676 = svdup_n_f32(v914); + svfloat32_t v1715 = svdup_n_f32(v973); + svfloat32_t v1717 = svdup_n_f32(v985); + svfloat32_t v1756 = svdup_n_f32(v1044); + 
svfloat32_t v1758 = svdup_n_f32(v1056); + svfloat32_t v1797 = svdup_n_f32(v1115); + svfloat32_t v1799 = svdup_n_f32(v1127); + svfloat32_t v1801 = svdup_n_f32(v4); + svfloat32_t v380 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1361[0])), + 1.F / (1ULL << 15ULL)); + const int32_t *v1201 = &v5[v23]; + const int32_t *v1210 = &v5[v33]; + const int32_t *v1219 = &v5[v41]; + const int32_t *v1229 = &v5[v62]; + const int32_t *v1238 = &v5[v70]; + const int32_t *v1247 = &v5[v80]; + const int32_t *v1256 = &v5[v88]; + const int32_t *v1271 = &v5[v150]; + const int32_t *v1280 = &v5[v158]; + const int32_t *v1289 = &v5[v168]; + const int32_t *v1298 = &v5[v176]; + const int32_t *v1308 = &v5[v197]; + const int32_t *v1317 = &v5[v205]; + const int32_t *v1326 = &v5[v215]; + const int32_t *v1335 = &v5[v223]; + const int32_t *v1370 = &v5[v382]; + const int32_t *v1379 = &v5[v392]; + const int32_t *v1388 = &v5[v400]; + const int32_t *v1398 = &v5[v421]; + const int32_t *v1407 = &v5[v429]; + const int32_t *v1416 = &v5[v439]; + const int32_t *v1425 = &v5[v447]; + const int32_t *v1440 = &v5[v509]; + const int32_t *v1449 = &v5[v517]; + const int32_t *v1458 = &v5[v527]; + const int32_t *v1467 = &v5[v535]; + const int32_t *v1477 = &v5[v556]; + const int32_t *v1486 = &v5[v564]; + const int32_t *v1495 = &v5[v574]; + const int32_t *v1504 = &v5[v582]; + int32_t *v1530 = &v6[v665]; + int32_t *v1539 = &v6[v673]; + int32_t *v1548 = &v6[v681]; + int32_t *v1571 = &v6[v736]; + int32_t *v1580 = &v6[v744]; + int32_t *v1589 = &v6[v752]; + svfloat32_t v1593 = svdup_n_f32(v768); + int32_t *v1603 = &v6[v799]; + int32_t *v1612 = &v6[v807]; + int32_t *v1621 = &v6[v815]; + int32_t *v1630 = &v6[v823]; + svfloat32_t v1634 = svdup_n_f32(v839); + int32_t *v1644 = &v6[v870]; + int32_t *v1653 = &v6[v878]; + int32_t *v1662 = &v6[v886]; + int32_t *v1671 = &v6[v894]; + svfloat32_t v1677 = svdup_n_f32(v922); + int32_t *v1685 = &v6[v941]; + int32_t *v1694 = &v6[v949]; + 
int32_t *v1703 = &v6[v957]; + int32_t *v1712 = &v6[v965]; + svfloat32_t v1716 = svdup_n_f32(v981); + svfloat32_t v1718 = svdup_n_f32(v993); + int32_t *v1726 = &v6[v1012]; + int32_t *v1735 = &v6[v1020]; + int32_t *v1744 = &v6[v1028]; + int32_t *v1753 = &v6[v1036]; + svfloat32_t v1757 = svdup_n_f32(v1052); + svfloat32_t v1759 = svdup_n_f32(v1064); + int32_t *v1767 = &v6[v1083]; + int32_t *v1776 = &v6[v1091]; + int32_t *v1785 = &v6[v1099]; + int32_t *v1794 = &v6[v1107]; + svfloat32_t v1798 = svdup_n_f32(v1123); + svfloat32_t v1800 = svdup_n_f32(v1135); + int32_t *v1808 = &v6[v1154]; + int32_t *v1817 = &v6[v1162]; + int32_t *v1826 = &v6[v1170]; + int32_t *v1835 = &v6[v1178]; + svfloat32_t v21 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1192[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v29 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1201[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v39 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1210[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v47 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1219[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v68 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1229[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v76 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1238[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v86 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1247[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v94 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1256[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v156 = svmul_n_f32_x( + pred_full, + 
svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1271[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v164 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1280[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v174 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1289[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v182 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1298[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v203 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1308[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v211 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1317[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v221 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1326[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v229 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1335[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v388 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1370[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v398 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1379[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v406 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1388[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v427 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1398[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v435 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1407[0])), + 1.F / 
(1ULL << 15ULL)); + svfloat32_t v445 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1416[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v453 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1425[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v515 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1440[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v523 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1449[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v533 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1458[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v541 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1467[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v562 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1477[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v570 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1486[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v580 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1495[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v588 = svmul_n_f32_x( + pred_full, + svcvt_f32_s32_x(pred_full, + svld1sh_s32(pred_full, (const int16_t *)&v1504[0])), + 1.F / (1ULL << 15ULL)); + svfloat32_t v30 = svadd_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v31 = svsub_f32_x(svptrue_b32(), v21, v29); + svfloat32_t v48 = svadd_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v49 = svsub_f32_x(svptrue_b32(), v39, v47); + svfloat32_t v77 = svadd_f32_x(svptrue_b32(), v68, v76); + svfloat32_t v78 = svsub_f32_x(svptrue_b32(), v68, v76); + svfloat32_t v95 = 
svadd_f32_x(svptrue_b32(), v86, v94); + svfloat32_t v96 = svsub_f32_x(svptrue_b32(), v86, v94); + svfloat32_t v165 = svadd_f32_x(svptrue_b32(), v156, v164); + svfloat32_t v166 = svsub_f32_x(svptrue_b32(), v156, v164); + svfloat32_t v183 = svadd_f32_x(svptrue_b32(), v174, v182); + svfloat32_t v184 = svsub_f32_x(svptrue_b32(), v174, v182); + svfloat32_t v212 = svadd_f32_x(svptrue_b32(), v203, v211); + svfloat32_t v213 = svsub_f32_x(svptrue_b32(), v203, v211); + svfloat32_t v230 = svadd_f32_x(svptrue_b32(), v221, v229); + svfloat32_t v231 = svsub_f32_x(svptrue_b32(), v221, v229); + svfloat32_t v389 = svadd_f32_x(svptrue_b32(), v380, v388); + svfloat32_t v390 = svsub_f32_x(svptrue_b32(), v380, v388); + svfloat32_t v407 = svadd_f32_x(svptrue_b32(), v398, v406); + svfloat32_t v408 = svsub_f32_x(svptrue_b32(), v398, v406); + svfloat32_t v436 = svadd_f32_x(svptrue_b32(), v427, v435); + svfloat32_t v437 = svsub_f32_x(svptrue_b32(), v427, v435); + svfloat32_t v454 = svadd_f32_x(svptrue_b32(), v445, v453); + svfloat32_t v455 = svsub_f32_x(svptrue_b32(), v445, v453); + svfloat32_t v524 = svadd_f32_x(svptrue_b32(), v515, v523); + svfloat32_t v525 = svsub_f32_x(svptrue_b32(), v515, v523); + svfloat32_t v542 = svadd_f32_x(svptrue_b32(), v533, v541); + svfloat32_t v543 = svsub_f32_x(svptrue_b32(), v533, v541); + svfloat32_t v571 = svadd_f32_x(svptrue_b32(), v562, v570); + svfloat32_t v572 = svsub_f32_x(svptrue_b32(), v562, v570); + svfloat32_t v589 = svadd_f32_x(svptrue_b32(), v580, v588); + svfloat32_t v590 = svsub_f32_x(svptrue_b32(), v580, v588); + svfloat32_t zero56 = svdup_n_f32(0); + svfloat32_t v56 = svcmla_f32_x(pred_full, zero56, v1677, v49, 90); + svfloat32_t v57 = svadd_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v58 = svsub_f32_x(svptrue_b32(), v30, v48); + svfloat32_t v97 = svadd_f32_x(svptrue_b32(), v77, v95); + svfloat32_t v98 = svsub_f32_x(svptrue_b32(), v77, v95); + svfloat32_t v114 = svmul_f32_x(svptrue_b32(), v78, v1674); + svfloat32_t v126 = 
svmul_f32_x(svptrue_b32(), v96, v1676); + svfloat32_t zero191 = svdup_n_f32(0); + svfloat32_t v191 = svcmla_f32_x(pred_full, zero191, v1677, v184, 90); + svfloat32_t v192 = svadd_f32_x(svptrue_b32(), v165, v183); + svfloat32_t v193 = svsub_f32_x(svptrue_b32(), v165, v183); + svfloat32_t zero238 = svdup_n_f32(0); + svfloat32_t v238 = svcmla_f32_x(pred_full, zero238, v1677, v231, 90); + svfloat32_t v239 = svadd_f32_x(svptrue_b32(), v212, v230); + svfloat32_t v240 = svsub_f32_x(svptrue_b32(), v212, v230); + svfloat32_t zero415 = svdup_n_f32(0); + svfloat32_t v415 = svcmla_f32_x(pred_full, zero415, v1677, v408, 90); + svfloat32_t v416 = svadd_f32_x(svptrue_b32(), v389, v407); + svfloat32_t v417 = svsub_f32_x(svptrue_b32(), v389, v407); + svfloat32_t v456 = svadd_f32_x(svptrue_b32(), v436, v454); + svfloat32_t v457 = svsub_f32_x(svptrue_b32(), v436, v454); + svfloat32_t v473 = svmul_f32_x(svptrue_b32(), v437, v1674); + svfloat32_t v485 = svmul_f32_x(svptrue_b32(), v455, v1676); + svfloat32_t zero550 = svdup_n_f32(0); + svfloat32_t v550 = svcmla_f32_x(pred_full, zero550, v1677, v543, 90); + svfloat32_t v551 = svadd_f32_x(svptrue_b32(), v524, v542); + svfloat32_t v552 = svsub_f32_x(svptrue_b32(), v524, v542); + svfloat32_t v591 = svadd_f32_x(svptrue_b32(), v571, v589); + svfloat32_t v592 = svsub_f32_x(svptrue_b32(), v571, v589); + svfloat32_t v608 = svmul_f32_x(svptrue_b32(), v572, v1674); + svfloat32_t v620 = svmul_f32_x(svptrue_b32(), v590, v1676); + svfloat32_t v59 = svsub_f32_x(svptrue_b32(), v31, v56); + svfloat32_t v60 = svadd_f32_x(svptrue_b32(), v31, v56); + svfloat32_t zero105 = svdup_n_f32(0); + svfloat32_t v105 = svcmla_f32_x(pred_full, zero105, v1677, v98, 90); + svfloat32_t v106 = svadd_f32_x(svptrue_b32(), v57, v97); + svfloat32_t v107 = svsub_f32_x(svptrue_b32(), v57, v97); + svfloat32_t v194 = svsub_f32_x(svptrue_b32(), v166, v191); + svfloat32_t v195 = svadd_f32_x(svptrue_b32(), v166, v191); + svfloat32_t v241 = svsub_f32_x(svptrue_b32(), v213, v238); + 
svfloat32_t v242 = svadd_f32_x(svptrue_b32(), v213, v238); + svfloat32_t v243 = svadd_f32_x(svptrue_b32(), v192, v239); + svfloat32_t v244 = svsub_f32_x(svptrue_b32(), v192, v239); + svfloat32_t v299 = svmul_f32_x(svptrue_b32(), v193, v1674); + svfloat32_t v311 = svmul_f32_x(svptrue_b32(), v240, v1676); + svfloat32_t v418 = svsub_f32_x(svptrue_b32(), v390, v415); + svfloat32_t v419 = svadd_f32_x(svptrue_b32(), v390, v415); + svfloat32_t zero464 = svdup_n_f32(0); + svfloat32_t v464 = svcmla_f32_x(pred_full, zero464, v1677, v457, 90); + svfloat32_t v465 = svadd_f32_x(svptrue_b32(), v416, v456); + svfloat32_t v466 = svsub_f32_x(svptrue_b32(), v416, v456); + svfloat32_t v553 = svsub_f32_x(svptrue_b32(), v525, v550); + svfloat32_t v554 = svadd_f32_x(svptrue_b32(), v525, v550); + svfloat32_t zero599 = svdup_n_f32(0); + svfloat32_t v599 = svcmla_f32_x(pred_full, zero599, v1677, v592, 90); + svfloat32_t v600 = svadd_f32_x(svptrue_b32(), v551, v591); + svfloat32_t v601 = svsub_f32_x(svptrue_b32(), v551, v591); + svfloat32_t v108 = svsub_f32_x(svptrue_b32(), v58, v105); + svfloat32_t v109 = svadd_f32_x(svptrue_b32(), v58, v105); + svfloat32_t v134 = svcmla_f32_x(pred_full, v114, v1801, v114, 90); + svfloat32_t v135 = svcmla_f32_x(pred_full, v126, v1677, v126, 90); + svfloat32_t zero251 = svdup_n_f32(0); + svfloat32_t v251 = svcmla_f32_x(pred_full, zero251, v1677, v244, 90); + svfloat32_t v252 = svadd_f32_x(svptrue_b32(), v106, v243); + svfloat32_t v253 = svsub_f32_x(svptrue_b32(), v106, v243); + svfloat32_t v260 = svmul_f32_x(svptrue_b32(), v194, v1592); + svfloat32_t v272 = svmul_f32_x(svptrue_b32(), v241, v1756); + svfloat32_t v338 = svmul_f32_x(svptrue_b32(), v195, v1756); + svfloat32_t v350 = svmul_f32_x(svptrue_b32(), v242, v1758); + svfloat32_t v467 = svsub_f32_x(svptrue_b32(), v417, v464); + svfloat32_t v468 = svadd_f32_x(svptrue_b32(), v417, v464); + svfloat32_t v493 = svcmla_f32_x(pred_full, v473, v1801, v473, 90); + svfloat32_t v494 = svcmla_f32_x(pred_full, v485, 
v1677, v485, 90); + svfloat32_t v602 = svsub_f32_x(svptrue_b32(), v552, v599); + svfloat32_t v603 = svadd_f32_x(svptrue_b32(), v552, v599); + svfloat32_t v628 = svcmla_f32_x(pred_full, v608, v1801, v608, 90); + svfloat32_t v629 = svcmla_f32_x(pred_full, v620, v1677, v620, 90); + svfloat32_t v643 = svadd_f32_x(svptrue_b32(), v465, v600); + svfloat32_t v644 = svsub_f32_x(svptrue_b32(), v465, v600); + svfloat32_t v905 = svmul_f32_x(svptrue_b32(), v466, v1674); + svfloat32_t v917 = svmul_f32_x(svptrue_b32(), v601, v1676); + svfloat32_t v136 = svadd_f32_x(svptrue_b32(), v134, v135); + svfloat32_t v137 = svsub_f32_x(svptrue_b32(), v135, v134); + svfloat32_t v254 = svsub_f32_x(svptrue_b32(), v107, v251); + svfloat32_t v255 = svadd_f32_x(svptrue_b32(), v107, v251); + svfloat32_t v280 = svcmla_f32_x(pred_full, v260, v1593, v194, 90); + svfloat32_t v281 = svcmla_f32_x(pred_full, v272, v1757, v241, 90); + svfloat32_t v319 = svcmla_f32_x(pred_full, v299, v1801, v299, 90); + svfloat32_t v320 = svcmla_f32_x(pred_full, v311, v1677, v311, 90); + svfloat32_t v358 = svcmla_f32_x(pred_full, v338, v1757, v195, 90); + svfloat32_t v359 = svcmla_f32_x(pred_full, v350, v1759, v242, 90); + svfloat32_t v495 = svadd_f32_x(svptrue_b32(), v493, v494); + svfloat32_t v496 = svsub_f32_x(svptrue_b32(), v494, v493); + svfloat32_t v630 = svadd_f32_x(svptrue_b32(), v628, v629); + svfloat32_t v631 = svsub_f32_x(svptrue_b32(), v629, v628); + svfloat32_t zero651 = svdup_n_f32(0); + svfloat32_t v651 = svcmla_f32_x(pred_full, zero651, v1677, v644, 90); + svfloat32_t v652 = svadd_f32_x(svptrue_b32(), v252, v643); + svfloat32_t v653 = svsub_f32_x(svptrue_b32(), v252, v643); + svfloat32_t v763 = svmul_f32_x(svptrue_b32(), v467, v1592); + svfloat32_t v775 = svmul_f32_x(svptrue_b32(), v602, v1756); + svfloat32_t v1047 = svmul_f32_x(svptrue_b32(), v468, v1756); + svfloat32_t v1059 = svmul_f32_x(svptrue_b32(), v603, v1758); + svfloat32_t zero144 = svdup_n_f32(0); + svfloat32_t v144 = svcmla_f32_x(pred_full, 
zero144, v1801, v137, 90); + svfloat32_t v145 = svadd_f32_x(svptrue_b32(), v59, v136); + svfloat32_t v146 = svsub_f32_x(svptrue_b32(), v59, v136); + svfloat32_t v282 = svadd_f32_x(svptrue_b32(), v280, v281); + svfloat32_t v283 = svsub_f32_x(svptrue_b32(), v281, v280); + svfloat32_t v321 = svadd_f32_x(svptrue_b32(), v319, v320); + svfloat32_t v322 = svsub_f32_x(svptrue_b32(), v320, v319); + svfloat32_t v360 = svadd_f32_x(svptrue_b32(), v358, v359); + svfloat32_t v361 = svsub_f32_x(svptrue_b32(), v359, v358); + svfloat32_t zero503 = svdup_n_f32(0); + svfloat32_t v503 = svcmla_f32_x(pred_full, zero503, v1801, v496, 90); + svfloat32_t v504 = svadd_f32_x(svptrue_b32(), v418, v495); + svfloat32_t v505 = svsub_f32_x(svptrue_b32(), v418, v495); + svfloat32_t zero638 = svdup_n_f32(0); + svfloat32_t v638 = svcmla_f32_x(pred_full, zero638, v1801, v631, 90); + svfloat32_t v639 = svadd_f32_x(svptrue_b32(), v553, v630); + svfloat32_t v640 = svsub_f32_x(svptrue_b32(), v553, v630); + svfloat32_t v654 = svsub_f32_x(svptrue_b32(), v253, v651); + svfloat32_t v655 = svadd_f32_x(svptrue_b32(), v253, v651); + svint16_t v658 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v652, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v674 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v653, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v783 = svcmla_f32_x(pred_full, v763, v1593, v467, 90); + svfloat32_t v784 = svcmla_f32_x(pred_full, v775, v1757, v602, 90); + svfloat32_t v925 = svcmla_f32_x(pred_full, v905, v1801, v905, 90); + svfloat32_t v926 = svcmla_f32_x(pred_full, v917, v1677, v917, 90); + svfloat32_t v1067 = svcmla_f32_x(pred_full, v1047, v1757, v468, 90); + svfloat32_t v1068 = svcmla_f32_x(pred_full, v1059, v1759, v603, 90); + svfloat32_t v147 
= svsub_f32_x(svptrue_b32(), v60, v144); + svfloat32_t v148 = svadd_f32_x(svptrue_b32(), v60, v144); + svfloat32_t zero290 = svdup_n_f32(0); + svfloat32_t v290 = svcmla_f32_x(pred_full, zero290, v1801, v283, 90); + svfloat32_t v291 = svadd_f32_x(svptrue_b32(), v145, v282); + svfloat32_t v292 = svsub_f32_x(svptrue_b32(), v145, v282); + svfloat32_t zero329 = svdup_n_f32(0); + svfloat32_t v329 = svcmla_f32_x(pred_full, zero329, v1801, v322, 90); + svfloat32_t v330 = svadd_f32_x(svptrue_b32(), v108, v321); + svfloat32_t v331 = svsub_f32_x(svptrue_b32(), v108, v321); + svfloat32_t zero368 = svdup_n_f32(0); + svfloat32_t v368 = svcmla_f32_x(pred_full, zero368, v1801, v361, 90); + svfloat32_t v506 = svsub_f32_x(svptrue_b32(), v419, v503); + svfloat32_t v507 = svadd_f32_x(svptrue_b32(), v419, v503); + svfloat32_t v641 = svsub_f32_x(svptrue_b32(), v554, v638); + svfloat32_t v642 = svadd_f32_x(svptrue_b32(), v554, v638); + svint16_t v666 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v654, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v682 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v655, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v692 = svmul_f32_x(svptrue_b32(), v504, v1551); + svfloat32_t v704 = svmul_f32_x(svptrue_b32(), v639, v1633); + svfloat32_t v785 = svadd_f32_x(svptrue_b32(), v783, v784); + svfloat32_t v786 = svsub_f32_x(svptrue_b32(), v784, v783); + svfloat32_t v927 = svadd_f32_x(svptrue_b32(), v925, v926); + svfloat32_t v928 = svsub_f32_x(svptrue_b32(), v926, v925); + svfloat32_t v976 = svmul_f32_x(svptrue_b32(), v505, v1715); + svfloat32_t v988 = svmul_f32_x(svptrue_b32(), v640, v1717); + svfloat32_t v1069 = svadd_f32_x(svptrue_b32(), v1067, v1068); + svfloat32_t v1070 = svsub_f32_x(svptrue_b32(), v1068, 
v1067); + svst1w_u64(pred_full, (unsigned *)(v1521), svreinterpret_u64_s16(v658)); + svst1w_u64(pred_full, (unsigned *)(v1539), svreinterpret_u64_s16(v674)); + svfloat32_t v293 = svsub_f32_x(svptrue_b32(), v146, v290); + svfloat32_t v294 = svadd_f32_x(svptrue_b32(), v146, v290); + svfloat32_t v332 = svsub_f32_x(svptrue_b32(), v109, v329); + svfloat32_t v333 = svadd_f32_x(svptrue_b32(), v109, v329); + svfloat32_t v369 = svadd_f32_x(svptrue_b32(), v147, v360); + svfloat32_t v370 = svsub_f32_x(svptrue_b32(), v147, v360); + svfloat32_t v371 = svsub_f32_x(svptrue_b32(), v148, v368); + svfloat32_t v372 = svadd_f32_x(svptrue_b32(), v148, v368); + svfloat32_t v712 = svcmla_f32_x(pred_full, v692, v1718, v504, 90); + svfloat32_t v713 = svcmla_f32_x(pred_full, v704, v1634, v639, 90); + svfloat32_t zero793 = svdup_n_f32(0); + svfloat32_t v793 = svcmla_f32_x(pred_full, zero793, v1801, v786, 90); + svfloat32_t v794 = svadd_f32_x(svptrue_b32(), v330, v785); + svfloat32_t v795 = svsub_f32_x(svptrue_b32(), v330, v785); + svfloat32_t v834 = svmul_f32_x(svptrue_b32(), v506, v1633); + svfloat32_t v846 = svmul_f32_x(svptrue_b32(), v641, v1635); + svfloat32_t zero935 = svdup_n_f32(0); + svfloat32_t v935 = svcmla_f32_x(pred_full, zero935, v1801, v928, 90); + svfloat32_t v936 = svadd_f32_x(svptrue_b32(), v254, v927); + svfloat32_t v937 = svsub_f32_x(svptrue_b32(), v254, v927); + svfloat32_t v996 = svcmla_f32_x(pred_full, v976, v1716, v505, 90); + svfloat32_t v997 = svcmla_f32_x(pred_full, v988, v1718, v640, 90); + svfloat32_t zero1077 = svdup_n_f32(0); + svfloat32_t v1077 = svcmla_f32_x(pred_full, zero1077, v1801, v1070, 90); + svfloat32_t v1118 = svmul_f32_x(svptrue_b32(), v507, v1797); + svfloat32_t v1130 = svmul_f32_x(svptrue_b32(), v642, v1799); + svst1w_u64(pred_full, (unsigned *)(v1530), svreinterpret_u64_s16(v666)); + svst1w_u64(pred_full, (unsigned *)(v1548), svreinterpret_u64_s16(v682)); + svfloat32_t v714 = svadd_f32_x(svptrue_b32(), v712, v713); + svfloat32_t v715 = 
svsub_f32_x(svptrue_b32(), v713, v712); + svfloat32_t v796 = svsub_f32_x(svptrue_b32(), v331, v793); + svfloat32_t v797 = svadd_f32_x(svptrue_b32(), v331, v793); + svint16_t v800 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v794, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v816 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v795, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v854 = svcmla_f32_x(pred_full, v834, v1634, v506, 90); + svfloat32_t v855 = svcmla_f32_x(pred_full, v846, v1798, v641, 90); + svfloat32_t v938 = svsub_f32_x(svptrue_b32(), v255, v935); + svfloat32_t v939 = svadd_f32_x(svptrue_b32(), v255, v935); + svint16_t v942 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v936, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v958 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v937, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v998 = svadd_f32_x(svptrue_b32(), v996, v997); + svfloat32_t v999 = svsub_f32_x(svptrue_b32(), v997, v996); + svfloat32_t v1078 = svadd_f32_x(svptrue_b32(), v332, v1069); + svfloat32_t v1079 = svsub_f32_x(svptrue_b32(), v332, v1069); + svfloat32_t v1080 = svsub_f32_x(svptrue_b32(), v333, v1077); + svfloat32_t v1081 = svadd_f32_x(svptrue_b32(), v333, v1077); + svfloat32_t v1138 = svcmla_f32_x(pred_full, v1118, v1798, v507, 90); + svfloat32_t v1139 = svcmla_f32_x(pred_full, v1130, v1800, v642, 90); + svfloat32_t zero722 = svdup_n_f32(0); + svfloat32_t v722 = svcmla_f32_x(pred_full, zero722, v1801, v715, 90); + svfloat32_t v723 = 
svadd_f32_x(svptrue_b32(), v291, v714); + svfloat32_t v724 = svsub_f32_x(svptrue_b32(), v291, v714); + svint16_t v808 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v796, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v824 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v797, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v856 = svadd_f32_x(svptrue_b32(), v854, v855); + svfloat32_t v857 = svsub_f32_x(svptrue_b32(), v855, v854); + svint16_t v950 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v938, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v966 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v939, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t zero1006 = svdup_n_f32(0); + svfloat32_t v1006 = svcmla_f32_x(pred_full, zero1006, v1801, v999, 90); + svfloat32_t v1007 = svadd_f32_x(svptrue_b32(), v293, v998); + svfloat32_t v1008 = svsub_f32_x(svptrue_b32(), v293, v998); + svint16_t v1084 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1078, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1092 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1080, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1100 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1079, (float)(1ULL << 31ULL)))), + 
svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1108 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1081, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1140 = svadd_f32_x(svptrue_b32(), v1138, v1139); + svfloat32_t v1141 = svsub_f32_x(svptrue_b32(), v1139, v1138); + svst1w_u64(pred_full, (unsigned *)(v1603), svreinterpret_u64_s16(v800)); + svst1w_u64(pred_full, (unsigned *)(v1621), svreinterpret_u64_s16(v816)); + svst1w_u64(pred_full, (unsigned *)(v1685), svreinterpret_u64_s16(v942)); + svst1w_u64(pred_full, (unsigned *)(v1703), svreinterpret_u64_s16(v958)); + svfloat32_t v725 = svsub_f32_x(svptrue_b32(), v292, v722); + svfloat32_t v726 = svadd_f32_x(svptrue_b32(), v292, v722); + svint16_t v729 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v723, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v745 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v724, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t zero864 = svdup_n_f32(0); + svfloat32_t v864 = svcmla_f32_x(pred_full, zero864, v1801, v857, 90); + svfloat32_t v865 = svadd_f32_x(svptrue_b32(), v369, v856); + svfloat32_t v866 = svsub_f32_x(svptrue_b32(), v369, v856); + svfloat32_t v1009 = svsub_f32_x(svptrue_b32(), v294, v1006); + svfloat32_t v1010 = svadd_f32_x(svptrue_b32(), v294, v1006); + svint16_t v1013 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1007, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1029 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + 
pred_full, svmul_n_f32_x(pred_full, v1008, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t zero1148 = svdup_n_f32(0); + svfloat32_t v1148 = svcmla_f32_x(pred_full, zero1148, v1801, v1141, 90); + svfloat32_t v1149 = svadd_f32_x(svptrue_b32(), v371, v1140); + svfloat32_t v1150 = svsub_f32_x(svptrue_b32(), v371, v1140); + svst1w_u64(pred_full, (unsigned *)(v1612), svreinterpret_u64_s16(v808)); + svst1w_u64(pred_full, (unsigned *)(v1630), svreinterpret_u64_s16(v824)); + svst1w_u64(pred_full, (unsigned *)(v1694), svreinterpret_u64_s16(v950)); + svst1w_u64(pred_full, (unsigned *)(v1712), svreinterpret_u64_s16(v966)); + svst1w_u64(pred_full, (unsigned *)(v1767), svreinterpret_u64_s16(v1084)); + svst1w_u64(pred_full, (unsigned *)(v1776), svreinterpret_u64_s16(v1092)); + svst1w_u64(pred_full, (unsigned *)(v1785), svreinterpret_u64_s16(v1100)); + svst1w_u64(pred_full, (unsigned *)(v1794), svreinterpret_u64_s16(v1108)); + svint16_t v737 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v725, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v753 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v726, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v867 = svsub_f32_x(svptrue_b32(), v370, v864); + svfloat32_t v868 = svadd_f32_x(svptrue_b32(), v370, v864); + svint16_t v871 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v865, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v887 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v866, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + 
svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1021 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1009, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1037 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1010, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svfloat32_t v1151 = svsub_f32_x(svptrue_b32(), v372, v1148); + svfloat32_t v1152 = svadd_f32_x(svptrue_b32(), v372, v1148); + svint16_t v1155 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1149, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1171 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1150, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1562), svreinterpret_u64_s16(v729)); + svst1w_u64(pred_full, (unsigned *)(v1580), svreinterpret_u64_s16(v745)); + svst1w_u64(pred_full, (unsigned *)(v1726), svreinterpret_u64_s16(v1013)); + svst1w_u64(pred_full, (unsigned *)(v1744), svreinterpret_u64_s16(v1029)); + svint16_t v879 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v867, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v895 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v868, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1163 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1151, 
(float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svint16_t v1179 = svtbl_s16( + svreinterpret_s16_s32(svcvt_s32_f32_x( + pred_full, svmul_n_f32_x(pred_full, v1152, (float)(1ULL << 31ULL)))), + svreinterpret_u16_u64( + svindex_u64(0xffffffff00030001ULL, 0x0000000000040004ULL))); + svst1w_u64(pred_full, (unsigned *)(v1571), svreinterpret_u64_s16(v737)); + svst1w_u64(pred_full, (unsigned *)(v1589), svreinterpret_u64_s16(v753)); + svst1w_u64(pred_full, (unsigned *)(v1644), svreinterpret_u64_s16(v871)); + svst1w_u64(pred_full, (unsigned *)(v1662), svreinterpret_u64_s16(v887)); + svst1w_u64(pred_full, (unsigned *)(v1735), svreinterpret_u64_s16(v1021)); + svst1w_u64(pred_full, (unsigned *)(v1753), svreinterpret_u64_s16(v1037)); + svst1w_u64(pred_full, (unsigned *)(v1808), svreinterpret_u64_s16(v1155)); + svst1w_u64(pred_full, (unsigned *)(v1826), svreinterpret_u64_s16(v1171)); + svst1w_u64(pred_full, (unsigned *)(v1653), svreinterpret_u64_s16(v879)); + svst1w_u64(pred_full, (unsigned *)(v1671), svreinterpret_u64_s16(v895)); + svst1w_u64(pred_full, (unsigned *)(v1817), svreinterpret_u64_s16(v1163)); + svst1w_u64(pred_full, (unsigned *)(v1835), svreinterpret_u64_s16(v1179)); +} +#endif diff --git a/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.h b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.h new file mode 100644 index 0000000000000000000000000000000000000000..2afcae356bd19d710e41aee6a4b0523f16411884 --- /dev/null +++ b/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uun.h @@ -0,0 +1,48 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "fft_helper.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void(cs16_cf32_cs16_ac_n_uun_fft_t)(const armral_cmplx_int16_t *x, + armral_cmplx_int16_t *y, + int istride, int ostride, + int howmany, float dir); + 
+cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun2; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun3; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun4; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun5; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun6; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun7; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun8; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun9; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun10; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun11; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun12; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun13; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun14; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun15; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun16; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun17; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun18; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun19; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun20; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun21; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun22; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun24; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun25; +cs16_cf32_cs16_ac_n_uun_fft_t armral_fft_cs16_cf32_cs16_ac_n_uun32; + +#ifdef __cplusplus +} // extern "C" +#endif \ No newline at end of file diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c index 349d5488a6dd184b88c41a778a3390b4e9099b96..1ee3d7c937caec92ab066ebcb7ca2f92ffbd5158 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c +++ 
b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c @@ -1,49 +1,59 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_cs16_kernel_lookup.h" #include -#define NUM_FFT_CS16_BASE_KERNELS 33 +#define NUM_FFT_CS16_BASE_KERNELS 41 -static cs16_cf32_cs16_ac_n_uu_fft_t - *base_cs16_cf32_cs16_ac_n_uu_kernels[NUM_FFT_CS16_BASE_KERNELS] = { +static cs16_cf32_cs16_ac_n_uun_fft_t + *base_cs16_cf32_cs16_ac_n_uun_kernels[NUM_FFT_CS16_BASE_KERNELS] = { NULL, NULL, - armral_fft_cs16_cf32_cs16_ac_n_uu2, - armral_fft_cs16_cf32_cs16_ac_n_uu3, - armral_fft_cs16_cf32_cs16_ac_n_uu4, - armral_fft_cs16_cf32_cs16_ac_n_uu5, - armral_fft_cs16_cf32_cs16_ac_n_uu6, - armral_fft_cs16_cf32_cs16_ac_n_uu7, - armral_fft_cs16_cf32_cs16_ac_n_uu8, - armral_fft_cs16_cf32_cs16_ac_n_uu9, - armral_fft_cs16_cf32_cs16_ac_n_uu10, - armral_fft_cs16_cf32_cs16_ac_n_uu11, - armral_fft_cs16_cf32_cs16_ac_n_uu12, - armral_fft_cs16_cf32_cs16_ac_n_uu13, - armral_fft_cs16_cf32_cs16_ac_n_uu14, - armral_fft_cs16_cf32_cs16_ac_n_uu15, - armral_fft_cs16_cf32_cs16_ac_n_uu16, - armral_fft_cs16_cf32_cs16_ac_n_uu17, - armral_fft_cs16_cf32_cs16_ac_n_uu18, - armral_fft_cs16_cf32_cs16_ac_n_uu19, - armral_fft_cs16_cf32_cs16_ac_n_uu20, - armral_fft_cs16_cf32_cs16_ac_n_uu21, - armral_fft_cs16_cf32_cs16_ac_n_uu22, + armral_fft_cs16_cf32_cs16_ac_n_uun2, + armral_fft_cs16_cf32_cs16_ac_n_uun3, + armral_fft_cs16_cf32_cs16_ac_n_uun4, + armral_fft_cs16_cf32_cs16_ac_n_uun5, + armral_fft_cs16_cf32_cs16_ac_n_uun6, + armral_fft_cs16_cf32_cs16_ac_n_uun7, + armral_fft_cs16_cf32_cs16_ac_n_uun8, + armral_fft_cs16_cf32_cs16_ac_n_uun9, + armral_fft_cs16_cf32_cs16_ac_n_uun10, + armral_fft_cs16_cf32_cs16_ac_n_uun11, + armral_fft_cs16_cf32_cs16_ac_n_uun12, + armral_fft_cs16_cf32_cs16_ac_n_uun13, + armral_fft_cs16_cf32_cs16_ac_n_uun14, + 
armral_fft_cs16_cf32_cs16_ac_n_uun15, + armral_fft_cs16_cf32_cs16_ac_n_uun16, + armral_fft_cs16_cf32_cs16_ac_n_uun17, + armral_fft_cs16_cf32_cs16_ac_n_uun18, + armral_fft_cs16_cf32_cs16_ac_n_uun19, + armral_fft_cs16_cf32_cs16_ac_n_uun20, + armral_fft_cs16_cf32_cs16_ac_n_uun21, + armral_fft_cs16_cf32_cs16_ac_n_uun22, + NULL, + armral_fft_cs16_cf32_cs16_ac_n_uun24, + armral_fft_cs16_cf32_cs16_ac_n_uun25, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + armral_fft_cs16_cf32_cs16_ac_n_uun32, + NULL, NULL, - armral_fft_cs16_cf32_cs16_ac_n_uu24, - armral_fft_cs16_cf32_cs16_ac_n_uu25, NULL, NULL, NULL, NULL, NULL, NULL, - armral_fft_cs16_cf32_cs16_ac_n_uu32, }; static cs16_cf32_cf32_ac_n_uu_fft_t @@ -81,6 +91,14 @@ static cs16_cf32_cf32_ac_n_uu_fft_t NULL, NULL, armral_fft_cs16_cf32_cf32_ac_n_uu32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, }; static cf32_cf32_cs16_ab_t_gu_fft_t @@ -118,6 +136,14 @@ static cf32_cf32_cs16_ab_t_gu_fft_t NULL, NULL, armral_fft_cf32_cf32_cs16_ab_t_gu32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, }; static cf32_cf32_cs16_ac_n_uu_fft_t @@ -155,14 +181,22 @@ static cf32_cf32_cs16_ac_n_uu_fft_t NULL, NULL, armral_fft_cf32_cf32_cs16_ac_n_uu32, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, }; -cs16_cf32_cs16_ac_n_uu_fft_t * -lookup_ac_uu_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir) { +cs16_cf32_cs16_ac_n_uun_fft_t * +lookup_ac_uun_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir) { if (n >= NUM_FFT_CS16_BASE_KERNELS) { return NULL; } - return base_cs16_cf32_cs16_ac_n_uu_kernels[n]; + return base_cs16_cf32_cs16_ac_n_uun_kernels[n]; } cs16_cf32_cf32_ac_n_uu_fft_t * diff --git a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h index 864b0d9604bb7f034ee24564d4ba404b078bdfe3..f85f5e0cc85f0e0222925020da07a872dc720100 100644 --- a/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h +++ b/src/LowerPHY/FFT/fft_cs16_kernel_lookup.h @@ -1,20 +1,22 @@ /* 
Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once #include "fft_cf32_cf32_cs16_ab_t_gu.h" #include "fft_cf32_cf32_cs16_ac_n_uu.h" #include "fft_cs16_cf32_cf32_ac_n_uu.h" -#include "fft_cs16_cf32_cs16_ac_n_uu.h" +#include "fft_cs16_cf32_cs16_ac_n_uun.h" #ifdef __cplusplus extern "C" { #endif -cs16_cf32_cs16_ac_n_uu_fft_t * -lookup_ac_uu_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir); +cs16_cf32_cs16_ac_n_uun_fft_t * +lookup_ac_uun_base_kernel_cs16_cs16(int n, armral_fft_direction_t dir); cs16_cf32_cf32_ac_n_uu_fft_t * lookup_ac_uu_base_kernel_cs16_cf32(int n, armral_fft_direction_t dir); diff --git a/src/LowerPHY/FFT/fft_execute.cpp b/src/LowerPHY/FFT/fft_execute.cpp index fc23ddd71f22b80eddd294e285ebd332898a5915..6960837a20e3fb553e69b0caab41aad54b71d654 100644 --- a/src/LowerPHY/FFT/fft_execute.cpp +++ b/src/LowerPHY/FFT/fft_execute.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_execute.hpp" @@ -170,7 +172,10 @@ template armral_status execute(const armral_fft_plan_t *p, const Tx *x, Ty *y, int istride, int ostride, int howmany) { static_assert(sizeof(Tw) >= sizeof(Tx) && sizeof(Tw) >= sizeof(Ty)); - assert(p); + if (p == nullptr) { + assert(false && "Plan is invalid"); + return ARMRAL_ARGUMENT_ERROR; + } int num_levels = p->num_levels; const auto *levs = p->levels; diff --git a/src/LowerPHY/FFT/fft_execute.hpp b/src/LowerPHY/FFT/fft_execute.hpp index 4cf5edd6dcfd16ae8a6b287a192b6915f3175e8b..6f410b8c9e9829e97fc7a5a9e7ac1ca57118f6b4 100644 --- a/src/LowerPHY/FFT/fft_execute.hpp +++ b/src/LowerPHY/FFT/fft_execute.hpp @@ -1,6 +1,8 @@ /* Arm RAN 
Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/fft_helper.h b/src/LowerPHY/FFT/fft_helper.h index 978ed19b3a23e5669aca3b8b76bf7e4973fabb32..2a73cb84ed5230dd8d20b89588c0c027db86eff8 100644 --- a/src/LowerPHY/FFT/fft_helper.h +++ b/src/LowerPHY/FFT/fft_helper.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/fft_level.cpp b/src/LowerPHY/FFT/fft_level.cpp index e91517d82d660cc0fd2a80c17dd6b9ed214a5ac7..6d9125817ac4dcf9d61b461012d68113dd13b60e 100644 --- a/src/LowerPHY/FFT/fft_level.cpp +++ b/src/LowerPHY/FFT/fft_level.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_level.hpp" diff --git a/src/LowerPHY/FFT/fft_level.hpp b/src/LowerPHY/FFT/fft_level.hpp index f254a383fb88ce6b7bf0658dd7e69a5fe35b7f14..0015278233f4471a4d27e3b9f6841895be340545 100644 --- a/src/LowerPHY/FFT/fft_level.hpp +++ b/src/LowerPHY/FFT/fft_level.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/fft_plan.cpp b/src/LowerPHY/FFT/fft_plan.cpp index 689ae5ffe778746dbf334b87e44d345067db5f37..96f8f1c7a8547352e9d08c5936b579e5a80fa387 100644 --- a/src/LowerPHY/FFT/fft_plan.cpp +++ 
b/src/LowerPHY/FFT/fft_plan.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "fft_plan.hpp" #include "bluestein.hpp" @@ -21,10 +23,23 @@ namespace { -constexpr int len_base_kernels = 33; +constexpr int len_base_kernels = 41; +// Extra kernel sizes are available when use_all_kernels = true. The 2 least +// significant bits in each element are used to distinguish between these +// sizes, and those that are always available (regardless of whether +// use_all_kernels = true or not). +// Bit 2 is set for sizes that are always available. Bit 1 is set for sizes +// that are available when use_all_kernels = true (this includes the sizes +// that are always available plus the extra sizes). +// This means we can query whether bit 1 is set when we want to include the +// extra kernels, or query whether bit 2 is set for the reduced set of kernel +// sizes. For example, size 2 is always available so both bits are set and +// base_kernels[2] = 3. However, size 28 is only available when +// use_all_kernels = true so only bit 1 is set and base_kernels[28] = 1. 
constexpr int base_kernels[len_base_kernels] = { - 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1}; + 0x0, 0x0, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, + 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x3, 0x0, 0x3, 0x3, 0x0, 0x0, + 0x1, 0x0, 0x1, 0x0, 0x3, 0x0, 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1}; template Tw *make_twiddles(int n1, int n2, armral_fft_direction_t dir, @@ -56,22 +71,28 @@ Tw *make_twiddles(int n1, int n2, armral_fft_direction_t dir, return twids; } -inline int kernel_exists(int i) { +inline int kernel_exists(int i, bool use_all_kernels = false) { if (i >= len_base_kernels) { return 0; } - return base_kernels[i]; + if (use_all_kernels) { + return base_kernels[i] & 0x1; + } + return base_kernels[i] & 0x2; } template inline armral::fft::fft_ac_uu_func_t -get_base_kernel(int n, armral_fft_direction_t dir); +get_base_kernel(int n, armral_fft_direction_t dir, bool want_uun); template<> inline armral::fft::fft_ac_uu_func_t get_base_kernel( - int n, armral_fft_direction_t dir) { + int n, armral_fft_direction_t dir, bool want_uun) { + if (want_uun) { + return lookup_ac_uun_base_kernel_cf32_cf32(n, dir); + } return lookup_ac_uu_base_kernel_cf32_cf32(n, dir); } @@ -79,15 +100,16 @@ template<> inline armral::fft::fft_ac_uu_func_t get_base_kernel( - int n, armral_fft_direction_t dir) { - return lookup_ac_uu_base_kernel_cs16_cs16(n, dir); + int n, armral_fft_direction_t dir, bool want_uun) { + assert(want_uun); + return lookup_ac_uun_base_kernel_cs16_cs16(n, dir); } template<> inline armral::fft::fft_ac_uu_func_t get_base_kernel( - int n, armral_fft_direction_t dir) { + int n, armral_fft_direction_t dir, bool want_uun) { return lookup_ac_uu_base_kernel_cs16_cf32(n, dir); } @@ -95,7 +117,7 @@ template<> inline armral::fft::fft_ac_uu_func_t get_base_kernel( - int n, armral_fft_direction_t dir) { + int n, armral_fft_direction_t dir, bool want_uun) { return lookup_ac_uu_base_kernel_cf32_cs16(n, dir); } 
@@ -184,8 +206,8 @@ struct kernel_selection { template kernel_selection get_kernels(int n1, armral_fft_direction_t dir, - bool want_twids) { - auto kernel = get_base_kernel(n1, dir); + bool want_twids, bool want_uun) { + auto kernel = get_base_kernel(n1, dir, want_uun); auto ac_gu_kernel = (std::is_same_v && std::is_same_v) ? get_ac_gu_base_kernel(n1, dir) : nullptr; @@ -212,11 +234,20 @@ struct factorize_result { factorize_result factorize_descending_base_kernels(int n, armral_fft_direction_t dir, - int max_nfacts, - int *factors) { + int max_nfacts, int *factors, + bool use_all_kernels) { factorize_result fr(n); - // factorize using the kernel lengths we have available, up to a maximum - // number of factors. + + // We get a performance benefit using additional kernels for Rader's cases + // provided n is the only factor + if (use_all_kernels && kernel_exists(n, true) != 0) { + factors[fr.num_factors++] = n; + fr.remainder = 1; + return fr; + } + + // Factorize using the reduced set of kernel lengths we have available, up to + // a maximum number of factors for (int factor = 32; factor >= 2 && fr.num_factors < max_nfacts;) { if ((kernel_exists(factor) != 0) && fr.remainder % factor == 0) { factors[fr.num_factors++] = factor; @@ -250,9 +281,9 @@ void factorize_primes(int *factors, factorize_result *fr, int max_nfacts) { } int factorize_descending(int n, armral_fft_direction_t dir, int max_nfacts, - int *factors) { - factorize_result fr = - factorize_descending_base_kernels(n, dir, max_nfacts, factors); + int *factors, bool use_all_kernels) { + factorize_result fr = factorize_descending_base_kernels( + n, dir, max_nfacts, factors, use_all_kernels); if (fr.remainder == 1) { return fr.num_factors; } @@ -273,11 +304,13 @@ int factorize_descending(int n, armral_fft_direction_t dir, int max_nfacts, template armral::fft::lev_base_t * make_level_data(int n, int n1, int n2, int how_many, armral_fft_direction_t dir, - bool want_twiddles, bool want_ac, bool allow_raders) { + 
bool want_twiddles, bool want_ac, bool allow_raders, + bool use_all_kernels, bool want_uun) { using level_type = armral::fft::lev_t; - if (kernel_exists(n1)) { + if (kernel_exists(n1, use_all_kernels)) { auto [kernel, ac_gu_kernel, ab_twid_gu_kernel, ab_twid_gs_kernel, - ac_twid_kernel] = get_kernels(n1, dir, want_twiddles); + ac_twid_kernel] = + get_kernels(n1, dir, want_twiddles, want_uun); Tw *twids = nullptr; if (want_twiddles) { #ifdef ARMRAL_ARCH_SVE @@ -294,7 +327,7 @@ make_level_data(int n, int n1, int n2, int how_many, armral_fft_direction_t dir, return nullptr; } Tw *twids = want_twiddles ? make_twiddles(n1, n2, dir, 1, true) : nullptr; - auto maybe_r = armral::fft::make_rader(n1, dir); + auto maybe_r = armral::fft::make_rader(n1, dir, n); if (maybe_r) { auto r = std::move(*maybe_r); if (r.n == 0) { @@ -303,23 +336,30 @@ make_level_data(int n, int n1, int n2, int how_many, armral_fft_direction_t dir, return new level_type(n, n1, n2, how_many, dir, twids, nullptr, nullptr, nullptr, nullptr, nullptr, std::move(r), {}); } - auto bs = armral::fft::make_bluestein(n1, dir, base_kernels, - len_base_kernels); - if (bs.n == 0) { - return nullptr; + auto maybe_bs = armral::fft::make_bluestein(n1, dir, base_kernels, + len_base_kernels); + if (maybe_bs) { + auto bs = std::move(*maybe_bs); + if (bs.n == 0) { + return nullptr; + } + return new level_type(n, n1, n2, how_many, dir, twids, nullptr, nullptr, + nullptr, nullptr, nullptr, {}, std::move(bs)); } - return new level_type(n, n1, n2, how_many, dir, twids, nullptr, nullptr, - nullptr, nullptr, nullptr, {}, std::move(bs)); + return nullptr; } template int factorize(int n, armral_fft_direction_t dir, int max_levels, - armral::fft::lev_base_t **levels, bool allow_raders) { - // search through the set of supported factors to find a suitable + armral::fft::lev_base_t **levels, bool allow_raders, + bool use_all_kernels, bool want_uun) { + // Search through the set of supported factors to find a suitable // factorization, 
then use that to build the level data structures. int factors[max_levels]; - int num_factors = factorize_descending(n, dir, max_levels, factors); + int num_factors = + factorize_descending(n, dir, max_levels, factors, use_all_kernels); if (num_factors == 0) { + assert(false && "Unable to factorize this FFT length"); return 0; } @@ -335,23 +375,27 @@ int factorize(int n, armral_fft_direction_t dir, int max_levels, // Operating on a single level - input output and working types are as // specified for this function levels[fi] = make_level_data(n, n1, n2, how_many, dir, - false, false, allow_raders); + false, false, allow_raders, + use_all_kernels, want_uun); } else { // We have multiple levels, and are currently dealing with the first // level. Transform data to the working type from the input type - levels[fi] = make_level_data(n, n1, n2, how_many, dir, - false, false, allow_raders); + levels[fi] = + make_level_data(n, n1, n2, how_many, dir, false, false, + allow_raders, use_all_kernels, false); } } else if (fi == num_factors - 1) { // We have multiple levels and are currently dealing with the last level. // Transform data from the working type to the output type - levels[fi] = make_level_data(n, n1, n2, how_many, dir, true, - false, allow_raders); + levels[fi] = + make_level_data(n, n1, n2, how_many, dir, true, false, + allow_raders, use_all_kernels, false); } else { // We have multiple levels and are currently dealing with an intermediate // level (i.e. not first or last). 
All work is done in the working type - levels[fi] = make_level_data(n, n1, n2, how_many, dir, true, - true, allow_raders); + levels[fi] = + make_level_data(n, n1, n2, how_many, dir, true, true, + allow_raders, use_all_kernels, false); } if (!levels[fi]) { @@ -367,21 +411,26 @@ namespace armral::fft { template armral_status create_plan(armral_fft_plan_t **p, int n, - armral_fft_direction_t dir, bool allow_raders) { + armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels, bool want_uun) { + if (n > 42012) { + // This length is currently unsupported due to the limit on the number of + // allowed factors/levels, which is defined by armral_fft_plan_t::max_levels + return ARMRAL_ARGUMENT_ERROR; + } assert(p); - // try and find a suitable decomposition, else give up. - // we arbitrarily limit ourselves to four factors here, but there's - // no particular reason why we couldn't support more. + // Try and find a suitable decomposition, else give up. armral_fft_plan_t tmp_plan = {}; tmp_plan.n = n; tmp_plan.dir = dir; tmp_plan.num_levels = factorize( - n, dir, armral_fft_plan_t::max_levels, tmp_plan.levels, allow_raders); + n, dir, armral_fft_plan_t::max_levels, tmp_plan.levels, allow_raders, + use_all_kernels, want_uun); if (tmp_plan.num_levels == 0) { return ARMRAL_ARGUMENT_ERROR; } - // only allocate once we're sure we actually have a plan to return. + // Only allocate once we're sure we actually have a plan to return. 
*p = static_cast(malloc(sizeof(armral_fft_plan_t))); memcpy(*p, &tmp_plan, sizeof(armral_fft_plan_t)); return ARMRAL_SUCCESS; @@ -389,16 +438,18 @@ armral_status create_plan(armral_fft_plan_t **p, int n, template armral_status create_plan( - armral_fft_plan_t **p, int n, armral_fft_direction_t dir, - bool allow_raders); + armral_fft_plan_t **p, int n, armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels, bool want_uun); template armral_status create_plan( - armral_fft_plan_t **p, int n, armral_fft_direction_t dir, - bool allow_raders); + armral_fft_plan_t **p, int n, armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels, bool want_uun); armral_status destroy_plan(armral_fft_plan_t **p) { - assert(p); - assert(*p); + if (p == nullptr || *p == nullptr) { + assert(false && "Invalid plan"); + return ARMRAL_ARGUMENT_ERROR; + } for (int i = 0; i < (*p)->num_levels; ++i) { assert((*p)->levels[i]); // Call the virtual destructor of the level data diff --git a/src/LowerPHY/FFT/fft_plan.hpp b/src/LowerPHY/FFT/fft_plan.hpp index 9141d436379b01ebbd09116a43681bc4aacdbccd..74cca316820b39b5ceb7a2221058555428fe6d66 100644 --- a/src/LowerPHY/FFT/fft_plan.hpp +++ b/src/LowerPHY/FFT/fft_plan.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -11,18 +13,22 @@ namespace armral::fft { /** * Creates a plan for solving FFTs. Depending on the data type, the * plan will execute different functions. - * @tparam Tx Input data type - * @tparam Ty Output data type - * @tparam Tw Working data type - * @param [out] p Pointer to populate with the created FFT plan. - * @param [in] n The overall size of the FFT to perform. - * @param [in] dir The direction of the FFT (forwards or backwards). 
- * @param [in] allow_raders Allow use of Rader's algorithm. + * @tparam Tx Input data type + * @tparam Ty Output data type + * @tparam Tw Working data type + * @param [out] p Pointer to populate with the created FFT plan. + * @param [in] n The overall size of the FFT to perform. + * @param [in] dir The direction of the FFT (forwards or + * backwards). + * @param [in] allow_raders Allow use of Rader's algorithm. + * @param [in] use_all_kernels Allow use of all available kernels. Default is + * false. * @returns ARMRAL_SUCCESS if a plan is successfully created. */ template armral_status create_plan(armral_fft_plan_t **p, int n, - armral_fft_direction_t dir, bool allow_raders); + armral_fft_direction_t dir, bool allow_raders, + bool use_all_kernels = false, bool want_uun = true); /** * Common code for destroying a plan. For the time being, the plan is identical @@ -47,7 +53,7 @@ struct lev_base_t; * particular, constructing twiddle factors). */ struct armral_fft_plan_t { - static constexpr int max_levels = 4; + static constexpr int max_levels = 5; /// The problem size being solved. int n; /// The direction of the problem being solved. 
diff --git a/src/LowerPHY/FFT/fft_types.hpp b/src/LowerPHY/FFT/fft_types.hpp index 9cc619913aa262aa7657b3831cc074e4e530204a..d23b5bfe70c7fa9c11b3c7843a4b2e232ac82d86 100644 --- a/src/LowerPHY/FFT/fft_types.hpp +++ b/src/LowerPHY/FFT/fft_types.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FFT/rader.cpp b/src/LowerPHY/FFT/rader.cpp index efa38bcc2d49846195df9b11c592f095374ad527..a91099bb3f573e39758a610023b745904481358b 100644 --- a/src/LowerPHY/FFT/rader.cpp +++ b/src/LowerPHY/FFT/rader.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "rader.hpp" @@ -16,7 +18,8 @@ namespace armral::fft { template -std::optional> make_rader(int n, armral_fft_direction_t dir) { +std::optional> make_rader(int n, armral_fft_direction_t dir, + int n_whole) { using real_t = armral::fft::real_t; auto g = find_group_generator(n); @@ -29,10 +32,19 @@ std::optional> make_rader(int n, armral_fft_direction_t dir) { // algorithm since that tends to be slower than just using Bluestein. armral_fft_plan_t *pf = nullptr; armral_fft_plan_t *pb = nullptr; + // We get a performance benefit from using additional kernels provided the n + // we are creating a Rader's plan for isn't the only factor of n_whole. + bool use_all_kernels = n_whole > n; + // Only allow uun kernels to be used if we know the plans will be executed + // with howmany = 1. This will be the case if the level that the Rader's plan + // is being created for has n2 = 1, i.e. if n = n_whole. 
+ bool want_uun = n == n_whole; armral::fft::create_plan( - &pf, n - 1, armral_fft_direction_t::ARMRAL_FFT_FORWARDS, false); + &pf, n - 1, armral_fft_direction_t::ARMRAL_FFT_FORWARDS, false, + use_all_kernels, want_uun); armral::fft::create_plan( - &pb, n - 1, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS, false); + &pb, n - 1, armral_fft_direction_t::ARMRAL_FFT_BACKWARDS, false, + use_all_kernels, want_uun); if (!pf || !pb) { if (pf) { armral::fft::destroy_plan(&pf); @@ -89,16 +101,16 @@ std::optional> make_rader(int n, armral_fft_direction_t dir) { template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir); +make_rader(int n, armral_fft_direction_t dir, int n_whole); template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir); +make_rader(int n, armral_fft_direction_t dir, int n_whole); template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir); +make_rader(int n, armral_fft_direction_t dir, int n_whole); template std::optional< rader> -make_rader(int n, armral_fft_direction_t dir); +make_rader(int n, armral_fft_direction_t dir, int n_whole); template struct rader; diff --git a/src/LowerPHY/FFT/rader.hpp b/src/LowerPHY/FFT/rader.hpp index 89b006c0a9a6049bffc61d4a1b4844f340ad6a03..c26f124eeb5d79582d7444552513dac814ac5064 100644 --- a/src/LowerPHY/FFT/rader.hpp +++ b/src/LowerPHY/FFT/rader.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -84,7 +86,8 @@ struct rader { }; template -std::optional> make_rader(int n, armral_fft_direction_t dir); +std::optional> make_rader(int n, armral_fft_direction_t dir, + int n_whole); template void execute_rader(const rader &r, const Tx *x, Ty *y, int istride, diff --git a/src/LowerPHY/FFT/rader_generator.cpp b/src/LowerPHY/FFT/rader_generator.cpp index 
9e798f2df322ec8414452d78825e3d7a2dd1acd9..ef1abc4849b5bc08885ada432d16d136ff1879b6 100644 --- a/src/LowerPHY/FFT/rader_generator.cpp +++ b/src/LowerPHY/FFT/rader_generator.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "rader_generator.hpp" diff --git a/src/LowerPHY/FFT/rader_generator.hpp b/src/LowerPHY/FFT/rader_generator.hpp index 49b3cfda30910455756dd48acc33976ee22f02ba..df42c50d10c42f0b4d6e3cc9dc5e41c5f9f98c4e 100644 --- a/src/LowerPHY/FFT/rader_generator.hpp +++ b/src/LowerPHY/FFT/rader_generator.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32.c b/src/LowerPHY/FIR/arm_fir_filter_cf32.c index 04ffc612e47d482f0b83a2c9fbe94d587359f764..d779624b93885d71e23e3aeb65690b7ad0aeb307 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c index ebeef5def330ae6532168d19b2f8e08817e7ad8a..0919a0126231f23f7c6ec42bb2283b8ab5a9f529 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates 
+ SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16.c b/src/LowerPHY/FIR/arm_fir_filter_cs16.c index c4fa6954f57e19958a8a0e968ca1cb9ccbf72e77..0c8926c4adc5b965a84f607754065250811cec8b 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c index 302c4464af0c7ddd33eaf5e569a712c964dcc7be..7e931f05ece17d9354158c44c3d04339e5fc6b6f 100644 --- a/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c +++ b/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/LowerPHY/Scrambling/arm_scrambling.cpp b/src/LowerPHY/Scrambling/arm_scrambling.cpp index b9f1812873eef1b18e53b685ffd92f50e166ef9f..268b0c146834b97801743ab25c9d76c5b2d3c903 100644 --- a/src/LowerPHY/Scrambling/arm_scrambling.cpp +++ b/src/LowerPHY/Scrambling/arm_scrambling.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp 
b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp index 0ed8fd3b14710ece35914f709789b047c33a8c91..85725677625186672d0c8a5dfe68312bd73bd21c 100644 --- a/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp +++ b/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/MatrixFactorizations/SVD/arm_svd.cpp b/src/MatrixFactorizations/SVD/arm_svd.cpp index d7086cbd67bc7bd204ad345144f70a5e929fbc87..5d35ba76269b8049d84be37966eb171a732f9968 100644 --- a/src/MatrixFactorizations/SVD/arm_svd.cpp +++ b/src/MatrixFactorizations/SVD/arm_svd.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/MatrixFactorizations/SVD/matrix_view.hpp b/src/MatrixFactorizations/SVD/matrix_view.hpp index 36c7f638fc27d184c2d4043c579ef211267fee4e..685b498cbf61c63bc213ee5970104f8ad3e6dabd 100644 --- a/src/MatrixFactorizations/SVD/matrix_view.hpp +++ b/src/MatrixFactorizations/SVD/matrix_view.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/UpperPHY/CRC/arm_crc11.cpp b/src/UpperPHY/CRC/arm_crc11.cpp index d41889d79d8d6a4818e5116a44e5f028d2c26db3..871a9bce093e621051e9ccc3c57733d770483f95 100644 --- a/src/UpperPHY/CRC/arm_crc11.cpp +++ b/src/UpperPHY/CRC/arm_crc11.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 
2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc16.cpp b/src/UpperPHY/CRC/arm_crc16.cpp index e727c607acee5bf70ec27d1c2491178bc24e6d63..0e1527bfacc7b42f1e271c8fa7f545c91965ef9a 100644 --- a/src/UpperPHY/CRC/arm_crc16.cpp +++ b/src/UpperPHY/CRC/arm_crc16.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_a.cpp b/src/UpperPHY/CRC/arm_crc24_a.cpp index af8e43e80047e35204d05ae37d8d87abe62e4ccd..9108807ae104fcb32e1f038896c214e72bb13724 100644 --- a/src/UpperPHY/CRC/arm_crc24_a.cpp +++ b/src/UpperPHY/CRC/arm_crc24_a.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_b.cpp b/src/UpperPHY/CRC/arm_crc24_b.cpp index b0e9023279d4bb90aa740f73706b0f2e3a11d76d..55cce37dbc907270356044a6698f4dd2c7a5ee76 100644 --- a/src/UpperPHY/CRC/arm_crc24_b.cpp +++ b/src/UpperPHY/CRC/arm_crc24_b.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc24_c.cpp b/src/UpperPHY/CRC/arm_crc24_c.cpp index 
42302a58ddc8e06543081b599fe7b7c5c7e23f77..b7ab624e257fe1674af06dc10e9132ddfbf69f43 100644 --- a/src/UpperPHY/CRC/arm_crc24_c.cpp +++ b/src/UpperPHY/CRC/arm_crc24_c.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/arm_crc6.cpp b/src/UpperPHY/CRC/arm_crc6.cpp index 0277ba31e64634f1c4e54a21763cefa3b9fe5819..49f2e5d56574e614e2c5badaeea2af5f82c6547b 100644 --- a/src/UpperPHY/CRC/arm_crc6.cpp +++ b/src/UpperPHY/CRC/arm_crc6.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "crc_common.hpp" diff --git a/src/UpperPHY/CRC/crc_basic.hpp b/src/UpperPHY/CRC/crc_basic.hpp index 0e6e7dfb49f7eadfeeb152071a9c79b2f889bf91..c6e0c60f7c10f9a62237278da5380c3610a54e1f 100644 --- a/src/UpperPHY/CRC/crc_basic.hpp +++ b/src/UpperPHY/CRC/crc_basic.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index 47bf69e47039d1a596c204e2e02b50abba1662a1..9ed03ce58ae1d19edb24b1dd25a9fdb263beb211 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + 
SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp index 76476b45e66bee88ff5e928b9fc71d3b1b9204a3..365dac4be184b4cd04688875d16ea736724fd168 100644 --- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp index 58d57d2a96cf74f10e78df62c8895dd947b11e2d..0e7fc3bc7ae4842e649fe2d5704f1af1f51d8ee4 100644 --- a/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp +++ b/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp index 49ea3fde6b7dfdcf50647f96c23c339575eb9f57..bbc0105d3b883db43d1d2349a5a1c962d8276c1b 100644 --- a/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp +++ b/src/UpperPHY/ConvolutionalEncoder/convolutional_code_table.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + 
SPDX-License-Identifier: BSD-3-Clause */ namespace { diff --git a/src/UpperPHY/Demodulation/arm_demodulation.c b/src/UpperPHY/Demodulation/arm_demodulation.c index 238abf02462b5540b8460521f48cf11dfc304ab1..833ea6b7a886183a0a370fd60eff18eeab426cca 100644 --- a/src/UpperPHY/Demodulation/arm_demodulation.c +++ b/src/UpperPHY/Demodulation/arm_demodulation.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/UpperPHY/LDPC/ldpc_decoder.cpp b/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp similarity index 98% rename from src/UpperPHY/LDPC/ldpc_decoder.cpp rename to src/UpperPHY/LDPC/arm_ldpc_decoder.cpp index 78bda9262e1d3365c6dbd21b1f16427f96067ddb..9eaffd97d8c7058eec49b173d846b43d48d7fa4c 100644 --- a/src/UpperPHY/LDPC/ldpc_decoder.cpp +++ b/src/UpperPHY/LDPC/arm_ldpc_decoder.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "ldpc_coding.hpp" @@ -63,9 +65,9 @@ public: // non-filler bits of the code block) m_k_prime = crc_idx + 24; - // The CRC calculation routine expects a particular size of input (n % 16 = 0 - // where n is the number of bytes), which requires padding the input to the - // required size + // The CRC calculation routine expects a particular size of input + // (n % 16 = 0 where n is the number of bytes), which requires padding + // the input to the required size m_buffer_size = (m_k_prime + 7) / 8; m_total_bits = m_k_prime; if (m_k_prime % 128 != 0) { @@ -966,7 +968,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, const int16_t *l_ptr = l + zb * d->num_cols; // Loop 
through the columns in the row (variable node n in psi(m)) for (uint32_t col = 0; col < d->num_cols; ++col) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) int16_t col_sign = (row_sign_array[zb] ^ l_ptr[col]) < 0 ? -1 : 1; // Compute R(n,m) @@ -1009,7 +1012,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, const int16_t *l_ptr = l + d->z * col; int16_t *llrs_ptr = llrs + col_ind + shift; - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) svint16_t l_reg = svld1_s16(pg_tail, l_ptr); svint16_t abs_reg = svqabs_s16_x(pg_tail, l_reg); svint16_t eor_reg = sveor_s16_x(pg_tail, row_sign, l_reg); @@ -1046,7 +1050,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, const int16_t *l_ptr = l + d->z * col; int16_t *llrs_ptr = llrs + col_ind + shift; - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) int16x4_t l_reg = vld1_s16(l_ptr); int16x4_t abs_reg = vqabs_s16(l_reg); int16x4_t eor_reg = veor_s16(row_sign, l_reg); @@ -1070,7 +1075,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, // Process tail for (uint32_t zb = d->z - tail_size; zb < d->z; ++zb) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) int16_t col_sign = (row_sign_array[zb] ^ l[d->z * col + zb]) < 0 ? 
-1 : 1; // Compute R(n,m) @@ -1119,7 +1125,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, // Loop through the Z rows in the layer (check node m) for (int32_t vec_idx = 0; vec_idx < full_vec; ++vec_idx) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) svint16_t l_reg = svld1_s16(pg, l_ptr); svint16_t sign_reg = svld1_s16(pg, sign_ptr); svint16_t eor_reg = sveor_s16_x(pg, sign_reg, l_reg); @@ -1150,7 +1157,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, } if (tail_size != 0) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) svint16_t l_reg = svld1_s16(pg_tail, l_ptr); svint16_t sign_reg = svld1_s16(pg_tail, sign_ptr); svint16_t eor_reg = sveor_s16_x(pg_tail, sign_reg, l_reg); @@ -1196,7 +1204,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, // Loop through the Z rows in the layer (check node m) // Process 8 entries at a time for (int32_t v_cnt = 0; v_cnt < full_vec; v_cnt++) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) int16x8_t l_reg = vld1q_s16(l_ptr); int16x8_t sign_reg = vld1q_s16(sign_ptr); int16x8_t eor_reg = veorq_s16(sign_reg, l_reg); @@ -1232,7 +1241,8 @@ void compute_r_and_llrs(const int16_t *l, int16_t *r, int16_t *llrs, // Process a group of 4 elts if (tail_cnt > 3U) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) int16x4_t l_reg = vld1_s16(l_ptr); int16x4_t sign_reg = vld1_s16(sign_ptr); int16x4_t eor_reg = veor_s16(sign_reg, l_reg); @@ -1262,7 +1272,8 @@ void compute_r_and_llrs(const 
int16_t *l, int16_t *r, int16_t *llrs, // Process tail for (uint32_t zb = d->z - tail_cnt; zb < d->z; ++zb) { - // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the product) + // Compute the product of sign(L(n',m)) without L(n,m) (the sign of the + // product) int16_t col_sign = (row_sign_array[zb] ^ l[d->z * col + zb]) < 0 ? -1 : 1; // Compute R(n,m) diff --git a/src/UpperPHY/LDPC/ldpc_encoder.cpp b/src/UpperPHY/LDPC/arm_ldpc_encoder.cpp similarity index 88% rename from src/UpperPHY/LDPC/ldpc_encoder.cpp rename to src/UpperPHY/LDPC/arm_ldpc_encoder.cpp index 5e97e351eb1ea777b0679613d65e006d6cafcae0..749b4e3ffd60ffd454ecf3f7055a882ab6422b98 100644 --- a/src/UpperPHY/LDPC/ldpc_encoder.cpp +++ b/src/UpperPHY/LDPC/arm_ldpc_encoder.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "ldpc_coding.hpp" @@ -41,12 +43,12 @@ const uint32_t bg1_columns[] = { 0, 2, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 21, 22, 23, 24, // row 1: 19 0, 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, 14, 15, 17, 18, 19, 20, 24, 25, // row 2: 19 0, 1, 3, 4, 6, 7, 8, 10, 11, 12, 13, 14, 16, 17, 18, 20, 21, 22, 25, // row 3: 19 - 0, 1, 26, // row 4: 3 - 0, 1, 3, 12, 16, 21, 22, 27, // row 5: 8 - 0, 6, 10, 11, 13, 17, 18, 20, 28, // row 6: 9 - 0, 1, 4, 7, 8, 14, 29, // row 7: 7 - 0, 1, 3, 12, 16, 19, 21, 22, 24, 30, // row 8: 10 - 0, 1, 10, 11, 13, 17, 18, 20, 31, // row 9: 9 + 0, 1, 26, // row 4: 3 + 0, 1, 3, 12, 16, 21, 22, 27, // row 5: 8 + 0, 6, 10, 11, 13, 17, 18, 20, 28, // row 6: 9 + 0, 1, 4, 7, 8, 14, 29, // row 7: 7 + 0, 1, 3, 12, 16, 19, 21, 22, 24, 30, // row 8: 10 + 0, 1, 10, 11, 13, 17, 18, 20, 31, // row 9: 9 1, 2, 4, 7, 8, 14, 32, // row 10: 7 0, 1, 12, 16, 21, 22, 23, 33, // row 11: 8 0, 1, 10, 11, 13, 18, 34, // row 12: 7 @@ -230,7 +232,7 @@ 
const uint32_t bg1_shifts[] = { 40, 96, 65, 63, 75, 179, 0, // row 15 241, 2, 210, 318, 55, 269, 0, 229, 290, 60, 130, 184, 51, 0, - 90, 120, 131, 209, 209, 81, 0, + 90, 120, 131, 209, 209, 81, 0, 170, 0, 183, 108, 68, 64, 0, 176, 348, 15, 81, 176, 113, 0, 173, 6, 81, 182, 53, 46, 0, @@ -239,7 +241,7 @@ const uint32_t bg1_shifts[] = { 64, 49, 49, 51, 154, 0, // row 16 13, 338, 57, 289, 57, 0, 69, 140, 45, 115, 300, 0, - 154, 164, 43, 189, 101, 0, + 154, 164, 43, 189, 101, 0, 270, 13, 99, 54, 0, 0, 190, 293, 332, 331, 114, 0, 88, 198, 160, 122, 182, 0, @@ -520,16 +522,16 @@ const uint32_t bg2_row_start[] = { // clang-format off const uint32_t bg2_columns[] = { - 0, 1, 2, 3, 6, 9, 10, 11, // row 0: 8 - 0, 3, 4, 5, 6, 7, 8, 9, 11, 12, // row 1: 10 - 0, 1, 3, 4, 8, 10, 12, 13, // row 2: 8 - 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, // row 3: 10 - 0, 1, 11, 14, // row 4: 4 - 0, 1, 5, 7, 11, 15, // row 5: 6 - 0, 5, 7, 9, 11, 16, // row 6: 6 - 1, 5, 7, 11, 13, 17, // row 7: 6 - 0, 1, 12, 18, // row 8: 4 - 1, 8, 10, 11, 19, // row 9: 5 + 0, 1, 2, 3, 6, 9, 10, 11, // row 0: 8 + 0, 3, 4, 5, 6, 7, 8, 9, 11, 12, // row 1: 10 + 0, 1, 3, 4, 8, 10, 12, 13, // row 2: 8 + 1, 2, 4, 5, 6, 7, 8, 9, 10, 13, // row 3: 10 + 0, 1, 11, 14, // row 4: 4 + 0, 1, 5, 7, 11, 15, // row 5: 6 + 0, 5, 7, 9, 11, 16, // row 6: 6 + 1, 5, 7, 11, 13, 17, // row 7: 6 + 0, 1, 12, 18, // row 8: 4 + 1, 8, 10, 11, 19, // row 9: 5 0, 1, 6, 7, 20, // row 10: 5 0, 7, 9, 13, 21, // row 11: 5 1, 3, 11, 22, // row 12: 4 @@ -546,7 +548,7 @@ const uint32_t bg2_columns[] = { 0, 3, 5, 33, // row 23: 4 1, 2, 9, 34, // row 24: 4 0, 5, 35, // row 25: 3 - 2, 7, 12, 13, 36, // row 26: 5 + 2, 7, 12, 13, 36, // row 26: 5 0, 6, 37, // row 27: 3 1, 2, 5, 38, // row 28: 4 0, 4, 39, // row 29: 3 @@ -558,273 +560,273 @@ const uint32_t bg2_columns[] = { 1, 5, 11, 45, // row 35: 4 0, 2, 7, 46, // row 36: 4 10, 13, 47, // row 37: 3 - 1, 5, 11, 48, // row 38: 4 - 0, 7, 12, 49, // row 39: 4 - 2, 10, 13, 50, // row 40: 4 - 1, 5, 11, 51 // row 
41: 4 + 1, 5, 11, 48, // row 38: 4 + 0, 7, 12, 49, // row 39: 4 + 2, 10, 13, 50, // row 40: 4 + 1, 5, 11, 51 // row 41: 4 }; const uint32_t bg2_shifts[] = { - 9, 117, 204, 26, 189, 205, 0, 0, // row 0 - 174, 97, 166, 66, 71, 172, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, - 72, 110, 23, 181, 95, 8, 1, 0, - 3, 26, 53, 35, 115, 127, 0, 0, - 156, 143, 14, 3, 40, 123, 0, 0, - 143, 19, 176, 165, 196, 13, 0, 0, - 145, 131, 71, 21, 23, 112, 1, 0, - - 167, 166, 253, 125, 226, 156, 224, 252, 0, 0, // row 1 - 27, 36, 48, 92, 31, 187, 185, 3, 0, 0, - 137, 124, 0, 0, 88, 0, 0, 55, 0, 0, - 53, 156, 115, 156, 115, 200, 29, 31, 0, 0, - 19, 94, 104, 66, 84, 98, 69, 50, 0, 0, - 17, 65, 63, 1, 55, 37, 171, 133, 0, 0, - 18, 27, 3, 102, 185, 17, 14, 180, 0, 0, - 142, 174, 183, 27, 96, 23, 9, 167, 0, 0, - - 81, 114, 44, 52, 240, 1, 0, 0, // row 2 - 25, 114, 117, 110, 114, 1, 0, 0, - 20, 94, 99, 9, 108, 1, 0, 0, - 152, 131, 46, 191, 91, 0, 0, 0, - 95, 106, 92, 110, 111, 1, 0, 0, - 98, 168, 107, 82, 142, 1, 0, 0, - 126, 163, 47, 183, 132, 1, 0, 0, - 74, 31, 3, 53, 155, 0, 0, 0, - - 8, 58, 158, 104, 209, 54, 18, 128, 0, 0, // row 3 - 136, 175, 113, 72, 123, 118, 28, 186, 0, 0, - 38, 15, 102, 146, 12, 57, 53, 46, 0, 0, - 185, 6, 36, 124, 124, 110, 156, 133, 1, 0, - 120, 121, 22, 4, 73, 49, 128, 79, 0, 0, - 53, 174, 174, 127, 17, 89, 17, 105, 0, 0, - 36, 48, 18, 111, 203, 3, 191, 160, 0, 0, - 239, 171, 95, 110, 159, 199, 43, 75, 1, 0, - - 179, 214, 71, 0, // row 4 - 72, 74, 29, 0, - 0, 136, 157, 0, - 200, 16, 101, 0, - 42, 24, 51, 0, - 86, 67, 83, 0, - 43, 27, 117, 0, - 29, 140, 180, 0, - - 231, 41, 194, 159, 103, 0, // row 5 - 10, 44, 121, 80, 48, 0, - 0, 131, 142, 141, 64, 0, - 185, 138, 170, 219, 193, 0, - 40, 140, 84, 137, 71, 0, - 79, 84, 35, 103, 60, 0, - 136, 49, 36, 132, 62, 0, - 121, 41, 169, 88, 207, 0, - - 155, 228, 45, 28, 158, 0, // row 6 - 129, 92, 100, 49, 184, 0, - 0, 124, 99, 45, 148, 0, - 123, 55, 31, 222, 209, 0, - 109, 87, 107, 133, 139, 0, - 47, 154, 10, 155, 29, 0, - 7, 34, 198, 
168, 12, 0, - 137, 72, 172, 124, 56, 0, - - 129, 147, 140, 3, 116, 0, // row 7 - 80, 186, 16, 102, 143, 0, - 0, 45, 148, 96, 78, 0, - 103, 13, 105, 150, 181, 0, - 97, 135, 35, 108, 65, 0, - 48, 125, 24, 47, 55, 0, - 163, 78, 143, 107, 58, 0, - 86, 186, 87, 172, 154, 0, - - 142, 94, 230, 0, // row 8 - 118, 70, 152, 0, - 0, 65, 87, 0, - 147, 43, 152, 0, - 70, 69, 88, 0, - 53, 31, 161, 0, - 101, 177, 22, 0, - 176, 169, 225, 0, - - 203, 205, 61, 247, 0, // row 9 - 28, 132, 185, 178, 0, - 0, 97, 51, 85, 0, - 2, 30, 184, 83, 0, - 97, 40, 24, 49, 0, - 104, 142, 99, 64, 0, - 186, 27, 205, 81, 0, - 167, 238, 48, 68, 0, - - 11, 185, 0, 117, 0, // row 10 - 59, 104, 22, 52, 0, - 0, 17, 156, 20, 0, - 174, 150, 8, 56, 0, - 46, 41, 101, 96, 0, - 111, 25, 174, 23, 0, - 125, 60, 177, 51, 0, - 38, 217, 208, 232, 0, - - 11, 236, 210, 56, 0, // row 11 - 32, 92, 174, 154, 0, - 0, 7, 4, 2, 0, - 99, 138, 110, 99, 0, - 28, 30, 116, 64, 0, - 91, 175, 24, 141, 0, - 39, 29, 35, 8, 0, - 178, 214, 168, 51, 0, - - 63, 111, 14, 0, // row 12 - 39, 93, 11, 0, - 0, 113, 48, 0, - 46, 217, 109, 0, - 33, 122, 131, 0, - 122, 11, 4, 0, - 18, 155, 49, 0, - 124, 122, 72, 0, - - 83, 2, 38, 222, 0, // row 13 - 49, 125, 35, 166, 0, - 0, 112, 102, 26, 0, - 37, 113, 143, 140, 0, - 76, 37, 62, 47, 0, - 29, 91, 27, 127, 0, - 32, 53, 95, 186, 0, - 48, 57, 167, 219, 0, - - 115, 145, 3, 232, 0, // row 14 - 19, 118, 21, 163, 0, - 0, 138, 57, 27, 0, - 36, 95, 40, 116, 0, - 143, 51, 130, 97, 0, - 11, 145, 8, 166, 0, - 91, 20, 52, 109, 0, - 82, 232, 204, 162, 0, - - 51, 175, 213, 0, // row 15 - 68, 63, 81, 0, - 0, 73, 99, 0, - 116, 200, 110, 0, - 139, 96, 128, 0, - 137, 103, 40, 0, - 174, 108, 102, 0, - 38, 217, 157, 0, - - 203, 142, 8, 242, 0, // row 16 - 87, 177, 135, 64, 0, - 0, 79, 111, 143, 0, - 75, 158, 134, 97, 0, - 48, 9, 28, 8, 0, - 78, 158, 17, 165, 0, - 125, 31, 54, 176, 0, - 170, 23, 175, 202, 0, - - 254, 124, 114, 64, 0, // row 17 - 158, 23, 9, 6, 0, - 0, 24, 109, 18, 0, - 48, 132, 206, 2, 0, - 120, 43, 
65, 42, 0, - 134, 23, 62, 163, 0, - 57, 201, 142, 35, 0, - 196, 173, 195, 218, 0, - - 220, 194, 50, 0, // row 18 - 186, 6, 46, 0, - 0, 18, 86, 0, - 68, 16, 156, 0, - 17, 106, 142, 0, - 173, 31, 22, 0, - 129, 203, 140, 0, - 128, 211, 210, 0, - - 87, 20, 185, 0, // row 19 - 58, 42, 156, 0, - 0, 158, 154, 0, - 35, 138, 86, 0, - 79, 28, 41, 0, - 13, 135, 145, 0, - 110, 124, 52, 0, - 39, 84, 88, 0, - - 26, 105, 29, 0, // row 20 - 76, 61, 153, 0, - 0, 148, 104, 0, - 6, 20, 141, 0, - 2, 103, 78, 0, - 128, 52, 173, 0, - 196, 35, 114, 0, - 117, 227, 6, 0, - - 76, 42, 210, 0, // row 21 - 157, 175, 67, 0, - 0, 17, 33, 0, - 80, 43, 81, 0, - 91, 75, 81, 0, - 156, 166, 40, 0, - 10, 122, 23, 0, - 238, 13, 11, 0, - - 222, 63, 0, // row 22 - 20, 52, 0, - 0, 4, 0, - 49, 1, 0, - 54, 132, 0, - 18, 163, 0, - 202, 126, 0, - 195, 44, 0, - - 23, 235, 238, 0, // row 23 - 106, 86, 95, 0, - 0, 75, 158, 0, - 156, 54, 134, 0, - 68, 115, 56, 0, - 110, 132, 150, 0, - 52, 170, 13, 0, - 5, 94, 111, 0, - - 46, 139, 8, 0, // row 24 - 182, 153, 64, 0, - 0, 69, 87, 0, - 153, 88, 63, 0, - 30, 42, 101, 0, - 113, 108, 61, 0, - 113, 161, 88, 0, - 81, 19, 130, 0, - - 228, 156, 0, // row 25 - 45, 21, 0, - 0, 65, 0, - 211, 94, 0, - 128, 63, 0, - 72, 136, 0, - 197, 194, 0, - 66, 95, 0, - - 29, 143, 160, 122, 0, // row 26 - 67, 137, 55, 85, 0, - 0, 100, 13, 7, 0, - 90, 6, 221, 6, 0, - 142, 28, 100, 133, 0, - 36, 38, 53, 145, 0, - 164, 172, 49, 161, 0, - 146, 66, 190, 86, 0, - - 8, 151, 0, // row 27 - 103, 50, 0, - 0, 32, 0, - 27, 118, 0, - 13, 10, 0, - 42, 104, 0, - 168, 193, 0, - 64, 181, 0, - - 98, 101, 135, 0, // row 28 - 70, 111, 168, 0, - 0, 126, 110, 0, - 216, 212, 193, 0, - 106, 77, 43, 0, - 64, 24, 149, 0, - 14, 186, 46, 0, - 7, 144, 16, 0, + 9, 117, 204, 26, 189, 205, 0, 0, // row 0 + 174, 97, 166, 66, 71, 172, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 72, 110, 23, 181, 95, 8, 1, 0, + 3, 26, 53, 35, 115, 127, 0, 0, + 156, 143, 14, 3, 40, 123, 0, 0, + 143, 19, 176, 165, 196, 13, 0, 0, + 145, 131, 71, 21, 23, 
112, 1, 0, + + 167, 166, 253, 125, 226, 156, 224, 252, 0, 0, // row 1 + 27, 36, 48, 92, 31, 187, 185, 3, 0, 0, + 137, 124, 0, 0, 88, 0, 0, 55, 0, 0, + 53, 156, 115, 156, 115, 200, 29, 31, 0, 0, + 19, 94, 104, 66, 84, 98, 69, 50, 0, 0, + 17, 65, 63, 1, 55, 37, 171, 133, 0, 0, + 18, 27, 3, 102, 185, 17, 14, 180, 0, 0, + 142, 174, 183, 27, 96, 23, 9, 167, 0, 0, + + 81, 114, 44, 52, 240, 1, 0, 0, // row 2 + 25, 114, 117, 110, 114, 1, 0, 0, + 20, 94, 99, 9, 108, 1, 0, 0, + 152, 131, 46, 191, 91, 0, 0, 0, + 95, 106, 92, 110, 111, 1, 0, 0, + 98, 168, 107, 82, 142, 1, 0, 0, + 126, 163, 47, 183, 132, 1, 0, 0, + 74, 31, 3, 53, 155, 0, 0, 0, + + 8, 58, 158, 104, 209, 54, 18, 128, 0, 0, // row 3 + 136, 175, 113, 72, 123, 118, 28, 186, 0, 0, + 38, 15, 102, 146, 12, 57, 53, 46, 0, 0, + 185, 6, 36, 124, 124, 110, 156, 133, 1, 0, + 120, 121, 22, 4, 73, 49, 128, 79, 0, 0, + 53, 174, 174, 127, 17, 89, 17, 105, 0, 0, + 36, 48, 18, 111, 203, 3, 191, 160, 0, 0, + 239, 171, 95, 110, 159, 199, 43, 75, 1, 0, + + 179, 214, 71, 0, // row 4 + 72, 74, 29, 0, + 0, 136, 157, 0, + 200, 16, 101, 0, + 42, 24, 51, 0, + 86, 67, 83, 0, + 43, 27, 117, 0, + 29, 140, 180, 0, + + 231, 41, 194, 159, 103, 0, // row 5 + 10, 44, 121, 80, 48, 0, + 0, 131, 142, 141, 64, 0, + 185, 138, 170, 219, 193, 0, + 40, 140, 84, 137, 71, 0, + 79, 84, 35, 103, 60, 0, + 136, 49, 36, 132, 62, 0, + 121, 41, 169, 88, 207, 0, + + 155, 228, 45, 28, 158, 0, // row 6 + 129, 92, 100, 49, 184, 0, + 0, 124, 99, 45, 148, 0, + 123, 55, 31, 222, 209, 0, + 109, 87, 107, 133, 139, 0, + 47, 154, 10, 155, 29, 0, + 7, 34, 198, 168, 12, 0, + 137, 72, 172, 124, 56, 0, + + 129, 147, 140, 3, 116, 0, // row 7 + 80, 186, 16, 102, 143, 0, + 0, 45, 148, 96, 78, 0, + 103, 13, 105, 150, 181, 0, + 97, 135, 35, 108, 65, 0, + 48, 125, 24, 47, 55, 0, + 163, 78, 143, 107, 58, 0, + 86, 186, 87, 172, 154, 0, + + 142, 94, 230, 0, // row 8 + 118, 70, 152, 0, + 0, 65, 87, 0, + 147, 43, 152, 0, + 70, 69, 88, 0, + 53, 31, 161, 0, + 101, 177, 22, 0, + 176, 169, 
225, 0, + + 203, 205, 61, 247, 0, // row 9 + 28, 132, 185, 178, 0, + 0, 97, 51, 85, 0, + 2, 30, 184, 83, 0, + 97, 40, 24, 49, 0, + 104, 142, 99, 64, 0, + 186, 27, 205, 81, 0, + 167, 238, 48, 68, 0, + + 11, 185, 0, 117, 0, // row 10 + 59, 104, 22, 52, 0, + 0, 17, 156, 20, 0, + 174, 150, 8, 56, 0, + 46, 41, 101, 96, 0, + 111, 25, 174, 23, 0, + 125, 60, 177, 51, 0, + 38, 217, 208, 232, 0, + + 11, 236, 210, 56, 0, // row 11 + 32, 92, 174, 154, 0, + 0, 7, 4, 2, 0, + 99, 138, 110, 99, 0, + 28, 30, 116, 64, 0, + 91, 175, 24, 141, 0, + 39, 29, 35, 8, 0, + 178, 214, 168, 51, 0, + + 63, 111, 14, 0, // row 12 + 39, 93, 11, 0, + 0, 113, 48, 0, + 46, 217, 109, 0, + 33, 122, 131, 0, + 122, 11, 4, 0, + 18, 155, 49, 0, + 124, 122, 72, 0, + + 83, 2, 38, 222, 0, // row 13 + 49, 125, 35, 166, 0, + 0, 112, 102, 26, 0, + 37, 113, 143, 140, 0, + 76, 37, 62, 47, 0, + 29, 91, 27, 127, 0, + 32, 53, 95, 186, 0, + 48, 57, 167, 219, 0, + + 115, 145, 3, 232, 0, // row 14 + 19, 118, 21, 163, 0, + 0, 138, 57, 27, 0, + 36, 95, 40, 116, 0, + 143, 51, 130, 97, 0, + 11, 145, 8, 166, 0, + 91, 20, 52, 109, 0, + 82, 232, 204, 162, 0, + + 51, 175, 213, 0, // row 15 + 68, 63, 81, 0, + 0, 73, 99, 0, + 116, 200, 110, 0, + 139, 96, 128, 0, + 137, 103, 40, 0, + 174, 108, 102, 0, + 38, 217, 157, 0, + + 203, 142, 8, 242, 0, // row 16 + 87, 177, 135, 64, 0, + 0, 79, 111, 143, 0, + 75, 158, 134, 97, 0, + 48, 9, 28, 8, 0, + 78, 158, 17, 165, 0, + 125, 31, 54, 176, 0, + 170, 23, 175, 202, 0, + + 254, 124, 114, 64, 0, // row 17 + 158, 23, 9, 6, 0, + 0, 24, 109, 18, 0, + 48, 132, 206, 2, 0, + 120, 43, 65, 42, 0, + 134, 23, 62, 163, 0, + 57, 201, 142, 35, 0, + 196, 173, 195, 218, 0, + + 220, 194, 50, 0, // row 18 + 186, 6, 46, 0, + 0, 18, 86, 0, + 68, 16, 156, 0, + 17, 106, 142, 0, + 173, 31, 22, 0, + 129, 203, 140, 0, + 128, 211, 210, 0, + + 87, 20, 185, 0, // row 19 + 58, 42, 156, 0, + 0, 158, 154, 0, + 35, 138, 86, 0, + 79, 28, 41, 0, + 13, 135, 145, 0, + 110, 124, 52, 0, + 39, 84, 88, 0, + + 26, 105, 29, 0, // 
row 20 + 76, 61, 153, 0, + 0, 148, 104, 0, + 6, 20, 141, 0, + 2, 103, 78, 0, + 128, 52, 173, 0, + 196, 35, 114, 0, + 117, 227, 6, 0, + + 76, 42, 210, 0, // row 21 + 157, 175, 67, 0, + 0, 17, 33, 0, + 80, 43, 81, 0, + 91, 75, 81, 0, + 156, 166, 40, 0, + 10, 122, 23, 0, + 238, 13, 11, 0, + + 222, 63, 0, // row 22 + 20, 52, 0, + 0, 4, 0, + 49, 1, 0, + 54, 132, 0, + 18, 163, 0, + 202, 126, 0, + 195, 44, 0, + + 23, 235, 238, 0, // row 23 + 106, 86, 95, 0, + 0, 75, 158, 0, + 156, 54, 134, 0, + 68, 115, 56, 0, + 110, 132, 150, 0, + 52, 170, 13, 0, + 5, 94, 111, 0, + + 46, 139, 8, 0, // row 24 + 182, 153, 64, 0, + 0, 69, 87, 0, + 153, 88, 63, 0, + 30, 42, 101, 0, + 113, 108, 61, 0, + 113, 161, 88, 0, + 81, 19, 130, 0, + + 228, 156, 0, // row 25 + 45, 21, 0, + 0, 65, 0, + 211, 94, 0, + 128, 63, 0, + 72, 136, 0, + 197, 194, 0, + 66, 95, 0, + + 29, 143, 160, 122, 0, // row 26 + 67, 137, 55, 85, 0, + 0, 100, 13, 7, 0, + 90, 6, 221, 6, 0, + 142, 28, 100, 133, 0, + 36, 38, 53, 145, 0, + 164, 172, 49, 161, 0, + 146, 66, 190, 86, 0, + + 8, 151, 0, // row 27 + 103, 50, 0, + 0, 32, 0, + 27, 118, 0, + 13, 10, 0, + 42, 104, 0, + 168, 193, 0, + 64, 181, 0, + + 98, 101, 135, 0, // row 28 + 70, 111, 168, 0, + 0, 126, 110, 0, + 216, 212, 193, 0, + 106, 77, 43, 0, + 64, 24, 149, 0, + 14, 186, 46, 0, + 7, 144, 16, 0, 18, 28, 0, // row 29 110, 17, 0, @@ -853,95 +855,95 @@ const uint32_t bg2_shifts[] = { 115, 189, 0, 201, 46, 0, - 242, 44, 166, 0, // row 32 - 84, 8, 17, 0, - 0, 20, 122, 0, - 108, 21, 110, 0, - 32, 89, 71, 0, - 116, 73, 142, 0, - 110, 0, 163, 0, - 179, 14, 116, 0, - - 132, 164, 235, 0, // row 33 - 165, 179, 124, 0, - 0, 88, 13, 0, - 71, 12, 109, 0, - 135, 6, 2, 0, - 105, 137, 29, 0, - 163, 173, 179, 0, - 46, 2, 106, 0, - - 147, 85, 36, 0, // row 34 - 173, 177, 12, 0, - 0, 19, 78, 0, - 29, 201, 69, 0, - 37, 25, 114, 0, - 11, 41, 162, 0, - 197, 191, 193, 0, - 184, 135, 141, 0, - - 57, 40, 63, 0, // row 35 - 77, 184, 18, 0, - 0, 157, 6, 0, - 91, 165, 55, 0, - 60, 137, 93, 0, - 
126, 152, 172, 0, - 157, 167, 181, 0, - 85, 225, 175, 0, - - 140, 38, 154, 0, // row 36 - 25, 151, 170, 0, - 0, 63, 82, 0, - 1, 175, 83, 0, - 121, 129, 26, 0, - 73, 154, 129, 0, - 197, 167, 179, 0, - 178, 112, 106, 0, - - 219, 151, 0, // row 37 - 37, 31, 0, - 0, 144, 0, - 40, 12, 0, - 97, 56, 0, - 167, 38, 0, - 181, 193, 0, - 154, 114, 0, - - 31, 66, 38, 0, // row 38 - 84, 151, 190, 0, - 0, 93, 19, 0, - 37, 97, 46, 0, - 1, 70, 1, 0, - 112, 7, 19, 0, - 157, 173, 191, 0, - 42, 41, 105, 0, - - 239, 172, 34, 0, // row 39 - 93, 132, 57, 0, - 0, 24, 138, 0, - 106, 181, 154, 0, - 119, 32, 142, 0, - 109, 6, 105, 0, - 181, 157, 173, 0, - 167, 45, 189, 0, - - 0, 75, 120, 0, // row 40 - 103, 107, 163, 0, - 0, 36, 143, 0, - 98, 35, 36, 0, - 6, 73, 102, 0, - 160, 156, 82, 0, - 193, 163, 179, 0, - 78, 67, 180, 0, - - 129, 229, 118, 0, // row 41 - 147, 7, 60, 0, - 0, 2, 55, 0, - 120, 101, 81, 0, - 48, 47, 19, 0, - 132, 6, 8, 0, - 191, 197, 167, 0, - 53, 215, 230, 0 + 242, 44, 166, 0, // row 32 + 84, 8, 17, 0, + 0, 20, 122, 0, + 108, 21, 110, 0, + 32, 89, 71, 0, + 116, 73, 142, 0, + 110, 0, 163, 0, + 179, 14, 116, 0, + + 132, 164, 235, 0, // row 33 + 165, 179, 124, 0, + 0, 88, 13, 0, + 71, 12, 109, 0, + 135, 6, 2, 0, + 105, 137, 29, 0, + 163, 173, 179, 0, + 46, 2, 106, 0, + + 147, 85, 36, 0, // row 34 + 173, 177, 12, 0, + 0, 19, 78, 0, + 29, 201, 69, 0, + 37, 25, 114, 0, + 11, 41, 162, 0, + 197, 191, 193, 0, + 184, 135, 141, 0, + + 57, 40, 63, 0, // row 35 + 77, 184, 18, 0, + 0, 157, 6, 0, + 91, 165, 55, 0, + 60, 137, 93, 0, + 126, 152, 172, 0, + 157, 167, 181, 0, + 85, 225, 175, 0, + + 140, 38, 154, 0, // row 36 + 25, 151, 170, 0, + 0, 63, 82, 0, + 1, 175, 83, 0, + 121, 129, 26, 0, + 73, 154, 129, 0, + 197, 167, 179, 0, + 178, 112, 106, 0, + + 219, 151, 0, // row 37 + 37, 31, 0, + 0, 144, 0, + 40, 12, 0, + 97, 56, 0, + 167, 38, 0, + 181, 193, 0, + 154, 114, 0, + + 31, 66, 38, 0, // row 38 + 84, 151, 190, 0, + 0, 93, 19, 0, + 37, 97, 46, 0, + 1, 70, 1, 0, + 112, 7, 19, 0, + 157, 
173, 191, 0, + 42, 41, 105, 0, + + 239, 172, 34, 0, // row 39 + 93, 132, 57, 0, + 0, 24, 138, 0, + 106, 181, 154, 0, + 119, 32, 142, 0, + 109, 6, 105, 0, + 181, 157, 173, 0, + 167, 45, 189, 0, + + 0, 75, 120, 0, // row 40 + 103, 107, 163, 0, + 0, 36, 143, 0, + 98, 35, 36, 0, + 6, 73, 102, 0, + 160, 156, 82, 0, + 193, 163, 179, 0, + 78, 67, 180, 0, + + 129, 229, 118, 0, // row 41 + 147, 7, 60, 0, + 0, 2, 55, 0, + 120, 101, 81, 0, + 48, 47, 19, 0, + 132, 6, 8, 0, + 191, 197, 167, 0, + 53, 215, 230, 0 }; // clang-format on diff --git a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp b/src/UpperPHY/LDPC/arm_ldpc_rate_matching.cpp similarity index 96% rename from src/UpperPHY/LDPC/ldpc_rate_matching.cpp rename to src/UpperPHY/LDPC/arm_ldpc_rate_matching.cpp index 56171f4991100521e23d53520d3b5e3601ed23b8..8efac64e48ccfe79d78021486e11a9f35ced26ae 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_matching.cpp +++ b/src/UpperPHY/LDPC/arm_ldpc_rate_matching.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "ldpc_rate_common.hpp" diff --git a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp b/src/UpperPHY/LDPC/arm_ldpc_rate_recovery.cpp similarity index 96% rename from src/UpperPHY/LDPC/ldpc_rate_recovery.cpp rename to src/UpperPHY/LDPC/arm_ldpc_rate_recovery.cpp index 206e6548253f3357570cde5d7ccbf739e196a5e3..e843d2fa43566dd3d883c776de6417784a66aaa2 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp +++ b/src/UpperPHY/LDPC/arm_ldpc_rate_recovery.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "ldpc_rate_common.hpp" diff --git 
a/src/UpperPHY/LDPC/ldpc_coding.hpp b/src/UpperPHY/LDPC/ldpc_coding.hpp index c69f7531ed093bc064a10fd7693275efb80952f2..046b0659657641e6ce6286e78a10749d5e481acb 100644 --- a/src/UpperPHY/LDPC/ldpc_coding.hpp +++ b/src/UpperPHY/LDPC/ldpc_coding.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/UpperPHY/LDPC/ldpc_rate_common.hpp b/src/UpperPHY/LDPC/ldpc_rate_common.hpp index e4037ea32d1d611f87e677e3efa37c0360caaa30..c27e7ef044302dad6deecb823b760c141fdddd37 100644 --- a/src/UpperPHY/LDPC/ldpc_rate_common.hpp +++ b/src/UpperPHY/LDPC/ldpc_rate_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/UpperPHY/Modulation/arm_modulation.c b/src/UpperPHY/Modulation/arm_modulation.c index 3049942e60e708268eebc0ef2b12891aa64ea8fc..32d6f1de7a3b9e7ce9e9b427890f4d09484c0953 100644 --- a/src/UpperPHY/Modulation/arm_modulation.c +++ b/src/UpperPHY/Modulation/arm_modulation.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp index 341b4b0ce481f64c42dfeed5c094c222d2a4f7ab..2cf6d0df899729159dbaeaef533ee19ee9b24b5b 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration 
Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_crc_check.cpp b/src/UpperPHY/Polar/arm_polar_crc_check.cpp index b0682cd948ac92bd52ebf7cdd99ef990c1631e30..a3264270ba2e658a799d3681feb52ba9de3f644e 100644 --- a/src/UpperPHY/Polar/arm_polar_crc_check.cpp +++ b/src/UpperPHY/Polar/arm_polar_crc_check.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_decoder.cpp b/src/UpperPHY/Polar/arm_polar_decoder.cpp index 83eb4123ecbfb5443aea13675dd8c71cfd73b8a3..db5ab6bf6125142a22ba278e8ecc68f75c3ec0ad 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder.cpp +++ b/src/UpperPHY/Polar/arm_polar_decoder.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "intrinsics.h" diff --git a/src/UpperPHY/Polar/arm_polar_decoder.hpp b/src/UpperPHY/Polar/arm_polar_decoder.hpp index 7989bacdb00d31d5bb72ff79ac1ba569f1bccf62..2bb1c5fb6435c95d3924f8d6c5e8b2398fa4561d 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder.hpp +++ b/src/UpperPHY/Polar/arm_polar_decoder.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma 
once diff --git a/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp index fb20d2ebba37b80e82c9bd0147e3a1d9bbc24b98..60d27136980e7e9fe7edc928d9e146064bf13bbb 100644 --- a/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp +++ b/src/UpperPHY/Polar/arm_polar_decoder_neon.hpp @@ -1,9 +1,12 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once + #include "arm_polar_decoder.hpp" namespace { @@ -504,4 +507,4 @@ inline void combine_hist<1>(const uint8_t * /*hist1*/, // nothing to do if L=1, only one choice of history. } -} // namespace \ No newline at end of file +} // namespace diff --git a/src/UpperPHY/Polar/arm_polar_encoder.c b/src/UpperPHY/Polar/arm_polar_encoder.c index 71ad88e2380bd6a5bef8f9aef7c0fa4fbaaa03f8..7e3a606e0b053e51902d7058e36bd821960377e8 100644 --- a/src/UpperPHY/Polar/arm_polar_encoder.c +++ b/src/UpperPHY/Polar/arm_polar_encoder.c @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp index 5b3767fd5ced4af837d5637f4e71f23e937ad7a5..b38aa7b419935788eada5695ce227ce09ede8a18 100644 --- a/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp +++ b/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git 
a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp index dbf884cbeba6d0151e81eda10aa2d9f3b8d717d3..68991b2566dcae73eb77de5ede8441b2dec101ea 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_matching.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_matching.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp index 06c903ef62a2298fa02480528463f730f1da5213..0e0e16ad123954eb090dff1116ffc1b4491dfd29 100644 --- a/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp +++ b/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp index 6b1b86a39661a2e70df233e97accf48ea712b284..e9a0a20796ab92873ca5e91eeda8d88c7204e8d5 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp index 
192d339f5e75a95bfa7621fcc5167d3e479b8788..e3d683fdc8e6210df8cfbaa96025c57b34f1cb9a 100644 --- a/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp +++ b/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp index cc380fd355add6eeb11495ce0c01c344bd424c9a..4938df71880497006362355b74705b1f7885987c 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_decoder.cpp @@ -1,70 +1,95 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" -#include "intrinsics.h" -#include "turbo_code.hpp" +#include "turbo_code_common.hpp" #include "turbo_tables.hpp" #include "utils/allocators.hpp" -#include "arm_turbo_decoder.hpp" +#include "arm_turbo_decoder_batch.hpp" +#include "arm_turbo_decoder_single.hpp" -template void armral::turbo::decode_block( +// Declarations for no convergence checking +template armral_status armral::turbo::decode( const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, - uint8_t *dst, float32_t l_c, uint32_t max_iter, uint16_t *perm_idxs, - heap_allocator &); + uint8_t *dst, float32_t l_c, uint32_t max_iter, uint32_t num_blocks, + uint16_t *perm_idxs, heap_allocator &, trellis_term_func_t, + decode_step_func_t, trellis_term_func_t, decode_step_func_t); -template void armral::turbo::decode_block( +template armral_status armral::turbo::decode( const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, - uint8_t *dst, float32_t l_c, 
uint32_t max_iter, uint16_t *perm_idxs, - buffer_bump_allocator &); + uint8_t *dst, float32_t l_c, uint32_t max_iter, uint32_t num_blocks, + uint16_t *perm_idxs, buffer_bump_allocator &, trellis_term_func_t, + decode_step_func_t, trellis_term_func_t, decode_step_func_t); +// Permutation indices armral_status armral_turbo_perm_idx_init(uint16_t *buffer) { return armral::turbo::all_perm_idx_init( (armral::turbo::perm_idx_lookup *)buffer); } -template -static armral_status -turbo_decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, - uint32_t k, uint8_t *dst, uint32_t max_iter, - uint16_t *perm_idxs, Allocator &allocator) { - if (!armral::turbo::valid_num_bits(k)) { - return ARMRAL_ARGUMENT_ERROR; - } - // We set the channel reliability parameter l_c to a value that simplifies - // to 1 in the computation of the gamma values. This is justified because - // max-log-MAP decoding is independent of the channel reliability (which is - // itself relative to the channel SNR). For reference see: - // N. Wehn, "Turbo-decoding without SNR estimation", IEEE Communications - // Letters 4(6), pp. 193-195, July 2000. 
- armral::turbo::decode_block(sys, par, itl, k, dst, 2.F, max_iter, - perm_idxs, allocator); - return ARMRAL_SUCCESS; -} - +// Decode wrappers armral_status armral_turbo_decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, uint8_t *dst, uint32_t max_iter, uint16_t *perm_idxs) { heap_allocator allocator{}; - return turbo_decode_block(sys, par, itl, k, dst, max_iter, perm_idxs, - allocator); + return armral::turbo::decode(sys, par, itl, k, dst, 2.F, max_iter, 1, + perm_idxs, allocator, trellis_termination, + decode_block_step, nullptr, nullptr); } armral_status armral_turbo_decode_block_noalloc( const int8_t *sys, const int8_t *par, const int8_t *itl, uint32_t k, uint8_t *dst, uint32_t max_iter, uint16_t *perm_idxs, void *buffer) { buffer_bump_allocator allocator{buffer}; - return turbo_decode_block(sys, par, itl, k, dst, max_iter, perm_idxs, - allocator); + return armral::turbo::decode(sys, par, itl, k, dst, 2.F, max_iter, 1, + perm_idxs, allocator, trellis_termination, + decode_block_step, nullptr, nullptr); +} + +uint32_t armral_turbo_decode_block_noalloc_buffer_size(uint32_t k) { + counting_allocator allocator{}; + (void)armral::turbo::decode( + nullptr, nullptr, nullptr, k, nullptr, 2.F, 1, 1, nullptr, allocator, + trellis_termination, decode_block_step, nullptr, nullptr); + return allocator.required_bytes(); +} + +// Batched decoding also requires the single functions, as any remaining blocks +// (modulo the vector size: 8) will be decoded using them. 
+armral_status armral_turbo_decode_batch(uint32_t num_blocks, const int8_t *sys, + const int8_t *par, const int8_t *itl, + uint32_t k, uint8_t *dst, + uint32_t max_iter, + uint16_t *perm_idxs) { + heap_allocator allocator{}; + return armral::turbo::decode( + sys, par, itl, k, dst, 2.F, max_iter, num_blocks, perm_idxs, allocator, + trellis_termination, decode_block_step, batched_trellis_termination, + decode_batch_step); +} + +armral_status +armral_turbo_decode_batch_noalloc(uint32_t num_blocks, const int8_t *sys, + const int8_t *par, const int8_t *itl, + uint32_t k, uint8_t *dst, uint32_t max_iter, + uint16_t *perm_idxs, void *buffer) { + buffer_bump_allocator allocator{buffer}; + return armral::turbo::decode( + sys, par, itl, k, dst, 2.F, max_iter, num_blocks, perm_idxs, allocator, + trellis_termination, decode_block_step, batched_trellis_termination, + decode_batch_step); } -uint32_t armral_turbo_decode_block_noalloc_buffer_size(uint32_t k, - uint32_t max_iter) { +uint32_t armral_turbo_decode_batch_noalloc_buffer_size(uint32_t k) { counting_allocator allocator{}; - (void)turbo_decode_block(nullptr, nullptr, nullptr, k, nullptr, max_iter, - nullptr, allocator); + (void)armral::turbo::decode( + nullptr, nullptr, nullptr, k, nullptr, 2.F, 0, 8, nullptr, allocator, + trellis_termination, decode_block_step, batched_trellis_termination, + decode_batch_step); return allocator.required_bytes(); } diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder_batch.hpp b/src/UpperPHY/Turbo/arm_turbo_decoder_batch.hpp new file mode 100644 index 0000000000000000000000000000000000000000..59c74fa8c8211205336a8514da4dc8afe6c60176 --- /dev/null +++ b/src/UpperPHY/Turbo/arm_turbo_decoder_batch.hpp @@ -0,0 +1,241 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2024-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "turbo_code_common.hpp" +#include "utils/allocators.hpp" + +#include + +namespace { + +inline 
void batched_turbo_llrs_to_bits(uint32_t k, const int16x8_t *llr, + uint8_t *data_out) { + constexpr uint32_t data_t_size = 8; // data_out is uint8 + // We want to write to the upper half of the uint16 vectors, so that we can + // easily narrow later with vaddhn_u16() + uint16x8_t ones[data_t_size] = { + {32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768}, // 10000000 << 8 + {16384, 16384, 16384, 16384, 16384, 16384, 16384, 16384}, // 01000000 << 8 + {8192, 8192, 8192, 8192, 8192, 8192, 8192, 8192}, // 00100000 << 8 + {4096, 4096, 4096, 4096, 4096, 4096, 4096, 4096}, // 00010000 << 8 + {2048, 2048, 2048, 2048, 2048, 2048, 2048, 2048}, // 00001000 << 8 + {1024, 1024, 1024, 1024, 1024, 1024, 1024, 1024}, // 00000100 << 8 + {512, 512, 512, 512, 512, 512, 512, 512}, // 00000010 << 8 + {256, 256, 256, 256, 256, 256, 256, 256}}; // 00000001 << 8 + + for (uint32_t k8 = 0; k8 < k / data_t_size; ++k8) { + uint16x8_t mask[data_t_size]; + for (uint32_t j = 0; j < data_t_size; ++j) { + uint16x8_t pred = vcltzq_s16(llr[k8 * data_t_size + j]); + // jth bit in mask[j][vi] is jth bit in data_out[vi] + mask[j] = vandq_u16(pred, ones[j]); + } + ((uint8x8_t *)data_out)[k8] = vaddhn_u16( + vqaddq_u16(vqaddq_u16(mask[0], mask[1]), vqaddq_u16(mask[2], mask[3])), + vqaddq_u16(vqaddq_u16(mask[4], mask[5]), vqaddq_u16(mask[6], mask[7]))); + } +} + +// Calculate the trellis termination values. These are independent of the +// extrinsic information and so can be done once without needing to be updated +// on every iteration. +void batched_trellis_termination(const int16x8_t *sys, const int16x8_t *par, + uint32_t k, int16x8_t *beta_tail, + int16x8_t l_c) { + // We handle the gammas for the trellis termination bits separately + // as the state transitions are different. The x_{kl} are never 1 + // here, because we always use inputs of 0 to drive the trellis back + // to state 0 in the encoder, so we only need to consider a smaller + // number of state transitions. 
We also do not have any extrinsic + // information. Because some of the gamma terms will always be + // -INFINITY (specifically indices [1] and [3]) we can forgo adding + // to them to beta or taking the max with them, compared with when + // we calculate beta in the main calculations. As above, we assume + // that the channel reliability parameter l_c/2 = 1. + int16x8_t pdf_00[3] = {vqaddq_s16(sys[k], par[k]), + vqaddq_s16(sys[k + 1], par[k + 1]), + vqaddq_s16(sys[k + 2], par[k + 2])}; + int16x8_t pdf_01[3] = {vqsubq_s16(sys[k], par[k]), + vqsubq_s16(sys[k + 1], par[k + 1]), + vqsubq_s16(sys[k + 2], par[k + 2])}; + + // From each state, there is one path through trellis termination to state 0. + // So no need to do any maxes, or store intermediate results in more betas + beta_tail[0] = vqaddq_s16(pdf_00[0], vqaddq_s16(pdf_00[1], pdf_00[2])); + beta_tail[1] = vqaddq_s16(pdf_01[0], vqaddq_s16(pdf_00[1], pdf_00[2])); + beta_tail[2] = vqaddq_s16(pdf_00[0], vqaddq_s16(pdf_01[1], pdf_00[2])); + beta_tail[3] = vqaddq_s16(pdf_01[0], vqaddq_s16(pdf_01[1], pdf_00[2])); + beta_tail[4] = vqaddq_s16(pdf_01[0], vqaddq_s16(pdf_00[1], pdf_01[2])); + beta_tail[5] = vqaddq_s16(pdf_00[0], vqaddq_s16(pdf_00[1], pdf_01[2])); + beta_tail[6] = vqaddq_s16(pdf_01[0], vqaddq_s16(pdf_01[1], pdf_01[2])); + beta_tail[7] = vqaddq_s16(pdf_00[0], vqaddq_s16(pdf_01[1], pdf_01[2])); +} + +// Assumes l_c is 2 and can be ignored (l_c/2 = 2/2 = 1) +void decode_batch_step(const int16x8_t *sys, const int16x8_t *par, + const int16x8_t *extrinsic, uint32_t k, int16x8_t *llr, + int16x8_t *a, int16x8_t *b_tail, void *g_i16, + int16x8_t l_c) { + + constexpr uint32_t normalize_frequency = 4; + constexpr uint32_t states = 8; // 8 states in the encoder + + // Gamma + // Cast gamma to align function signature with single decode_block_step + int16x8x4_t *g = static_cast(g_i16); + // Gamma[i] = +/- (sys[i] + extrinsic[i]/2) +/- par[i] + // So for each k idx there are 4 Gammas (+ +, + -, - +, - -). 
+ // So store 4 per k and label each transition with its corresponding offset + + // + + + constexpr uint32_t t00 = 0; + constexpr uint32_t t14 = 0; + constexpr uint32_t t67 = 0; + constexpr uint32_t t73 = 0; + // + - + constexpr uint32_t t25 = 1; + constexpr uint32_t t31 = 1; + constexpr uint32_t t42 = 1; + constexpr uint32_t t56 = 1; + // - + + constexpr uint32_t t21 = 2; + constexpr uint32_t t35 = 2; + constexpr uint32_t t46 = 2; + constexpr uint32_t t52 = 2; + // - - + constexpr uint32_t t04 = 3; + constexpr uint32_t t10 = 3; + constexpr uint32_t t63 = 3; + constexpr uint32_t t77 = 3; + + for (uint32_t i = 0; i < k; ++i) { + int16x8_t term = vqaddq_s16(extrinsic[i] >> 1, sys[i]); + g[i].val[0] = vqaddq_s16(term, par[i]); // + + + g[i].val[1] = vqsubq_s16(term, par[i]); // + - + g[i].val[2] = vqsubq_s16(par[i], term); // - + + g[i].val[3] = vqsubq_s16(vqnegq_s16(par[i]), term); // - - + } + + // Alpha + for (uint32_t i = 0; i < k; ++i) { + uint32_t k_idx = states * i; + uint32_t kp1_idx = states * (i + 1); + + a[kp1_idx + 0] = vmaxq_s16(vqaddq_s16(g[i].val[t00], a[k_idx + 0]), + vqaddq_s16(g[i].val[t10], a[k_idx + 1])); + a[kp1_idx + 1] = vmaxq_s16(vqaddq_s16(g[i].val[t21], a[k_idx + 2]), + vqaddq_s16(g[i].val[t31], a[k_idx + 3])); + a[kp1_idx + 2] = vmaxq_s16(vqaddq_s16(g[i].val[t42], a[k_idx + 4]), + vqaddq_s16(g[i].val[t52], a[k_idx + 5])); + a[kp1_idx + 3] = vmaxq_s16(vqaddq_s16(g[i].val[t63], a[k_idx + 6]), + vqaddq_s16(g[i].val[t73], a[k_idx + 7])); + a[kp1_idx + 4] = vmaxq_s16(vqaddq_s16(g[i].val[t04], a[k_idx + 0]), + vqaddq_s16(g[i].val[t14], a[k_idx + 1])); + a[kp1_idx + 5] = vmaxq_s16(vqaddq_s16(g[i].val[t25], a[k_idx + 2]), + vqaddq_s16(g[i].val[t35], a[k_idx + 3])); + a[kp1_idx + 6] = vmaxq_s16(vqaddq_s16(g[i].val[t46], a[k_idx + 4]), + vqaddq_s16(g[i].val[t56], a[k_idx + 5])); + a[kp1_idx + 7] = vmaxq_s16(vqaddq_s16(g[i].val[t67], a[k_idx + 6]), + vqaddq_s16(g[i].val[t77], a[k_idx + 7])); + // Normalize + if (i % normalize_frequency == 0) { + 
a[kp1_idx + 1] = vqsubq_s16(a[kp1_idx + 1], a[kp1_idx]); + a[kp1_idx + 2] = vqsubq_s16(a[kp1_idx + 2], a[kp1_idx]); + a[kp1_idx + 3] = vqsubq_s16(a[kp1_idx + 3], a[kp1_idx]); + a[kp1_idx + 4] = vqsubq_s16(a[kp1_idx + 4], a[kp1_idx]); + a[kp1_idx + 5] = vqsubq_s16(a[kp1_idx + 5], a[kp1_idx]); + a[kp1_idx + 6] = vqsubq_s16(a[kp1_idx + 6], a[kp1_idx]); + a[kp1_idx + 7] = vqsubq_s16(a[kp1_idx + 7], a[kp1_idx]); + + a[kp1_idx + 0] = vdupq_n_s16(0); + } + } + + // LLR and Beta + // b_tail should be already initialized by trellis termination + int16x8_t b_kp1[states]; + for (uint32_t s = 0; s < states; ++s) { + b_kp1[s] = b_tail[s]; + } + + int16x8_t b[states]; + for (int32_t i = k - 1; i >= 0; --i) { + uint32_t k_idx = states * i; + + // Normalize beta_kp1 + if (i % normalize_frequency == 0) { + b_kp1[1] = vqsubq_s16(b[1], b_kp1[0]); + b_kp1[2] = vqsubq_s16(b[2], b_kp1[0]); + b_kp1[3] = vqsubq_s16(b[3], b_kp1[0]); + b_kp1[4] = vqsubq_s16(b[4], b_kp1[0]); + b_kp1[5] = vqsubq_s16(b[5], b_kp1[0]); + b_kp1[6] = vqsubq_s16(b[6], b_kp1[0]); + b_kp1[7] = vqsubq_s16(b[7], b_kp1[0]); + b_kp1[0] = vdupq_n_s16(0); + } + // Beta + b[0] = vmaxq_s16(vqaddq_s16(g[i].val[t00], b_kp1[0]), + vqaddq_s16(g[i].val[t04], b_kp1[4])); + b[1] = vmaxq_s16(vqaddq_s16(g[i].val[t10], b_kp1[0]), + vqaddq_s16(g[i].val[t14], b_kp1[4])); + b[2] = vmaxq_s16(vqaddq_s16(g[i].val[t21], b_kp1[1]), + vqaddq_s16(g[i].val[t25], b_kp1[5])); + b[3] = vmaxq_s16(vqaddq_s16(g[i].val[t31], b_kp1[1]), + vqaddq_s16(g[i].val[t35], b_kp1[5])); + b[4] = vmaxq_s16(vqaddq_s16(g[i].val[t42], b_kp1[2]), + vqaddq_s16(g[i].val[t46], b_kp1[6])); + b[5] = vmaxq_s16(vqaddq_s16(g[i].val[t52], b_kp1[2]), + vqaddq_s16(g[i].val[t56], b_kp1[6])); + b[6] = vmaxq_s16(vqaddq_s16(g[i].val[t63], b_kp1[3]), + vqaddq_s16(g[i].val[t67], b_kp1[7])); + b[7] = vmaxq_s16(vqaddq_s16(g[i].val[t73], b_kp1[3]), + vqaddq_s16(g[i].val[t77], b_kp1[7])); + + // LLR + int16x8_t prob_0 = vmaxq_s16( + vmaxq_s16( + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 
0], b_kp1[0]), g[i].val[t00]), + vqaddq_s16(vqaddq_s16(a[k_idx + 1], b_kp1[4]), g[i].val[t14])), + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 2], b_kp1[5]), g[i].val[t25]), + vqaddq_s16(vqaddq_s16(a[k_idx + 3], b_kp1[1]), g[i].val[t31]))), + vmaxq_s16( + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 4], b_kp1[2]), g[i].val[t42]), + vqaddq_s16(vqaddq_s16(a[k_idx + 5], b_kp1[6]), g[i].val[t56])), + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 6], b_kp1[7]), g[i].val[t67]), + vqaddq_s16(vqaddq_s16(a[k_idx + 7], b_kp1[3]), + g[i].val[t73])))); + int16x8_t prob_1 = vmaxq_s16( + vmaxq_s16( + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 0], b_kp1[4]), g[i].val[t04]), + vqaddq_s16(vqaddq_s16(a[k_idx + 1], b_kp1[0]), g[i].val[t10])), + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 2], b_kp1[1]), g[i].val[t21]), + vqaddq_s16(vqaddq_s16(a[k_idx + 3], b_kp1[5]), g[i].val[t35]))), + vmaxq_s16( + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 4], b_kp1[6]), g[i].val[t46]), + vqaddq_s16(vqaddq_s16(a[k_idx + 5], b_kp1[2]), g[i].val[t52])), + vmaxq_s16( + vqaddq_s16(vqaddq_s16(a[k_idx + 6], b_kp1[3]), g[i].val[t63]), + vqaddq_s16(vqaddq_s16(a[k_idx + 7], b_kp1[7]), + g[i].val[t77])))); + + llr[i] = vqsubq_s16(prob_0, prob_1); + + for (uint32_t s = 0; s < states; ++s) { + b_kp1[s] = b[s]; + } + } +} + +} // namespace diff --git a/src/UpperPHY/Turbo/arm_turbo_decoder.hpp b/src/UpperPHY/Turbo/arm_turbo_decoder_single.hpp similarity index 50% rename from src/UpperPHY/Turbo/arm_turbo_decoder.hpp rename to src/UpperPHY/Turbo/arm_turbo_decoder_single.hpp index f2e3012b2f606540b5925e091c9a402ec9829aa1..8e36f5aae501b6c312b6c89a586be833e4f2ddfb 100644 --- a/src/UpperPHY/Turbo/arm_turbo_decoder.hpp +++ b/src/UpperPHY/Turbo/arm_turbo_decoder_single.hpp @@ -1,9 +1,13 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: 
BSD-3-Clause */ #pragma once +#include "intrinsics.h" + #include namespace { @@ -12,43 +16,11 @@ struct int16x4x8_t { int16x4_t val[8]; }; -// With Turbo codes n (=k) is always divisible by 8 so we -// do not have to worry about tail bits -inline void turbo_llrs_to_bits(uint32_t n, const int16x8_t *llr, - uint8_t *data_out) { - uint32_t full_bytes = n >> 3; - constexpr uint16x8_t ones = {128, 64, 32, 16, 8, 4, 2, 1}; - - for (uint32_t i = 0; i < full_bytes; ++i) { - // The first bit to write in the byte is the most significant - uint16x8_t pred = vcltzq_s16(llr[i]); - uint16x8_t mask = vandq_u16(pred, ones); - data_out[i] = (uint8_t)vaddvq_u16(mask); - } -} - -// Take the input int8_t LLRs and convert them to int16x8_ts -inline void convert_llrs(uint32_t k, const int8_t *llrs, int16x8_t *llrs_i16) { - // With turbo codes k is always a multiple of 8 so we do 8 LLRs at a time - for (uint32_t i = 0, j = 0; i < k; i += 8, j++) { - llrs_i16[j] = vshll_n_s8(vld1_s8(&llrs[i]), 0); - } -} - -// Update the extrinsic information output from the decoding stage -// based on the computed LLRs, the old extrinsic information and the input -inline void update_extrinsic(uint32_t len, const int16x8_t *llr, - int16x8_t *extrinsic, const int16x8_t *input) { - for (uint32_t i = 0; i < len; i++) { - extrinsic[i] = vqsubq_s16(vqsubq_s16(llr[i], extrinsic[i]), input[i]); - } -} - // Calculate the trellis termination values. These are independent of the // extrinsic information and so can be done once without needing to be updated // on every iteration. -int16x8_t trellis_termination(const int16x8_t *sys, const int16x8_t *par, - uint32_t k8, int16x8_t l_c) { +void trellis_termination(const int16x8_t *sys, const int16x8_t *par, + uint32_t k8, int16x8_t *beta_tail, int16x8_t l_c) { // We handle the gammas for the trellis termination bits separately // as the state transitions are different. 
The x_{kl} are never 1 // here, because we always use inputs of 0 to drive the trellis back @@ -75,18 +47,21 @@ int16x8_t trellis_termination(const int16x8_t *sys, const int16x8_t *par, int16x8_t b0123 = vzip1q_s16(beta_term, beta_term); - return vqaddq_s16(g, b0123); + *beta_tail = vqaddq_s16(g, b0123); } // A single max-log-MAP decoder that works on an array of systematic bits (sys), // an array of parity bits (par), and an array of extrinsic values from a // previous decoding stage (extrinsic) -void decode_step(const int16x8_t *sys, const int16x8_t *par, - const int16x8_t *extrinsic, uint32_t k8, int16x8_t *llr, - int16x8_t *alpha, int16x8_t beta_tail, int16x4x8_t *pdf4, - int16x8_t l_c) { +void decode_block_step(const int16x8_t *sys, const int16x8_t *par, + const int16x8_t *extrinsic, uint32_t k8, int16x8_t *llr, + int16x8_t *alpha, int16x8_t *beta_tail, void *gamma_i16, + int16x8_t l_c) { + constexpr uint32_t normalize_frequency = 4; uint32_t k_idx; uint32_t kp1_idx; + // cast gamma to align function signature with batched decode_step + int16x4x8_t *pdf4 = static_cast(gamma_i16); // Start by computing the non-zero conditional state transition probabilities // from state s' to state s for every k, denoted gamma_k(s',s). 
In general for @@ -183,7 +158,7 @@ void decode_step(const int16x8_t *sys, const int16x8_t *par, alpha[kp1_idx] = vmaxq_s16(left, right); // Normalize alpha - if (j % 4 == 0) { + if (j % normalize_frequency == 0) { int16x8_t alpha0 = vdupq_n_s16(alpha[kp1_idx][0]); alpha[kp1_idx] = vqsubq_s16(alpha[kp1_idx], alpha0); } @@ -213,7 +188,7 @@ void decode_step(const int16x8_t *sys, const int16x8_t *par, constexpr uint8x16_t idx_1302 = {6, 7, 0, 1, 4, 5, 2, 3, 2, 3, 4, 5, 0, 1, 6, 7}; - int16x8_t beta_kp1 = beta_tail; + int16x8_t beta_kp1 = *beta_tail; for (int32_t i = k8 - 1; i >= 0; i--) { int16x8_t prob_0; @@ -223,7 +198,7 @@ void decode_step(const int16x8_t *sys, const int16x8_t *par, k_idx = 8 * i + j; // Normalize beta - if (j % 4 == 0) { + if (j % normalize_frequency == 0) { int16x8_t beta0 = vdupq_n_s16(beta_kp1[0]); beta_kp1 = vqsubq_s16(beta_kp1, beta0); } @@ -252,8 +227,10 @@ void decode_step(const int16x8_t *sys, const int16x8_t *par, int16x8_t beta_k = vmaxq_s16(left, right); - // a0213 = {alpha[k_idx][0], alpha[k_idx][2], alpha[k_idx][4], alpha[k_idx][6], - // alpha[k_idx][1], alpha[k_idx][3], alpha[k_idx][5], alpha[k_idx][7]}; + // a0213 = {alpha[k_idx][0], alpha[k_idx][2], + // alpha[k_idx][4], alpha[k_idx][6], + // alpha[k_idx][1], alpha[k_idx][3], + // alpha[k_idx][5], alpha[k_idx][7]}; int16x8_t a0213 = vreinterpretq_s16_u8( vqtbl1q_u8(vreinterpretq_u8_s16(alpha[k_idx]), idx_even_odd)); @@ -285,200 +262,3 @@ void decode_step(const int16x8_t *sys, const int16x8_t *par, } } // namespace - -// The template parameter allows us to disable checking for convergence (and -// thus terminating the iterations early) so we always run a fixed number of -// iterations in our benchmarking -template -void armral::turbo::decode_block(const int8_t *sys, const int8_t *par, - const int8_t *itl, uint32_t k, uint8_t *dst, - float32_t l_c, uint32_t max_iter, - uint16_t *perm_idxs, Allocator &allocator) { - // This implements multiple steps of the max-log-MAP algorithm, - // 
which is an approximation to the MAP (BCJR) algorithm. It returns - // a hard decision rather than raw LLRs. - - // We will be working with int16x8_t, so work out how many of these - // will be needed to store k int16_ts. k is always a multiple of 8, - // so no need to worry about remainders. - uint32_t k8 = k >> 3; - - auto sys_s16 = allocate_uninitialized(allocator, k8 + 1); - auto par_s16 = allocate_uninitialized(allocator, k8 + 1); - auto itl_s16 = allocate_uninitialized(allocator, k8 + 1); - - auto perm_sys = allocate_uninitialized(allocator, k8 + 1); - - // Allocate space to hold the extrinsic and permuted extrinsic information - // to be passed between the two decoders. Extrinsic is initially set to 0. - auto extrinsic = allocate_zeroed(allocator, k8); - auto perm_extrinsic = allocate_zeroed(allocator, k8); - - // Allocate space for log likelihood ratios from both stages of decoding - auto l1_uky = allocate_uninitialized(allocator, k8); - auto l2_uky = allocate_uninitialized(allocator, k8); - auto prev_l2_uky = allocate_zeroed(allocator, k8); - - // Allocate space to hold alpha and gamma - // alpha stores the forward-accumulated state probabilities for each decoded - // bit, where the LTE encoder has 8 states and there are k+3 bits to decode - // plus the starting condition - auto alpha = allocate_uninitialized(allocator, 8 * k8 + 1); - - // gamma stores the conditional state transition probabilities for each of the - // k+3 bits to decode - auto gamma = allocate_uninitialized(allocator, k8); - - // Get the permutation vector for the input value of k. - // declare unique_ptr here to keep the allocated memory's scope outside the - // else block - unique_ptr perm_lookup_unique; - perm_idx_lookup *perm_lookup = nullptr; - // Find the index into the array of parameter arrays corresponding - // to the current k. Subtract 40 because k=40 is the lowest value. 
- uint32_t param_idx = armral::turbo::perm_params_lookup[(k - 40) >> 3]; - if (perm_idxs != NULL) { - // NOTE: All allocations done. - if constexpr (Allocator::is_counting) { - return; - } - perm_lookup = (perm_idx_lookup *)perm_idxs + - armral::turbo::perm_lookup_offset[param_idx]; - } else { - perm_lookup_unique = allocate_uninitialized(allocator, k); - - // NOTE: All allocations done. - if constexpr (Allocator::is_counting) { - return; - } - - perm_lookup = perm_lookup_unique.get(); - - // Generate the permutation vector for the input value of k. - armral::turbo::k_perm_idx_init(k, param_idx, perm_lookup); - } - - // Convert our LLRs from int8_ts into int16_ts - convert_llrs(k, sys, sys_s16.get()); - convert_llrs(k, par, par_s16.get()); - convert_llrs(k, itl, itl_s16.get()); - - // Unperturb the trellis termination bits. They are transmitted as: - // X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 - // Z'2 - // but need to appended to the inputs as: - // X0 X1 X2 Z0 Z1 Z2 X'0 X'1 X'2 Z'0 Z'1 - // Z'2 - // We append to the systematic (X), the parity (Z) and the interleaved parity - // (Z') values here, and to the interleaved systematic values (X') further - // down. 
- sys_s16[k8][0] = (int16_t)sys[k]; - sys_s16[k8][1] = (int16_t)itl[k]; - sys_s16[k8][2] = (int16_t)par[k + 1]; - - par_s16[k8][0] = (int16_t)par[k]; - par_s16[k8][1] = (int16_t)sys[k + 1]; - par_s16[k8][2] = (int16_t)itl[k + 1]; - - itl_s16[k8][0] = (int16_t)par[k + 2]; - itl_s16[k8][1] = (int16_t)sys[k + 3]; - itl_s16[k8][2] = (int16_t)itl[k + 3]; - - // Prescale l_c to avoid doing it repeatedly in the PDF calculations later - const int16x8_t channel_reliability = vdupq_n_s16((int16_t)l_c / 2); - - // Create a permuted version of the systematic output for use - // with the second decoder - for (uint32_t i = 0; i < k8; i++) { - for (uint32_t j = 0; j < 8; j++) { - perm_sys[i][j] = (int16_t)sys[perm_lookup[(i * 8) + j].perm_idx]; - } - } - perm_sys[k8][0] = (int16_t)sys[k + 2]; - perm_sys[k8][1] = (int16_t)itl[k + 2]; - perm_sys[k8][2] = (int16_t)par[k + 3]; - - // Initialize alpha - alpha[0] = vdupq_n_s16(std::numeric_limits::min()); - alpha[0][0] = 0; - - // Calculate the trellis termination state transition probabilities, which - // do not require any extrinsic information - int16x8_t beta_tail = trellis_termination(sys_s16.get(), par_s16.get(), k8, - channel_reliability); - int16x8_t perm_beta_tail = trellis_termination(perm_sys.get(), itl_s16.get(), - k8, channel_reliability); - - // Initialize the number of iterations - uint32_t num_iter = 0; - - while (num_iter < max_iter) { - // Run the first decoder step - decode_step(sys_s16.get(), par_s16.get(), extrinsic.get(), k8, l1_uky.get(), - alpha.get(), beta_tail, gamma.get(), channel_reliability); - - // Compute the new extrinsic information to pass into the second decoder - update_extrinsic(k8, l1_uky.get(), extrinsic.get(), sys_s16.get()); - - // Need to unpermute extrinsic to match input to second decoder - for (uint32_t i = 0; i < k8; i++) { - for (uint32_t j = 0; j < 8; j++) { - perm_extrinsic[i][j] = extrinsic[perm_lookup[i * 8 + j].vec_idx] - [perm_lookup[i * 8 + j].vec_lane]; - } - } - - // Run the 
second decoder step - decode_step(perm_sys.get(), itl_s16.get(), perm_extrinsic.get(), k8, - l2_uky.get(), alpha.get(), perm_beta_tail, gamma.get(), - channel_reliability); - - // Compute the new extrinsic information to pass back into the first encoder - update_extrinsic(k8, l2_uky.get(), perm_extrinsic.get(), perm_sys.get()); - - // But need to unpermute extrinsic first - for (uint32_t i = 0; i < k8; i++) { - for (uint32_t j = 0; j < 8; j++) { - extrinsic[perm_lookup[i * 8 + j].vec_idx] - [perm_lookup[i * 8 + j].vec_lane] = perm_extrinsic[i][j]; - } - } - - // Compare this iteration's results with those from the previous iteration - int16_t max_abs_diff = 0; - for (uint32_t i = 0; i < k8; i++) { - int16_t abs_diff = - vmaxvq_s16(vqabsq_s16(vqsubq_s16(l2_uky[i], prev_l2_uky[i]))); - if (abs_diff > max_abs_diff) { - max_abs_diff = abs_diff; - } - } - - // If we've converged, finish decoding - if constexpr (check_convergence) { - if (max_abs_diff == 0) { - break; - } - } - - // Store the current "final" LLRs to use in convergence checking next - // iteration - for (uint32_t i = 0; i < k8; i++) { - prev_l2_uky[i] = l2_uky[i]; - } - - num_iter++; - } - - // Return unpermuted final output from second encoder - // Rather than allocate another new vector, copy into l1_uky and return that - for (uint32_t i = 0; i < k8; i++) { - for (uint32_t j = 0; j < 8; j++) { - l1_uky[perm_lookup[i * 8 + j].vec_idx][perm_lookup[i * 8 + j].vec_lane] = - l2_uky[i][j]; - } - } - - // Make a hard decision based on the final LLRs - turbo_llrs_to_bits(k, l1_uky.get(), dst); -} diff --git a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp index 896519eb2e4e6872a6ff5677148e5aa2e892220e..0c8455efa1c548488da88fd1935bdacfa74324a9 100644 --- a/src/UpperPHY/Turbo/arm_turbo_encoder.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_encoder.cpp @@ -1,9 +1,11 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" -#include "turbo_code.hpp" +#include "turbo_code_common.hpp" #include "turbo_tables.hpp" #include "utils/allocators.hpp" diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp index beac775f134a2d1487cac429a48a4581080f3ddb..8da2d51f212ddcb84b5b9473c94dee59475e7fb3 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "turbo_tables.hpp" diff --git a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp index 02dfa286ac292118b95adfb676675ecfddf5402e..daf910e4b84630c1fe6645cf878ce6e9c5a3a7c7 100644 --- a/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp +++ b/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/src/UpperPHY/Turbo/turbo_code.hpp b/src/UpperPHY/Turbo/turbo_code.hpp deleted file mode 100644 index 6d56f1d51029753c908f6ab25d4d959abb78528a..0000000000000000000000000000000000000000 --- a/src/UpperPHY/Turbo/turbo_code.hpp +++ /dev/null @@ -1,104 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates -*/ -#pragma once - -#include "armral.h" -#include "turbo_tables.hpp" - -namespace armral::turbo { - -// Check that the number of bits, k, is one of the 
valid -// values for LTE turbo coding as specified in TS36.212 -inline bool valid_num_bits(uint32_t k) { - if (k < 40) { - return false; - } - if (k <= 512) { - return k % 8 == 0; - } - if (k <= 1024) { - return k % 16 == 0; - } - if (k <= 2048) { - return k % 32 == 0; - } - if (k <= 6144) { - return k % 64 == 0; - } - return false; -} - -// Generate the permuted index for given value of k using the polynomial -// scheme described in TS36.212 -inline uint16_t generate_perm_idx(uint32_t i, uint16_t f1, uint16_t f2, - uint32_t k) { - // 0 <= perm < 6144 but f2*i*i may be much larger - return static_cast((uint64_t(f1) * i + uint64_t(f2) * i * i) % k); -} - -struct perm_idx_lookup { - uint16_t perm_idx; - uint16_t vec_idx; - uint16_t vec_lane; -}; - -inline void k_perm_idx_init(uint16_t k, uint16_t k_idx, - perm_idx_lookup *perm_idxs) { - // Extract the correct values of f1 and f2 to build the - // interleaving polynomial - uint16_t f1 = armral::turbo::perm_params[k_idx][0]; - uint16_t f2 = armral::turbo::perm_params[k_idx][1]; - // Generate the permutation vector for the input value of k. 
- for (uint16_t i = 0; i < k; ++i) { - uint16_t perm_idx = generate_perm_idx(i, f1, f2, k); - perm_idxs[i].perm_idx = perm_idx; - perm_idxs[i].vec_idx = perm_idx / 8; - perm_idxs[i].vec_lane = perm_idx % 8; - } -} - -inline armral_status all_perm_idx_init(perm_idx_lookup *buffer) { - - uint16_t k = 40; - uint16_t k_idx = 0; - for (; k < 512; k += 8, ++k_idx) { - k_perm_idx_init(k, k_idx, - buffer + armral::turbo::perm_lookup_offset[k_idx]); - } - for (; k < 1024; k += 16, ++k_idx) { - k_perm_idx_init(k, k_idx, - buffer + armral::turbo::perm_lookup_offset[k_idx]); - } - for (; k < 2048; k += 32, ++k_idx) { - k_perm_idx_init(k, k_idx, - buffer + armral::turbo::perm_lookup_offset[k_idx]); - } - for (; k <= 6144; k += 64, ++k_idx) { - k_perm_idx_init(k, k_idx, - buffer + armral::turbo::perm_lookup_offset[k_idx]); - } - - return ARMRAL_SUCCESS; -} - -// An "expert" interface for Turbo decoding a single block. It allows the user -// to specify a channel reliability measure L_c, which should be computed as: -// -// E_b -// L_c = 4 a R_c --- -// N_0 -// -// where a is the fading amplitude, R_c is the code rate, E_b is the -// energy per bit, and N_0 is the noise power spectral density ratio. 
-// -// The template parameter allows us to disable checking for convergence (and -// thus terminating the iterations early) so we always run a fixed number of -// iterations in our benchmarking -template -void decode_block(const int8_t *sys, const int8_t *par, const int8_t *itl, - uint32_t k, uint8_t *dst, float32_t l_c, uint32_t max_iter, - uint16_t *perm_idxs, Allocator &allocator); - -} // namespace armral::turbo diff --git a/src/UpperPHY/Turbo/turbo_code_common.hpp b/src/UpperPHY/Turbo/turbo_code_common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..027d081cc772a629ad8f9fc29cacbb3d8a6099f5 --- /dev/null +++ b/src/UpperPHY/Turbo/turbo_code_common.hpp @@ -0,0 +1,449 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#pragma once + +#include "armral.h" +#include "intrinsics.h" +#include "turbo_tables.hpp" +#include "utils/allocators.hpp" + +#include + +namespace armral::turbo { + +// Check that the number of bits, k, is one of the valid +// values for LTE turbo coding as specified in TS36.212 +inline bool valid_num_bits(uint32_t k) { + if (k < 40) { + return false; + } + if (k <= 512) { + return k % 8 == 0; + } + if (k <= 1024) { + return k % 16 == 0; + } + if (k <= 2048) { + return k % 32 == 0; + } + if (k <= 6144) { + return k % 64 == 0; + } + return false; +} + +// Generate the permuted index for given value of k using the polynomial +// scheme described in TS36.212 +inline uint16_t generate_perm_idx(uint32_t i, uint16_t f1, uint16_t f2, + uint32_t k) { + // 0 <= perm < 6144 but f2*i*i may be much larger + return static_cast((uint64_t(f1) * i + uint64_t(f2) * i * i) % k); +} + +struct perm_idx_lookup { + uint16_t perm_idx; + uint16_t vec_idx; + uint16_t vec_lane; +}; + +inline void k_perm_idx_init(uint16_t k, uint16_t k_idx, + perm_idx_lookup *perm_idxs) { + // Extract the correct values of f1 and f2 to build the + // 
interleaving polynomial + uint16_t f1 = perm_params[k_idx][0]; + uint16_t f2 = perm_params[k_idx][1]; + // Generate the permutation vector for the input value of k. + for (uint16_t i = 0; i < k; ++i) { + uint16_t perm_idx = generate_perm_idx(i, f1, f2, k); + perm_idxs[i].perm_idx = perm_idx; + perm_idxs[i].vec_idx = perm_idx / 8; + perm_idxs[i].vec_lane = perm_idx % 8; + } +} + +inline armral_status all_perm_idx_init(perm_idx_lookup *buffer) { + + uint16_t k = 40; + uint16_t k_idx = 0; + for (; k < 512; k += 8, ++k_idx) { + k_perm_idx_init(k, k_idx, buffer + perm_lookup_offset[k_idx]); + } + for (; k < 1024; k += 16, ++k_idx) { + k_perm_idx_init(k, k_idx, buffer + perm_lookup_offset[k_idx]); + } + for (; k < 2048; k += 32, ++k_idx) { + k_perm_idx_init(k, k_idx, buffer + perm_lookup_offset[k_idx]); + } + for (; k <= 6144; k += 64, ++k_idx) { + k_perm_idx_init(k, k_idx, buffer + perm_lookup_offset[k_idx]); + } + + return ARMRAL_SUCCESS; +} + +// Take the input int8_t LLRs and convert them to int16x8_ts. 
+// This can be used as is for both the batched and the single case +inline void convert_llrs(uint32_t length, const int8_t *llrs, + int16x8_t *llrs_i16) { + constexpr uint32_t vec_len = 8; + uint32_t i = 0; + uint32_t v = 0; + for (; i < length - vec_len + 1; i += vec_len, ++v) { + llrs_i16[v] = vshll_n_s8(vld1_s8(&llrs[i]), 0); + } + for (uint32_t r = 0; r < length % vec_len; ++r) { + llrs_i16[v][r] = static_cast(llrs[i + r]); + } +} + +// Update the extrinsic information output from the decoding stage +// based on the computed LLRs, the old extrinsic information and the input +// This can be used as is for both the batched and the single case +inline void update_extrinsic(uint32_t length, const int16x8_t *llr, + int16x8_t *extrinsic, const int16x8_t *input) { + for (uint32_t i = 0; i < length; i++) { + extrinsic[i] = vqsubq_s16(vqsubq_s16(llr[i], extrinsic[i]), input[i]); + } +} + +// With Turbo codes n (=k) is always divisible by 8 so we +// do not have to worry about tail bits +inline void turbo_llrs_to_bits(uint32_t n, const int16x8_t *llr, + uint8_t *data_out) { + uint32_t full_bytes = n >> 3; + constexpr uint16x8_t ones = {128, 64, 32, 16, 8, 4, 2, 1}; + + for (uint32_t i = 0; i < full_bytes; ++i) { + // The first bit to write in the byte is the most significant + uint16x8_t pred = vcltzq_s16(llr[i]); + uint16x8_t mask = vandq_u16(pred, ones); + data_out[i] = (uint8_t)vaddvq_u16(mask); + } +} + +template +void permute(perm_idx_lookup *perm_lookup, Vec *from, Vec *to, + uint32_t arr_length, uint32_t vec_length = 8) { + for (uint32_t i = 0; i < arr_length; ++i) { + if constexpr (batched) { + to[i] = from[perm_lookup[i].perm_idx]; + } else { + for (uint32_t j = 0; j < vec_length; j++) { + to[i][j] = from[perm_lookup[(i * vec_length) + j].vec_idx] + [perm_lookup[(i * vec_length) + j].vec_lane]; + } + } + } +} + +template +void unpermute(perm_idx_lookup *perm_lookup, Vec *from, Vec *to, + uint32_t arr_length, uint32_t vec_length = 8) { + for (uint32_t i = 0; i < 
arr_length; i++) { + if constexpr (batched) { + to[perm_lookup[i].perm_idx] = from[i]; + } else { + for (uint32_t j = 0; j < vec_length; j++) { + to[perm_lookup[i * vec_length + j].vec_idx] + [perm_lookup[i * vec_length + j].vec_lane] = from[i][j]; + } + } + } +} + +template +void interleave(const T *src, uint32_t ldsrc, T *dst, uint32_t lddst) { + for (uint32_t a = 0; a < ldsrc; ++a) { + for (uint32_t b = 0; b < lddst; ++b) { + dst[a * lddst + b] = src[b * ldsrc + a]; + } + } +} + +using trellis_term_func_t = void (*)(const int16x8_t *sys, const int16x8_t *par, + uint32_t k, int16x8_t *beta_tail, + int16x8_t l_c); +using decode_step_func_t = void (*)(const int16x8_t *sys, const int16x8_t *par, + const int16x8_t *extrinsic, uint32_t k, + int16x8_t *llr, int16x8_t *alpha, + int16x8_t *beta_tail, void *gamma, + int16x8_t l_c); + +// We set the channel reliability parameter l_c to a value that simplifies +// to 1 in the computation of the gamma values. This is justified because +// max-log-MAP decoding is independent of the channel reliability (which is +// itself relative to the channel SNR). For reference see: +// N. Wehn, "Turbo-decoding without SNR estimation", IEEE Communications +// Letters 4(6), pp. 193-195, July 2000. +template +armral_status +decode_loop(int16x8_t *sys, int16x8_t *pys, int16x8_t *par, int16x8_t *itl, + int16x8_t *extrinsic, int16x8_t *perm_extrinsic, int16x8_t *llr, + int16x8_t *perm_llr, int16x8_t *prev_perm_llr, int16x8_t *alpha, + int16x8_t *beta_tail, int16x8_t *perm_beta_tail, int16_t *gamma, + uint32_t k, uint8_t *dst, float32_t l_c, uint32_t max_iter, + perm_idx_lookup *perm_lookup, + trellis_term_func_t trellis_termination, + decode_step_func_t decode_step) { + // Middle decoder function for either a single block or batch of 8 blocks. + // The inputs (sys_i8 par_i8 itl_i8) to this function are interleaved, + // The output (dst) is returned uninterleaved. 
+ // + // This function does some batch/block specific setup (permute sys into pys, + // handle trellis termination, and alpha/beta init) + // Then it executes the decode loop using decode_step(). + + constexpr uint32_t vec_len = 8; + const uint32_t kv = batched ? k : k / vec_len; + + // permute sys into pys + // perm_idx is < k, so we can do this before handling trellis termination + permute(perm_lookup, sys, pys, kv); + // Unperturb the trellis termination bits. They are transmitted as: + // X0 Z1 X'0 Z'1 Z0 X2 Z'0 X'2 X1 Z2 X'1 Z'2 + // but need to appended to the inputs as: + // X0 X1 X2 + // Z0 Z1 Z2 + // X'0 X'1 X'2 + // Z'0 Z'1 Z'2 + // Order like this so we can copy inplace + if constexpr (batched) { + // sys[k] = sys[k]; + // par[k] = par[k]; + pys[k] = sys[k + 2]; + sys[k + 2] = par[k + 1]; + par[k + 1] = sys[k + 1]; + sys[k + 1] = itl[k]; + itl[k] = par[k + 2]; + par[k + 2] = itl[k + 1]; + itl[k + 1] = sys[k + 3]; + pys[k + 1] = itl[k + 2]; + itl[k + 2] = itl[k + 3]; + pys[k + 2] = par[k + 3]; + } else { + // sys[kv][0] = sys[kv][0]; + // par[kv][0] = par[kv][0]; + pys[kv][0] = sys[kv][2]; + sys[kv][2] = par[kv][1]; + par[kv][1] = sys[kv][1]; + sys[kv][1] = itl[kv][0]; + itl[kv][0] = par[kv][2]; + par[kv][2] = itl[kv][1]; + itl[kv][1] = sys[kv][3]; + pys[kv][1] = itl[kv][2]; + itl[kv][2] = itl[kv][3]; + pys[kv][2] = par[kv][3]; + } + // Prescale l_c to avoid doing it a bunch later + const int16x8_t channel_reliability = vdupq_n_s16((int16_t)l_c / 2); + // Initialize alpha (= zero or min) and beta (from the trellis termination) + int16x8_t min = vdupq_n_s16(std::numeric_limits::min()); + if constexpr (batched) { + alpha[0] = vdupq_n_s16(0); + alpha[1] = min; + alpha[2] = min; + alpha[3] = min; + alpha[4] = min; + alpha[5] = min; + alpha[6] = min; + alpha[7] = min; + } else { + alpha[0] = min; + alpha[0][0] = 0; + } + trellis_termination(sys, par, kv, beta_tail, channel_reliability); + trellis_termination(pys, itl, kv, perm_beta_tail, channel_reliability); 
+ + // DECODE + for (uint32_t num_iter = 0; num_iter < max_iter; ++num_iter) { + decode_step(sys, par, extrinsic, kv, llr, alpha, beta_tail, gamma, + channel_reliability); + + update_extrinsic(kv, llr, extrinsic, sys); + permute(perm_lookup, extrinsic, perm_extrinsic, kv); + + decode_step(pys, itl, perm_extrinsic, kv, perm_llr, alpha, perm_beta_tail, + gamma, channel_reliability); + + update_extrinsic(kv, perm_llr, perm_extrinsic, pys); + unpermute(perm_lookup, perm_extrinsic, extrinsic, kv); + + // CHECK CONVERGENCE + int16_t max_abs_diff = 0; + for (uint32_t i = 0; i < kv; ++i) { + int16_t abs_diff = + vmaxvq_s16(vqabsq_s16(vqsubq_s16(perm_llr[i], prev_perm_llr[i]))); + if (abs_diff > max_abs_diff) { + max_abs_diff = abs_diff; + } + } + // If we've converged, finish decoding + if constexpr (check_convergence) { + if (max_abs_diff == 0) { + break; + } + } + // Store the current "final" LLRs to use in convergence checking next + // iteration + for (uint32_t i = 0; i < kv; i++) { + prev_perm_llr[i] = perm_llr[i]; + } + } + + // Return unpermuted output (use llr as a buffer) + unpermute(perm_lookup, perm_llr, llr, kv); + + if constexpr (batched) { // uninterleave dst (use pys as a buffer) + interleave((int16_t *)llr, vec_len, (int16_t *)pys, k); + } + int16x8_t *llr_buff = batched ? pys : llr; + constexpr uint32_t num_blocks = batched ? 
vec_len : 1; + for (uint32_t b = 0; b < num_blocks; ++b) { + turbo_llrs_to_bits(k, llr_buff + b * k / vec_len, dst + b * k / vec_len); + } + return ARMRAL_SUCCESS; +} + +template +armral_status decode(const int8_t *sys_i8, const int8_t *par_i8, + const int8_t *itl_i8, uint32_t k, uint8_t *dst, + float32_t l_c, uint32_t max_iter, uint32_t num_blocks, + uint16_t *perm_idxs, Allocator &allocator, + trellis_term_func_t trellis_termination_single, + decode_step_func_t decode_step_single, + trellis_term_func_t trellis_termination_batched, + decode_step_func_t decode_step_batched) { + // Outer decoder function for a single block or batch of `num_blocks` blocks. + // The inputs and the output to this function are uninterleaved. + // + // eg. for int8_t elements ki_bj of sys_i8: + // For batched data the vectorization strategy is to have 8 interleaved blocks + // int16x8_t *sys: {{k0_b0, k0_b1, ... k0_b7}, {k1_b0, k1_b1, ...}, ...} + // For single, vectorize within the block: + // int16x8_t *sys: {{k0_b0, k1_b0, ... k7_b0}, {k8_b0, k9_b0, ...}, ...} + // So for the batched case, interleave then decode 8 blocks. (as vec_len = 8) + // For the single case, vectorize the data as is. + + if (!valid_num_bits(k)) { + return ARMRAL_ARGUMENT_ERROR; + } + + constexpr uint32_t vec_len = 8; + // For smaller batch sizes, just run the single decoder num_blocks times. + const bool batched = num_blocks >= vec_len; + // k Adjusted by vec and batch size (no change when v_s = b_s) + const uint32_t kv = batched ? k : k / vec_len; + const uint32_t len = batched ? k + 4 : kv + 1; + const uint32_t beta_tail_len = batched ? 
vec_len : 1; + + auto sys = allocate_uninitialized(allocator, len); + auto par = allocate_uninitialized(allocator, len); + auto pys = allocate_uninitialized(allocator, len); // permuted sys + auto itl = allocate_uninitialized(allocator, len); + auto extrinsic = allocate_zeroed(allocator, kv); + auto perm_extrinsic = allocate_uninitialized(allocator, kv); + // Allocate space for log likelihood ratios from both stages of decoding + auto llr = allocate_uninitialized(allocator, kv); + auto perm_llr = allocate_uninitialized(allocator, kv); + auto prev_perm_llr = allocate_zeroed(allocator, kv); + // Allocate space to hold alpha and gamma + // alpha stores the forward-accumulated state probabilities for each decoded + // bit, where the LTE encoder has 8 states and there are k bits to decode + // plus the starting condition (no alpha for the 3 trellis termination bits) + auto alpha = allocate_uninitialized(allocator, 8 * (kv + 1)); + auto beta_tail = allocate_uninitialized(allocator, beta_tail_len); + auto perm_beta_tail = + allocate_uninitialized(allocator, beta_tail_len); + // gamma stores the conditional state transition probabilities for each of the + // k bits to decode. There are 16 transitions per k but only 4 unique values. + // Use int16_t so decode_step has same signature for batched and single. + auto gamma = allocate_uninitialized(allocator, kv * vec_len * 4); + + // PERM_IDXS + // Get the permutation vector for the input value of k. + // If perm_idxs is uninitialized (==nullptr) then generate indices here. + unique_ptr perm_lookup_unique; + perm_idx_lookup *perm_lookup = nullptr; + // Find the index into the array of parameter arrays corresponding + // to the current k. Subtract 40 because k=40 is the lowest value. + uint32_t param_idx = perm_params_lookup[(k - 40) >> 3]; + if (perm_idxs != nullptr) { + if constexpr (Allocator::is_counting) { // NOTE: All allocations done. 
+ return ARMRAL_SUCCESS; + } + perm_lookup = (perm_idx_lookup *)perm_idxs + perm_lookup_offset[param_idx]; + } else { + perm_lookup_unique = allocate_uninitialized(allocator, k); + if constexpr (Allocator::is_counting) { // NOTE: All allocations done. + return ARMRAL_SUCCESS; + } + perm_lookup = perm_lookup_unique.get(); + // Generate the permutation vector for the input value of k. + k_perm_idx_init(k, param_idx, perm_lookup); + } + + // How many elements in input data? + // This is different than `len` which accounts for vectorizing + const uint32_t dat_len = k + 4; + uint32_t b = 0; // block index + uint32_t dat_offset = 0; + uint32_t dst_offset = 0; + for (; batched && b < num_blocks - vec_len + 1; + b += vec_len, dat_offset += vec_len * dat_len, dst_offset += k) { + // Decode 8 blocks + + if (b > 0) { // Re-zero buffers which should start at 0 + for (uint32_t i = 0; i < k; i++) { + extrinsic[i] = vdupq_n_s16(0); + prev_perm_llr[i] = vdupq_n_s16(0); + } + } + + // Convert type and vectorize, then interleave (use pys as a buffer) + convert_llrs(dat_len * vec_len, sys_i8 + dat_offset, pys.get()); + interleave((int16_t *)pys.get(), dat_len, (int16_t *)sys.get(), vec_len); + + convert_llrs(dat_len * vec_len, par_i8 + dat_offset, pys.get()); + interleave((int16_t *)pys.get(), dat_len, (int16_t *)par.get(), vec_len); + + convert_llrs(dat_len * vec_len, itl_i8 + dat_offset, pys.get()); + interleave((int16_t *)pys.get(), dat_len, (int16_t *)itl.get(), vec_len); + + decode_loop( + sys.get(), pys.get(), par.get(), itl.get(), extrinsic.get(), + perm_extrinsic.get(), llr.get(), perm_llr.get(), prev_perm_llr.get(), + alpha.get(), beta_tail.get(), perm_beta_tail.get(), gamma.get(), k, + dst + dst_offset, l_c, max_iter, perm_lookup, + trellis_termination_batched, decode_step_batched); + } + for (; b < num_blocks; + ++b, dat_offset += dat_len, dst_offset += k / vec_len) { + // Decode 1 block + + if (b > 0) { // Re-zero buffers which should start at 0 + for (uint32_t i = 0; i < 
k / vec_len; i++) { + extrinsic[i] = vdupq_n_s16(0); + prev_perm_llr[i] = vdupq_n_s16(0); + } + } + + // Convert type and vectorize + convert_llrs(dat_len, sys_i8 + dat_offset, sys.get()); + convert_llrs(dat_len, par_i8 + dat_offset, par.get()); + convert_llrs(dat_len, itl_i8 + dat_offset, itl.get()); + + decode_loop( + sys.get(), pys.get(), par.get(), itl.get(), extrinsic.get(), + perm_extrinsic.get(), llr.get(), perm_llr.get(), prev_perm_llr.get(), + alpha.get(), beta_tail.get(), perm_beta_tail.get(), gamma.get(), k, + dst + dst_offset, l_c, max_iter, perm_lookup, + trellis_termination_single, decode_step_single); + } + return ARMRAL_SUCCESS; +} + +} // namespace armral::turbo diff --git a/src/UpperPHY/Turbo/turbo_tables.hpp b/src/UpperPHY/Turbo/turbo_tables.hpp index 6fb5d8d94b7750193a7950242a629b6375e3f1fc..ecd524406fca73e27500219c9f5994486095f30a 100644 --- a/src/UpperPHY/Turbo/turbo_tables.hpp +++ b/src/UpperPHY/Turbo/turbo_tables.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -149,7 +151,8 @@ static constexpr int16_t perm_params_lookup[764] = { // A table to get the offset to a specific K's permutation indices from the all // k indices array created by armral_turbo_perm_idx_init(). 
Usage: -// perm_idx_lookup *k_perm_idxs = all_k_perm_idxs + perm_lookup_offset[perm_params_lookup[k/8-5]] +// perm_idx_lookup *k_perm_idxs = +// all_k_perm_idxs + perm_lookup_offset[perm_params_lookup[k / 8 - 5]] static constexpr uint32_t perm_lookup_offset[188] = { 0, 40, 88, 144, 208, 280, 360, 448, 544, 648, 760, 880, 1008, 1144, 1288, 1440, 1600, 1768, diff --git a/src/intrinsics.h b/src/intrinsics.h index 5e35954996191493154414cb183a42fe4b1e3015..082251336bccd01ae256c952e0066abc870fc82c 100644 --- a/src/intrinsics.h +++ b/src/intrinsics.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/utils/allocators.hpp b/src/utils/allocators.hpp index 9bfa826b83b41c47e060b6a888fb6e38c6a694e4..f2868c8569f19337de0e7b39b7f5251c5cecb98b 100644 --- a/src/utils/allocators.hpp +++ b/src/utils/allocators.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -159,7 +161,8 @@ private: do_thing_with_cmplxs(a, b, c, cmplxs.get()) - // Note: The unique_ptr needs to stay in scope for the raw pointer to be valid! + // Note: The unique_ptr needs to stay in scope for the raw pointer to be + // valid! 
// If you want the address of the n-th item rather than doing: diff --git a/src/utils/bits_to_bytes.hpp b/src/utils/bits_to_bytes.hpp index 2cc811d4928f5ad06f0848c09a28c1cc2074da7c..0be7f213fda66eb34d1a52c188ff1fa3e7288b82 100644 --- a/src/utils/bits_to_bytes.hpp +++ b/src/utils/bits_to_bytes.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/utils/cmplx_arith_f32.hpp b/src/utils/cmplx_arith_f32.hpp index 87022ed2b3b07b13786b938b1223d1a5cba1d737..5f4c88151774cb39e832549c20fcb5f770b58e07 100644 --- a/src/utils/cmplx_arith_f32.hpp +++ b/src/utils/cmplx_arith_f32.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/src/utils/vec_mul.hpp b/src/utils/vec_mul.hpp index b056fd948dfdf9050f54f54dd0d12067867f965a..72ccd728da9f6f56db8eca4537253397b666e354 100644 --- a/src/utils/vec_mul.hpp +++ b/src/utils/vec_mul.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/test/BasicMathFun/MatrixInv/Batch/main.cpp b/test/BasicMathFun/MatrixInv/Batch/main.cpp index ec1ad37f0568cb2ff7fa4bcdbcbfb24870a24816..f7eb0aa46837fce692dc064e43e81f14fc8ae2c5 100644 --- a/test/BasicMathFun/MatrixInv/Batch/main.cpp +++ b/test/BasicMathFun/MatrixInv/Batch/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: 
Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cf32_utils.hpp" diff --git a/test/BasicMathFun/MatrixInv/Single/main.cpp b/test/BasicMathFun/MatrixInv/Single/main.cpp index 88d4becfa84995d831c5eea13f2dfb6b45bc421b..3ac656ac7e4166f2d9f128e8a32578bca9d4571e 100644 --- a/test/BasicMathFun/MatrixInv/Single/main.cpp +++ b/test/BasicMathFun/MatrixInv/Single/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cf32_utils.hpp" diff --git a/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp index 0a17a6299273a9e660e5725ae8bf92a070f13238..3ab49e7824e584196eb1ae65e4e38b9acf8f505d 100644 --- a/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" #include "cs16_utils.hpp" diff --git a/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp index 16f9985e56fc132a816557d6677bc93ab6695990..e615595a77eea6c3ff79e6e667c4ae8902352fd7 100644 --- a/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: 
BSD-3-Clause */ #include "matrix_utils.hpp" diff --git a/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp index 516f0754fce9902159566a6115c375dc252ad901..56532dc5071560fa0a25e8f5c284c043a76ca5bf 100644 --- a/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp index 90a4e482a744c32500d2d47a647a675dbbabcb31..9420597be9ddee49439bfa13b7450cd5c0c0119a 100644 --- a/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "reference_linalg.hpp" #include diff --git a/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp index fd21f7d69b375a4ac2443bc686d0fe1568ab3eb8..ece51f9703ce862ed5cc0cf6344c78f46de9c71f 100644 --- a/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" #include 
"reference_linalg.hpp" diff --git a/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp index 2324d97f120723e6fc0520704d74c3f57c1949b7..77bcc5450b5a96cfb94e42f44c53e9d1a3fa9dab 100644 --- a/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" #include "matrix_utils.hpp" diff --git a/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp index d5348b68341e2fc2102bde9413b0197289e56d55..af802d6f0aea6cadc82a472e3f75b748311914d5 100644 --- a/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include diff --git a/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp index eb024e3d4517983718c9ffdbd6a6ee888bb53145..c92c02a7d37726cfc59948cafa8db8fe1e879ea2 100644 --- a/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "reference_linalg.hpp" diff 
--git a/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp index 1b9e1ab4dcb4d649c5bc8b08b2ab7876d097fff5..c06061347c105e1d609f8e10a3cfee0be9a53ff2 100644 --- a/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp +++ b/test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" #include "reference_linalg.hpp" diff --git a/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp b/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp index f2f39158df92ecb955d6f2715624a39b0a4b8fbb..18a37082031380cf7a61f53b410f26124a7d1549 100644 --- a/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp +++ b/test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2023-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" #include "matrix_utils.hpp" diff --git a/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp index 9f4c4a04b45d41996421e6ba44c38c8f000f88cd..2ee50a0f21bc59fd899f606cf3876227800f954b 100644 --- a/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git 
a/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp index bf33cab2f29ca7aa3129afd0ff6563b2e42d6771..b3d0888e6201bf3ec6d43bacb8615193439356da 100644 --- a/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp index 303faa706a605c1874899d3aeed3d30f607dd345..9a662df7f2cda9237db394201915378554e74665 100644 --- a/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp index 8bfb96cad7b14b9436499656a5297ab94f1ada9a..122bf84cbee5276cdcda0fbdb34f6b2bee7d5133 100644 --- a/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp 
b/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp index e2ed1288d4180a28c6b1e0bd09676c8b4e6b2645..b06453a6ef6ff4264bea8084dca0fc73a4e552d4 100644 --- a/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp b/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp index 35f7be34ae8a5e88a80b73d62072257217465399..a41dc8a13d7a85c6d00b9f47b70b5e56b3986a82 100644 --- a/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp +++ b/test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/test/BasicMathFun/VectorMult/VecMul16/main.cpp b/test/BasicMathFun/VectorMult/VecMul16/main.cpp index 8c31bea7deae1493ab4a8869237c4ba534f6ce71..303738f0c849c1492d38e3177cd281ac152c72d8 100644 --- a/test/BasicMathFun/VectorMult/VecMul16/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "qint64.hpp" diff --git a/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp b/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp index 3ee9745c75c2851ff2a96971a28be41a4efeb6bf..c01953c3276b32750f9f4897cb37f4b00d19fd9e 100644 --- 
a/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul16_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/BasicMathFun/VectorMult/VecMul32/main.cpp b/test/BasicMathFun/VectorMult/VecMul32/main.cpp index 991b6bd0ca31a0a85cbd139da62aa4ed37b0ee65..5976b9b9ea9f50aa8b373be78d4d381bde4d696f 100644 --- a/test/BasicMathFun/VectorMult/VecMul32/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp b/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp index 38ac06e820c683a2d70e29c06d584ac18fa2804d..e114569bccb4352202cf2e0d122bf2ac0f528daa 100644 --- a/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp +++ b/test/BasicMathFun/VectorMult/VecMul32_2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/test/DuRuInterface/MuLaw/Compression/main.cpp b/test/DuRuInterface/MuLaw/Compression/main.cpp index d8daa67bacb9bd392bf77c8ec6ea7714f1338ffc..875360c1c327eace6b6e562aa9f9468a240b1fd8 100644 --- a/test/DuRuInterface/MuLaw/Compression/main.cpp +++ b/test/DuRuInterface/MuLaw/Compression/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 
Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/DuRuInterface/MuLaw/Decompression/main.cpp b/test/DuRuInterface/MuLaw/Decompression/main.cpp index f59c35249eba818687b7915a7b722bf633d8da5b..5f3b464b3a2a3cbc644b03ebada290cabb0bda35 100644 --- a/test/DuRuInterface/MuLaw/Decompression/main.cpp +++ b/test/DuRuInterface/MuLaw/Decompression/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp b/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp index 9888d636d4b61f7832d97ec0767dcf1a178d28fa..37f0f6160989ced42ca226407bbc27f215a095b2 100644 --- a/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp +++ b/test/DuRuInterface/ORanBlockFloat/Compression/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp b/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp index 2e8a1359c1aa8f58776f695c4c9b976e5358a487..b824cca07df340d57f24337a0011c2d6d4604f3a 100644 --- a/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp +++ b/test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm 
Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp b/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp index 6cc3baac97ac990271d06595ce3bdef132fdc62d..eb30ff9997ef46a34efa072a011150242d0ec0bb 100644 --- a/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp +++ b/test/DuRuInterface/ORanBlockScaling/Compression/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp b/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp index 37bf2d138b9f9c1ab47223b307ecbe63ba9cff6a..e4cce7774ff898bb6156072c6fd53ddfc927061b 100644 --- a/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp +++ b/test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/LowerPHY/Correlation/main.cpp b/test/LowerPHY/Correlation/main.cpp index 4dff97e9f8d16e9045fccbed4ea7d26381dd32f3..9fe8238b4e1153e65b1d686de4b693363ec12839 100644 --- a/test/LowerPHY/Correlation/main.cpp +++ b/test/LowerPHY/Correlation/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include 
"qint64.hpp" diff --git a/test/LowerPHY/FFT/FFT16/main.cpp b/test/LowerPHY/FFT/FFT16/main.cpp index 3db2dc3feeb55b9c1a83eb80bd31d2e67bea3790..fe0710570cdf388e29fb0bb87f3919efd6574ed9 100644 --- a/test/LowerPHY/FFT/FFT16/main.cpp +++ b/test/LowerPHY/FFT/FFT16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cf32_utils.hpp" @@ -16,15 +18,16 @@ #define M_PI 3.14159265358979323846 #endif +namespace { + float clamp_neg1_to_1(float x) { float low = -1.0; float high = (float32_t)((1 << 15) - 1) / (1 << 15); return std::max(low, std::min(high, x)); } -static bool check_fft_results(const char *name, - const armral_cmplx_int16_t *result, - const armral_cmplx_f32_t *expected, uint32_t n) { +bool check_fft_results(const char *name, const armral_cmplx_int16_t *result, + const armral_cmplx_f32_t *expected, uint32_t n) { bool passed = true; float max_error = 0; @@ -59,8 +62,8 @@ static bool check_fft_results(const char *name, return passed; } -static std::vector -run_fft_ref(int n, armral_fft_direction_t dir, const armral_cmplx_int16_t *x) { +std::vector run_fft_ref(int n, armral_fft_direction_t dir, + const armral_cmplx_int16_t *x) { std::vector> in(n); std::vector> out(n); for (int i = 0; i < n; i++) { @@ -71,7 +74,17 @@ run_fft_ref(int n, armral_fft_direction_t dir, const armral_cmplx_int16_t *x) { return armral::utils::narrow_to_cf32(out); } -static bool run_fft_test(int n, armral_fft_direction_t dir) { +bool check_status(const armral_status ret_status, const char *message) { + if (ret_status == ARMRAL_ARGUMENT_ERROR) { + // GCOVR_EXCL_START + printf("Error! 
%s\n", message); + return false; + // GCOVR_EXCL_STOP + } + return true; +} + +bool run_fft_test(int n, armral_fft_direction_t dir) { printf("Testing FFT n=%d dir=%d\n", n, (int)dir); constexpr armral_cmplx_int16_t min = {-4096, -4096}; constexpr armral_cmplx_int16_t max = {4095, 4095}; @@ -80,28 +93,42 @@ static bool run_fft_test(int n, armral_fft_direction_t dir) { auto y = random.vector(n, min, max); const auto y_ref = run_fft_ref(n, dir, x.data()); - armral_fft_plan_t *p; - armral_fft_create_plan_cs16(&p, n, dir); - if (p == nullptr) { + armral_fft_plan_t *p = nullptr; + auto plan_status = armral_fft_create_plan_cs16(&p, n, dir); + if (!check_status(plan_status, "Failed to create a plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto execute_status = armral_fft_execute_cs16(p, x.data(), y.data()); + if (!check_status(execute_status, "Failed to execute plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto destroy_status = armral_fft_destroy_plan_cs16(&p); + if (!check_status(destroy_status, "Failed to destroy plan")) { // GCOVR_EXCL_START - printf("Error! 
Failed to create a plan for n=%d\n", n); return false; // GCOVR_EXCL_STOP } - armral_fft_execute_cs16(p, x.data(), y.data()); - armral_fft_destroy_plan_cs16(&p); return check_fft_results("FFT", y.data(), y_ref.data(), n); } +} // Anonymous namespace + int main(int argc, char **argv) { bool passed = true; - constexpr int ns[] = { - 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, - 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 35, - 36, 40, 45, 46, 47, 48, 50, 64, 65, 66, 68, 77, 81, - 98, 99, 102, 112, 136, 121, 169, 170, 204, 238, 255, 272, 289, - 342, 361, 440, 441, 484, 529, 552, 768, 800, 1024, 1104, 2048, 2401}; + constexpr int ns[] = {2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, + 22, 23, 24, 25, 32, 35, 36, 40, 45, 46, + 47, 48, 50, 64, 65, 66, 68, 77, 81, 98, + 99, 102, 112, 136, 121, 169, 170, 204, 238, 255, + 272, 289, 342, 361, 440, 441, 484, 529, 552, 768, + 800, 1024, 1125, 1140, 1170, 1104, 2048, 2401}; for (int n : ns) { for (auto dir : {ARMRAL_FFT_FORWARDS, ARMRAL_FFT_BACKWARDS}) { passed &= run_fft_test(n, dir); diff --git a/test/LowerPHY/FFT/FFT32/main.cpp b/test/LowerPHY/FFT/FFT32/main.cpp index 12ab42733014843b2bd7f7a083831797855947b0..345a4bcbc44b5100f45a39f7f2fa2805920584c6 100644 --- a/test/LowerPHY/FFT/FFT32/main.cpp +++ b/test/LowerPHY/FFT/FFT32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cf32_utils.hpp" @@ -14,9 +16,10 @@ #define M_PI 3.14159265358979323846 #endif -static bool check_fft_results(const char *name, - const armral_cmplx_f32_t *result, - const armral_cmplx_f32_t *expected, uint32_t n) { +namespace { + +bool check_fft_results(const char *name, const armral_cmplx_f32_t *result, + const armral_cmplx_f32_t *expected, uint32_t n) { bool passed = true; float 
max_error = 0; @@ -48,15 +51,25 @@ static bool check_fft_results(const char *name, return passed; } -static std::vector -run_fft_ref(int n, armral_fft_direction_t dir, const armral_cmplx_f32_t *x) { +std::vector run_fft_ref(int n, armral_fft_direction_t dir, + const armral_cmplx_f32_t *x) { std::vector> in = armral::utils::widen_cf32(x, n); std::vector> out(n); armral::utils::fft_ref(n, 1, dir, in.data(), out.data()); return armral::utils::narrow_to_cf32(out); } -static bool run_fft_test(int n, armral_fft_direction_t dir) { +bool check_status(const armral_status ret_status, const char *message) { + if (ret_status == ARMRAL_ARGUMENT_ERROR) { + // GCOVR_EXCL_START + printf("Error! %s\n", message); + return false; + // GCOVR_EXCL_STOP + } + return true; +} + +bool run_fft_test(int n, armral_fft_direction_t dir) { printf("Testing FFT n=%d dir=%d\n", n, (int)dir); armral::utils::cf32_random random; const auto x = random.vector(n); @@ -64,28 +77,42 @@ static bool run_fft_test(int n, armral_fft_direction_t dir) { const auto y_ref = run_fft_ref(n, dir, x.data()); armral_fft_plan_t *p = nullptr; - armral_fft_create_plan_cf32(&p, n, dir); - if (p == nullptr) { + auto plan_status = armral_fft_create_plan_cf32(&p, n, dir); + if (!check_status(plan_status, "Failed to create a plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto execute_status = armral_fft_execute_cf32(p, x.data(), y.data()); + if (!check_status(execute_status, "Failed to execute plan")) { + // GCOVR_EXCL_START + return false; + // GCOVR_EXCL_STOP + } + + auto destroy_status = armral_fft_destroy_plan_cf32(&p); + if (!check_status(destroy_status, "Failed to destroy plan")) { // GCOVR_EXCL_START - printf("Error! 
Failed to create a plan for n=%d\n", n); return false; // GCOVR_EXCL_STOP } - armral_fft_execute_cf32(p, x.data(), y.data()); - armral_fft_destroy_plan_cf32(&p); return check_fft_results("FFT", y.data(), y_ref.data(), n); } +} // Anonymous namespace + int main(int argc, char **argv) { bool passed = true; constexpr int ns[] = { 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, - 32, 62, 106, 142, 202, 206, 226, 274, 314, 326, 394, 484, - 542, 573, 614, 626, 706, 758, 800, 802, 821, 838, 842, 926, - 968, 1024, 1063, 1198, 1202, 1366, 1728, 2013, 2025, 2030, 2128, 2209, - 2401, 2557, 3001, 3226, 3240, 3309, 3482, 3998, 4096}; + 28, 29, 30, 31, 32, 37, 40, 41, 62, 74, 82, 86, + 106, 142, 202, 206, 226, 274, 314, 326, 394, 484, 542, 573, + 614, 626, 706, 758, 800, 802, 821, 838, 842, 926, 968, 1024, + 1063, 1198, 1202, 1366, 1728, 2013, 2025, 2030, 2128, 2209, 2401, 2557, + 3001, 3226, 3240, 3309, 3482, 3998, 4096, 9413}; for (int n : ns) { for (auto dir : {ARMRAL_FFT_FORWARDS, ARMRAL_FFT_BACKWARDS}) { passed &= run_fft_test(n, dir); diff --git a/test/LowerPHY/FIR/FIR16/main.cpp b/test/LowerPHY/FIR/FIR16/main.cpp index 22604a33a9b74f80cc94e69cd9a6712362da569f..99e1bf61c7020320d3eecbd81b749087635bc850 100644 --- a/test/LowerPHY/FIR/FIR16/main.cpp +++ b/test/LowerPHY/FIR/FIR16/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" diff --git a/test/LowerPHY/FIR/FIR16Decimate2/main.cpp b/test/LowerPHY/FIR/FIR16Decimate2/main.cpp index d5588c21ec7b7cd3113841b0ea0bb16ef4c14625..cbd7df60903e393654585b61dedf7999a51361ef 100644 --- a/test/LowerPHY/FIR/FIR16Decimate2/main.cpp +++ b/test/LowerPHY/FIR/FIR16Decimate2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 
2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" diff --git a/test/LowerPHY/FIR/FIR32/main.cpp b/test/LowerPHY/FIR/FIR32/main.cpp index 910b7619c37951a944ff825cc67c4c5e73229da5..9e52ca1f1a35419e3763460f8489ddde7e838d26 100644 --- a/test/LowerPHY/FIR/FIR32/main.cpp +++ b/test/LowerPHY/FIR/FIR32/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/test/LowerPHY/FIR/FIR32Decimate2/main.cpp b/test/LowerPHY/FIR/FIR32Decimate2/main.cpp index 671ec438e17f2639f532e8f6aad3c208c32ecd21..d58e9e6c64c51938629d251db2d6c5c0445936ac 100644 --- a/test/LowerPHY/FIR/FIR32Decimate2/main.cpp +++ b/test/LowerPHY/FIR/FIR32Decimate2/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cf32_utils.hpp" diff --git a/test/LowerPHY/Scrambling/main.cpp b/test/LowerPHY/Scrambling/main.cpp index 39fd597ddc0623c021508e578f33634e8aecbebd..a442f0c32933e9c1295830b6ba4a48fdf97ce4c7 100644 --- a/test/LowerPHY/Scrambling/main.cpp +++ b/test/LowerPHY/Scrambling/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/LowerPHY/SeqGenerator/main.cpp b/test/LowerPHY/SeqGenerator/main.cpp index 
7433915932498ff583c1007281169167e6d94a2f..6d6de2c4c27733830d2f9d404d476be740cf8f7d 100644 --- a/test/LowerPHY/SeqGenerator/main.cpp +++ b/test/LowerPHY/SeqGenerator/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/MatrixFactorizations/SVD/main.cpp b/test/MatrixFactorizations/SVD/main.cpp index 87ae255ac762ab3525d43f3a0e9004a7857d4557..a18e1e2db77946a7cb7da77baaa92d89ab592e1e 100644 --- a/test/MatrixFactorizations/SVD/main.cpp +++ b/test/MatrixFactorizations/SVD/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "reference_linalg.hpp" diff --git a/test/MatrixFactorizations/SVD/svd_sample_data.h b/test/MatrixFactorizations/SVD/svd_sample_data.h index 51accb35df363d1433e1caff8fdfef2a5b4b1cfc..f55d438dc7a6f5df77ecbaa2d235c838c7321d92 100644 --- a/test/MatrixFactorizations/SVD/svd_sample_data.h +++ b/test/MatrixFactorizations/SVD/svd_sample_data.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/test/MatrixFactorizations/SVD/svd_test.hpp b/test/MatrixFactorizations/SVD/svd_test.hpp index d23cef67710861275f94cfe49325ab16990f02a3..43695910304bf7d74ba238908335fbdf0b951ca1 100644 --- a/test/MatrixFactorizations/SVD/svd_test.hpp +++ b/test/MatrixFactorizations/SVD/svd_test.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: 
Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/test/UpperPHY/CRC/main.cpp b/test/UpperPHY/CRC/main.cpp index f61bca7428f43b47f3f11e47a2ea12c8514bf20e..6b88bea4e6e022d00da3e25fdac345721054ef02 100644 --- a/test/UpperPHY/CRC/main.cpp +++ b/test/UpperPHY/CRC/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/UpperPHY/ConvolutionalDecoder/main.cpp b/test/UpperPHY/ConvolutionalDecoder/main.cpp index cd9ca73c242621cd27576d0dadea93b2bf691be2..f1aa2db30084a242391c0dda12420869cb985cd7 100644 --- a/test/UpperPHY/ConvolutionalDecoder/main.cpp +++ b/test/UpperPHY/ConvolutionalDecoder/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/ConvolutionalEncoder/main.cpp b/test/UpperPHY/ConvolutionalEncoder/main.cpp index b9ab8f3e199b9a3da28dbcc5818099e9d1e3fa5d..38f68c3c62d07d4665f0083c6aa30386a34f72da 100644 --- a/test/UpperPHY/ConvolutionalEncoder/main.cpp +++ b/test/UpperPHY/ConvolutionalEncoder/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/Demodulation/main.cpp 
b/test/UpperPHY/Demodulation/main.cpp index c03563fdfa3825bb858ca88508673c1ada49d285..cd7039e66635f53bda2ce68a7b2e9866ed383a47 100644 --- a/test/UpperPHY/Demodulation/main.cpp +++ b/test/UpperPHY/Demodulation/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cs16_utils.hpp" diff --git a/test/UpperPHY/LDPC/Decoding/main.cpp b/test/UpperPHY/LDPC/Decoding/main.cpp index f28b0394b618169abaef668451d7162a997a283f..88bd5815abc767b9931ded04b5348c8a66878fe7 100644 --- a/test/UpperPHY/LDPC/Decoding/main.cpp +++ b/test/UpperPHY/LDPC/Decoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "../ldpc_test_common.hpp" diff --git a/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h b/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h index 4a94f7a4937702dea45e2c39ebab6ffb2b82fe0a..ab376254e51d814dc720c54dda1c28dc341076d1 100644 --- a/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h +++ b/test/UpperPHY/LDPC/Encoding/ldpc_encoding_test_data.h @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/test/UpperPHY/LDPC/Encoding/main.cpp b/test/UpperPHY/LDPC/Encoding/main.cpp index 4b7732a52d52aac89120f32d0555cc8c844564cb..d3767f4ff6aa9e13d11ccb8be0d996333c3d4691 100644 --- a/test/UpperPHY/LDPC/Encoding/main.cpp +++ b/test/UpperPHY/LDPC/Encoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library 
- SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "../ldpc_test_common.hpp" #include "armral.h" diff --git a/test/UpperPHY/LDPC/RateMatching/main.cpp b/test/UpperPHY/LDPC/RateMatching/main.cpp index 647955b62856098e049201363cc7b5fef149e848..9b93620a17035a4247501c41766cc382329f115b 100644 --- a/test/UpperPHY/LDPC/RateMatching/main.cpp +++ b/test/UpperPHY/LDPC/RateMatching/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/LDPC/RateRecovery/main.cpp b/test/UpperPHY/LDPC/RateRecovery/main.cpp index 907115f24ffaffcba5ab607fd5341bd3c6307f9a..cfff29147de30c58d7ac31424e391da88958bafc 100644 --- a/test/UpperPHY/LDPC/RateRecovery/main.cpp +++ b/test/UpperPHY/LDPC/RateRecovery/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" @@ -202,7 +204,7 @@ void armral_ref_rate_recovery(armral_ldpc_graph_t bg, uint32_t z, uint32_t e, bool test_ref_rate_recovery() { bool passed = true; - // Test ring behaviour of selection process. + // Test ring behavior of selection process. 
uint32_t e = 100; uint32_t n = 100; uint32_t k0 = 16; diff --git a/test/UpperPHY/LDPC/ldpc_test_common.hpp b/test/UpperPHY/LDPC/ldpc_test_common.hpp index 2b8d4a97c986db42abcc4b13cd5700331f4cba4a..358751866fa4f92fecbbaabe0715720e001b3a7f 100644 --- a/test/UpperPHY/LDPC/ldpc_test_common.hpp +++ b/test/UpperPHY/LDPC/ldpc_test_common.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/test/UpperPHY/Modulation/main.cpp b/test/UpperPHY/Modulation/main.cpp index 528205b8992b58fa055bbf0c8e6e395bb6c3529c..cd9c43a696eda061045db6c74c4721b6c2dac478 100644 --- a/test/UpperPHY/Modulation/main.cpp +++ b/test/UpperPHY/Modulation/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "cs16_utils.hpp" @@ -35,8 +37,10 @@ static armral_cmplx_int16_t mapping_qpsk(const std::vector &values, static void modulation_ref_qpsk(const uint32_t nbits, const int8_t *p_src, armral_cmplx_int16_t *p_dst) { // We assume that pSrc is a byte stream, where the most significant bits of - // the byte are in the lowest positions Output is in Q2.13 format Get the - // constellation points in Q2.13 format + // the byte are in the lowest positions + // Output is in Q2.13 format + + // Get the constellation points in Q2.13 format constexpr uint16_t sqrt_2 = 0x16A1; auto constellation = gen_qam_constellation(mapping_qpsk, {sqrt_2}, 4); @@ -98,7 +102,7 @@ static void modulation_ref_16qam(const uint32_t nbits, const int8_t *p_src, // Loop over full bytes for (uint32_t i = 0; i < num_bytes; ++i) { uint8_t curr_byte = p_src[i]; - // Star with the 
left-most four bits + // Start with the left-most four bits uint8_t ind = curr_byte >> 4; p_dst[i * 2] = constellation[ind]; ind = curr_byte & 0xF; @@ -116,7 +120,9 @@ static void modulation_ref_16qam(const uint32_t nbits, const int8_t *p_src, static armral_cmplx_int16_t mapping_64qam(const std::vector &values, uint8_t ind) { // The mapping can be found in the latest document at - // https://www.3gpp.org/ftp/Specs/archive/38_series/38.211/ Get the sign bits + // https://www.3gpp.org/ftp/Specs/archive/38_series/38.211/ + + // Get the sign bits int16_t sign_real = 1 - 2 * (ind >> 5); int16_t sign_im = 1 - 2 * ((ind >> 4) & 0x1); auto ind_func = [](uint8_t left_bit, uint8_t right_bit) { @@ -187,7 +193,9 @@ static void modulation_ref_64qam(const uint32_t nbits, const int8_t *p_src, static armral_cmplx_int16_t mapping_256qam(const std::vector &values, uint8_t ind) { // The mapping can be found in the latest document at - // https://www.3gpp.org/ftp/Specs/archive/38_series/38.211/ Get the sign bits + // https://www.3gpp.org/ftp/Specs/archive/38_series/38.211/ + + // Get the sign bits int16_t sign_real = 1 - 2 * (ind >> 7); int16_t sign_im = 1 - 2 * ((ind >> 6) & 0x1); auto ind_func = [](uint8_t bit_l, uint8_t bit_m, uint8_t bit_r) { diff --git a/test/UpperPHY/Polar/CrcAttachment/main.cpp b/test/UpperPHY/Polar/CrcAttachment/main.cpp index d0fa2e274f35768a985d028f0eba7bb7e87dd1ae..fc0cdc9937505367dcd356c7943a8d3eb8d911f5 100644 --- a/test/UpperPHY/Polar/CrcAttachment/main.cpp +++ b/test/UpperPHY/Polar/CrcAttachment/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "int_utils.hpp" #include "polar_crc_attach_data.hpp" diff --git a/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp b/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp index 
555f4af61e61948ae40b2f804ac5da10c09b352c..0cfa65a334f5fddc793d83dedae414ba84071e59 100644 --- a/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp +++ b/test/UpperPHY/Polar/CrcAttachment/polar_crc_attach_data.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/test/UpperPHY/Polar/Decoding/main.cpp b/test/UpperPHY/Polar/Decoding/main.cpp index bf70339494b7e78926d8bcf341aa87025eda5f91..a91fdfa13009015f035bf2b692011c9c03fa99e3 100644 --- a/test/UpperPHY/Polar/Decoding/main.cpp +++ b/test/UpperPHY/Polar/Decoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/UpperPHY/Polar/Encoding/main.cpp b/test/UpperPHY/Polar/Encoding/main.cpp index 5d7a55c3afc5cef50a8b3877d278ae7cb76431d4..3c4a28bfd89e4f46fab2db491344655d952af507 100644 --- a/test/UpperPHY/Polar/Encoding/main.cpp +++ b/test/UpperPHY/Polar/Encoding/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "cs16_utils.hpp" #include "int_utils.hpp" diff --git a/test/UpperPHY/Polar/Frozen/main.cpp b/test/UpperPHY/Polar/Frozen/main.cpp index f1c1ac639bdfb08e48fd96c30799bb4fcb5f36da..e42651f2b1bf8e20a13857196d5c464b89443a17 100644 --- a/test/UpperPHY/Polar/Frozen/main.cpp +++ b/test/UpperPHY/Polar/Frozen/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 
2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/Polar/RateMatching/main.cpp b/test/UpperPHY/Polar/RateMatching/main.cpp index 27962fd259e033c7989a14e6bd9806dbccb34e15..8f7f4a559f107a31cff345fc8c3900a1ee41d34a 100644 --- a/test/UpperPHY/Polar/RateMatching/main.cpp +++ b/test/UpperPHY/Polar/RateMatching/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/Polar/RateRecovery/main.cpp b/test/UpperPHY/Polar/RateRecovery/main.cpp index 8f7cd7593ba8db5731ab5db691084cb8e32aa3bf..2e40fb8b7d56b766ede92c99179fc71572d24fea 100644 --- a/test/UpperPHY/Polar/RateRecovery/main.cpp +++ b/test/UpperPHY/Polar/RateRecovery/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp b/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp index b5c2c04448587392999f84ccf3d0e9e18cc85010..504db3aa5f5798751979f999fa5f2a302bc57946 100644 --- a/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp +++ b/test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" 
#include "int_utils.hpp" diff --git a/test/UpperPHY/Polar/SubchannelInterleave/main.cpp b/test/UpperPHY/Polar/SubchannelInterleave/main.cpp index 4938c2d20c75ec72961eb1b081057d466eb27cfb..0ea9061b95170b4ce93caa786fdc5a934efec1fa 100644 --- a/test/UpperPHY/Polar/SubchannelInterleave/main.cpp +++ b/test/UpperPHY/Polar/SubchannelInterleave/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/Turbo/Batch/Decoding/main.cpp b/test/UpperPHY/Turbo/Batch/Decoding/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8d63663e6374f5120290917ee0593850f19bc7f1 --- /dev/null +++ b/test/UpperPHY/Turbo/Batch/Decoding/main.cpp @@ -0,0 +1,39 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "../../turbo_decode_test_utils.hpp" + +int main(int argc, char **argv) { + + constexpr uint32_t num_blocks_arr[3] = {7, 8, 18}; + + bool passed = true; + for (auto num_blocks : num_blocks_arr) { + // There is a signature difference for batched vs single + // so create lambda wrappers to handle this difference + auto decode_batch_alloc = [&](const int8_t *sys, const int8_t *par, + const int8_t *itl, uint32_t passed_k, + uint8_t *dst, uint32_t max_iter, + uint16_t *perm_idxs) { + return armral_turbo_decode_batch(num_blocks, sys, par, itl, passed_k, dst, + max_iter, perm_idxs); + }; + auto decode_batch_noalloc = [&](const int8_t *sys, const int8_t *par, + const int8_t *itl, uint32_t passed_k, + uint8_t *dst, uint32_t max_iter, + uint16_t *perm_idxs, void *buffer) { + return armral_turbo_decode_batch_noalloc(num_blocks, sys, par, itl, + passed_k, dst, max_iter, + perm_idxs, 
buffer); + }; + passed &= run_all_turbo_decoding_tests( + num_blocks, decode_batch_alloc, decode_batch_noalloc, + armral_turbo_decode_batch_noalloc_buffer_size, "TurboDecodingBatch", + "TurboDecodingBatchNoPermIdxs", "TurboDecodingBatchNoAlloc", + "TurboDecodingBatchNoAllocNoPermIdxs"); + } + exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/test/UpperPHY/Turbo/Decoding/main.cpp b/test/UpperPHY/Turbo/Decoding/main.cpp deleted file mode 100644 index ece895d04d3f1d3bae7a5e355c4062d5fb97ffb8..0000000000000000000000000000000000000000 --- a/test/UpperPHY/Turbo/Decoding/main.cpp +++ /dev/null @@ -1,145 +0,0 @@ -/* - Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates -*/ -#include "armral.h" - -#include "../turbo_test_data.hpp" - -#include -#include -#include - -// Check that the decoder returns the expected error code when -// passed an invalid value of k. We can safely pass uninitialized -// memory to the routine as the parameter test is the first thing -// it does and it will return immediately when k is invalid. -static bool run_turbo_decoding_parameter_test() { - return armral_turbo_decode_block(NULL, NULL, NULL, 1040, NULL, 0, NULL) == - ARMRAL_ARGUMENT_ERROR; -} - -// Check that the decoder returns the original -// unencoded input for valid values of k. 
-template -static bool -run_turbo_decoding_test(char const *name, uint32_t k, uint16_t *perm_idxs, - TurboDecodeFunction turbo_decode_under_test) { - auto k_bytes = k >> 3; - - std::vector src(k_bytes + 1, 255); - std::vector sys(k_bytes + 1, 255); - std::vector par(k_bytes + 1, 255); - std::vector itl(k_bytes + 1, 255); - std::vector ans(k_bytes, 255); - - // Generate the input test data - generate_turbo_test_data(src.data(), k); - - // Encode the test data - armral_status ret = armral_turbo_encode_block(src.data(), k, sys.data(), - par.data(), itl.data()); - - // Run modulation on the three output vectors - armral_modulation_type mod_type = ARMRAL_MOD_16QAM; - auto encoded_len = k + 4; - auto mod_num_symbols = encoded_len / 4; - std::vector sys_mod(mod_num_symbols); - std::vector par_mod(mod_num_symbols); - std::vector itl_mod(mod_num_symbols); - armral_modulation(encoded_len, mod_type, sys.data(), sys_mod.data()); - armral_modulation(encoded_len, mod_type, par.data(), par_mod.data()); - armral_modulation(encoded_len, mod_type, itl.data(), itl_mod.data()); - - // Now demodulate to get LLRs - // The value of ulp shouldn't matter for a noiseless channel, but a small - // ulp helps test that overflow in the llrs is handled correctly. - uint16_t ulp = 11; - std::vector sys_demod_soft(mod_num_symbols * 4); - std::vector par_demod_soft(mod_num_symbols * 4); - std::vector itl_demod_soft(mod_num_symbols * 4); - armral_demodulation(mod_num_symbols, ulp, mod_type, sys_mod.data(), - sys_demod_soft.data()); - armral_demodulation(mod_num_symbols, ulp, mod_type, par_mod.data(), - par_demod_soft.data()); - armral_demodulation(mod_num_symbols, ulp, mod_type, itl_mod.data(), - itl_demod_soft.data()); - - // Decode the encoded data. We set the maximum number of decoder iterations to - // 5, which in the absence of noise should always be more than enough. 
- ret = turbo_decode_under_test(sys_demod_soft.data(), par_demod_soft.data(), - itl_demod_soft.data(), k, ans.data(), 5, - perm_idxs); - - bool passed = true; - - if (ret != ARMRAL_SUCCESS) { - // GCOVR_EXCL_START - printf("Error! [%s_%u] did not return ARMRAL_SUCCESS\n", name, k); - passed = false; - // GCOVR_EXCL_STOP - } - - // Check the decoded data matches the original. - // We are checking uint8_ts here not individual bits. The only valid values - // for k in the spec are multiples of 8 so no need to worry about remainders. - for (uint32_t i = 0; i < k_bytes; i++) { - if (ans[i] != src[i]) { - // GCOVR_EXCL_START - printf("Error! [%s_%u] result[%u] = 0x%x and expected[%u] = 0x%x\n", name, - k, i, ans[i], i, src[i]); - passed = false; - // GCOVR_EXCL_STOP - } - } - if (passed) { - printf("[%s_%u] - check result: OK\n", name, k); - } - return passed; -} - -int main(int argc, char **argv) { - bool passed = true; - - // Check invalid k is detected - passed &= run_turbo_decoding_parameter_test(); - - // Initialize the buffer for the perm_idxs - uint32_t buff_size = 0; - for (auto k : valid_ks) { - buff_size += k; - } - buff_size *= 3; // perm_idx, vec_idx, and vec_lane - std::vector perm_idxs_buff(buff_size); - armral_turbo_perm_idx_init(perm_idxs_buff.data()); - - // Check decoder decodes correctly - for (auto k : valid_ks) { - passed &= run_turbo_decoding_test("TurboDecoding", k, perm_idxs_buff.data(), - armral_turbo_decode_block); - } - for (auto k : valid_ks) { - passed &= run_turbo_decoding_test("TurboDecodingNoPermIdxs", k, nullptr, - armral_turbo_decode_block); - } - - auto no_alloc_test = [](const int8_t *sys, const int8_t *par, - const int8_t *itl, uint32_t passed_k, uint8_t *dst, - uint32_t max_iter, uint16_t *perm_idxs) { - auto buffer_size = - armral_turbo_decode_block_noalloc_buffer_size(passed_k, max_iter); - std::vector buffer(buffer_size); - return armral_turbo_decode_block_noalloc( - sys, par, itl, passed_k, dst, max_iter, perm_idxs, 
buffer.data()); - }; - for (auto k : valid_ks) { - passed &= run_turbo_decoding_test("TurboDecodingNoAlloc", k, - perm_idxs_buff.data(), no_alloc_test); - } - for (auto k : valid_ks) { - passed &= run_turbo_decoding_test("TurboDecodingNoAllocNoPermIdxs", k, - nullptr, no_alloc_test); - } - - exit(passed ? EXIT_SUCCESS : EXIT_FAILURE); -} diff --git a/test/UpperPHY/Turbo/PermIndices/main.cpp b/test/UpperPHY/Turbo/PermIndices/main.cpp index 216be7fd4104d6ba17192a442f875c0823f4ef59..6c13f392d479ec81b8c7b21b0f9af03841ef4c3c 100644 --- a/test/UpperPHY/Turbo/PermIndices/main.cpp +++ b/test/UpperPHY/Turbo/PermIndices/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" diff --git a/test/UpperPHY/Turbo/Single/Decoding/main.cpp b/test/UpperPHY/Turbo/Single/Decoding/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..614d34bd0b979df8a9477fc81ff0520bdb496912 --- /dev/null +++ b/test/UpperPHY/Turbo/Single/Decoding/main.cpp @@ -0,0 +1,16 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "../../turbo_decode_test_utils.hpp" + +int main(int argc, char **argv) { + bool passed = run_all_turbo_decoding_tests( + 1, armral_turbo_decode_block, armral_turbo_decode_block_noalloc, + armral_turbo_decode_block_noalloc_buffer_size, "TurboDecoding", + "TurboDecodingNoPermIdxs", "TurboDecodingNoAlloc", + "TurboDecodingNoAllocNoPermIdxs"); + exit(passed ? 
EXIT_SUCCESS : EXIT_FAILURE); +} diff --git a/test/UpperPHY/Turbo/Encoding/main.cpp b/test/UpperPHY/Turbo/Single/Encoding/main.cpp similarity index 94% rename from test/UpperPHY/Turbo/Encoding/main.cpp rename to test/UpperPHY/Turbo/Single/Encoding/main.cpp index a5b109f048bb4d5d8abd02b6c5d758dbaf6dccf1..38cb0982fb259fb7f66e55830fd84b7c5fb1aada 100644 --- a/test/UpperPHY/Turbo/Encoding/main.cpp +++ b/test/UpperPHY/Turbo/Single/Encoding/main.cpp @@ -1,10 +1,12 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" -#include "../turbo_test_data.hpp" +#include "../../turbo_test_data.hpp" #include "reference_turbo_encoder.hpp" #include diff --git a/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp b/test/UpperPHY/Turbo/Single/Encoding/reference_turbo_encoder.hpp similarity index 97% rename from test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp rename to test/UpperPHY/Turbo/Single/Encoding/reference_turbo_encoder.hpp index d38c6c7b7444157eab442cd21bea4ec766178427..94438d43d899947b5258c1d1f8a19dbee7f67d90 100644 --- a/test/UpperPHY/Turbo/Encoding/reference_turbo_encoder.hpp +++ b/test/UpperPHY/Turbo/Single/Encoding/reference_turbo_encoder.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/test/UpperPHY/Turbo/RateMatching/main.cpp b/test/UpperPHY/Turbo/Single/RateMatching/main.cpp similarity index 98% rename from test/UpperPHY/Turbo/RateMatching/main.cpp rename to test/UpperPHY/Turbo/Single/RateMatching/main.cpp index 44c66e842daddf1616996dae5482239f683e7cbe..051d6009a137b6ca2ed9160356c07c8899e1a1d3 100644 --- 
a/test/UpperPHY/Turbo/RateMatching/main.cpp +++ b/test/UpperPHY/Turbo/Single/RateMatching/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/Turbo/RateRecovery/main.cpp b/test/UpperPHY/Turbo/Single/RateRecovery/main.cpp similarity index 91% rename from test/UpperPHY/Turbo/RateRecovery/main.cpp rename to test/UpperPHY/Turbo/Single/RateRecovery/main.cpp index b9636de19eef22a4e1a1683e1b1dafd675a87db6..52205b593c5f27691ce6666e2187ad4e2ff2c8a2 100644 --- a/test/UpperPHY/Turbo/RateRecovery/main.cpp +++ b/test/UpperPHY/Turbo/Single/RateRecovery/main.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include "armral.h" #include "int_utils.hpp" diff --git a/test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp b/test/UpperPHY/Turbo/Single/RateRecovery/rate_recovery_data.hpp similarity index 98% rename from test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp rename to test/UpperPHY/Turbo/Single/RateRecovery/rate_recovery_data.hpp index 361289e217e2d35d9b29421f786fd7f05f69d7e3..a02b18f1a5693378ab3b2b3dc9a61d3b2cfff5c1 100644 --- a/test/UpperPHY/Turbo/RateRecovery/rate_recovery_data.hpp +++ b/test/UpperPHY/Turbo/Single/RateRecovery/rate_recovery_data.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/test/UpperPHY/Turbo/turbo_decode_test_utils.hpp 
b/test/UpperPHY/Turbo/turbo_decode_test_utils.hpp new file mode 100644 index 0000000000000000000000000000000000000000..19020be65a15c7b78a8eead5f94200b80d16c6a7 --- /dev/null +++ b/test/UpperPHY/Turbo/turbo_decode_test_utils.hpp @@ -0,0 +1,176 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause +*/ +#include "armral.h" +#include "turbo_test_data.hpp" + +#include + +template +static inline bool +run_turbo_decoding_parameter_test(TurboDecodeFunction turbo_decode_under_test) { + return turbo_decode_under_test(NULL, NULL, NULL, 1040, NULL, 0, NULL) == + ARMRAL_ARGUMENT_ERROR; +} + +template +void interleave(T *src, uint32_t ldsrc, T *dst, uint32_t lddst) { + for (uint32_t a = 0; a < ldsrc; ++a) { + for (uint32_t b = 0; b < lddst; ++b) { + dst[a * lddst + b] = src[b * ldsrc + a]; + } + } +} + +static inline void setup_block_data(uint32_t k, uint32_t b, uint8_t *src, + int8_t *sys, int8_t *par, int8_t *itl) { + auto k_bytes = k / 8; + + std::vector sys_encode(k_bytes + 1, 255); + std::vector par_encode(k_bytes + 1, 255); + std::vector itl_encode(k_bytes + 1, 255); + + // Generate the input test data + generate_turbo_test_data(src, k, b); + + // Encode the test data + armral_turbo_encode_block(src, k, sys_encode.data(), par_encode.data(), + itl_encode.data()); + + // Run modulation on the three output vectors + armral_modulation_type mod_type = ARMRAL_MOD_16QAM; + auto encoded_len = k + 4; + auto mod_num_symbols = encoded_len / 4; + std::vector sys_mod(mod_num_symbols); + std::vector par_mod(mod_num_symbols); + std::vector itl_mod(mod_num_symbols); + armral_modulation(encoded_len, mod_type, sys_encode.data(), sys_mod.data()); + armral_modulation(encoded_len, mod_type, par_encode.data(), par_mod.data()); + armral_modulation(encoded_len, mod_type, itl_encode.data(), itl_mod.data()); + + // Now demodulate to get LLRs + // The value of ulp shouldn't matter for a 
noiseless channel, but a small + // ulp helps test that overflow in the llrs is handled correctly. + uint16_t ulp = 11; + armral_demodulation(mod_num_symbols, ulp, mod_type, sys_mod.data(), sys); + armral_demodulation(mod_num_symbols, ulp, mod_type, par_mod.data(), par); + armral_demodulation(mod_num_symbols, ulp, mod_type, itl_mod.data(), itl); +} + +// Check that the decoder returns the original +// unencoded input for valid values of k. +template +static bool +run_one_turbo_decoding_test(uint32_t num_blocks, char const *name, uint32_t k, + uint16_t *perm_idxs, + TurboDecodeFunction turbo_decode_under_test) { + + uint32_t len = k + 4; + auto k_bytes = k >> 3; + + std::vector src(num_blocks * (k_bytes + 1)); + std::vector ans(num_blocks * k_bytes); + std::vector sys(num_blocks * len); + std::vector par(num_blocks * len); + std::vector itl(num_blocks * len); + + for (uint32_t b = 0; b < num_blocks; ++b) { + setup_block_data(k, b, &src[b * (k_bytes + 1)], &sys[b * len], + &par[b * len], &itl[b * len]); + } + + // Decode the encoded data. We set the maximum number of decoder iterations to + // 6, without noise this should be enough. + armral_status ret = turbo_decode_under_test( + sys.data(), par.data(), itl.data(), k, ans.data(), 6, perm_idxs); + + bool passed = true; + if (ret != ARMRAL_SUCCESS) { + // GCOVR_EXCL_START + printf("Error! [%s_BatchSize-%u_k-%u] did not return ARMRAL_SUCCESS\n", + name, num_blocks, k); + passed = false; + // GCOVR_EXCL_STOP + } + + // Check the decoded data matches the original. + // We are checking uint8_ts here not individual bits. The only valid values + // for k in the spec are multiples of 8 so no need to worry about remainders. + for (uint32_t b = 0; b < num_blocks; ++b) { + for (uint32_t i = 0; i < k_bytes; ++i) { + if (ans[b * k_bytes + i] != src[b * (k_bytes + 1) + i]) { + // GCOVR_EXCL_START + printf( + "Error! [%s_BatchSize-%u_k-%u] result[b_i=%u][k8_i=%u] = 0x%x and " + "expected[b_i=%u][k8_i=%u] = 0x%x. 
Diff = 0x%x\n", + name, num_blocks, k, b, i, ans[b * k_bytes + i], b, i, + src[b * (k_bytes + 1) + i], + std::abs(ans[b * k_bytes + i] - src[b * (k_bytes + 1) + i])); + passed = false; + // GCOVR_EXCL_STOP + } + } + } + if (passed) { + printf("[%s_BatchSize-%u_k-%u] - check result: OK\n", name, num_blocks, k); + } + return passed; +} + +template +static bool +run_all_turbo_decoding_tests(uint32_t num_blocks, TurboDecodeAlloc alloc_fn, + TurboDecodeNoAlloc noalloc_fn, + TurboBuffSize buff_size_fn, char const *base, + char const *no_perm_idxs, char const *no_alloc, + char const *no_alloc_no_perm_idxs) { + bool passed = true; + + // Check invalid k is detected + passed &= run_turbo_decoding_parameter_test(alloc_fn); + + // init the buffer for the perm_idxs + uint32_t perm_buff_size = 0; + for (auto k : valid_ks) { + perm_buff_size += k; + } + perm_buff_size *= 3; // perm_idx, vec_idx, and vec_lane + std::vector perm_idxs_buff(perm_buff_size); + armral_turbo_perm_idx_init(perm_idxs_buff.data()); + + // Check decoder decodes correctly + // Allocating + for (auto k : valid_ks) { + passed &= run_one_turbo_decoding_test(num_blocks, base, k, + perm_idxs_buff.data(), alloc_fn); + } + for (auto k : valid_ks) { + passed &= run_one_turbo_decoding_test(num_blocks, no_perm_idxs, k, nullptr, + alloc_fn); + } + + // No allocating + auto no_alloc_test = [&](const int8_t *sys, const int8_t *par, + const int8_t *itl, uint32_t passed_k, uint8_t *dst, + uint32_t max_iter, uint16_t *perm_idxs) { + auto buffer_size = buff_size_fn(passed_k); + std::vector buffer(buffer_size); + return noalloc_fn(sys, par, itl, passed_k, dst, max_iter, perm_idxs, + buffer.data()); + }; + for (auto k : valid_ks) { + passed &= run_one_turbo_decoding_test(num_blocks, no_alloc, k, + perm_idxs_buff.data(), no_alloc_test); + } + + for (auto k : valid_ks) { + passed &= run_one_turbo_decoding_test(num_blocks, no_alloc_no_perm_idxs, k, + nullptr, no_alloc_test); + } + + return passed; +} diff --git 
a/test/UpperPHY/Turbo/turbo_test_data.hpp b/test/UpperPHY/Turbo/turbo_test_data.hpp index cd1697df973018f684dd81fff2ffa47b66f2d0d3..a02bba718c40fc0ab88e26e48baad04474c49b47 100644 --- a/test/UpperPHY/Turbo/turbo_test_data.hpp +++ b/test/UpperPHY/Turbo/turbo_test_data.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -8,9 +10,10 @@ #include -static inline void generate_turbo_test_data(uint8_t *src, uint32_t k) { +static inline void generate_turbo_test_data(uint8_t *src, uint32_t k, + uint32_t b = 0) { static armral::utils::linear_congruential_generator lcg; - auto state = armral::utils::random_state::from_seeds({k}); + auto state = armral::utils::random_state::from_seeds({k + b}); // k is always divisible by 8 uint32_t k_bytes = k >> 3; @@ -117,4 +120,4 @@ static const std::map> perm_params = { {5504, {21, 86}}, {5568, {43, 174}}, {5632, {45, 176}}, {5696, {45, 178}}, {5760, {161, 120}}, {5824, {89, 182}}, {5888, {323, 184}}, {5952, {47, 186}}, {6016, {23, 94}}, - {6080, {47, 190}}, {6144, {263, 480}}}; \ No newline at end of file + {6080, {47, 190}}, {6144, {263, 480}}}; diff --git a/utils/cf32_utils.hpp b/utils/cf32_utils.hpp index c015e2497c649168ed8ca6f93e46ac4c49df1187..9b9f57c01e28d05c3ce16ac982de9170833236ae 100644 --- a/utils/cf32_utils.hpp +++ b/utils/cf32_utils.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/utils/cs16_utils.hpp b/utils/cs16_utils.hpp index 9463e7d6a4ec74b9d46b50d7c36daf010f860907..46571ca34d17ab7d223681abe5b21cbf10507027 100644 --- a/utils/cs16_utils.hpp +++ b/utils/cs16_utils.hpp @@ -1,6 
+1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/utils/fft_utils.hpp b/utils/fft_utils.hpp index 494ea902838d835714b2f56ea0952572dcf4906b..ce1a0538538ec3b0ba4d7417268af3296d2a8a59 100644 --- a/utils/fft_utils.hpp +++ b/utils/fft_utils.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/utils/int_utils.hpp b/utils/int_utils.hpp index 6d65eae5d38827cb6588d8861cfa8414aaf01509..660f4e035513b76df06fdb3572d3264ec3cede94 100644 --- a/utils/int_utils.hpp +++ b/utils/int_utils.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/utils/matrix_utils.hpp b/utils/matrix_utils.hpp index 8a27b84c06b8c749700b53e0e588a2dbb50fd399..6af70996843ffeab689686444e83e8c53a537bc8 100644 --- a/utils/matrix_utils.hpp +++ b/utils/matrix_utils.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/utils/qint64.hpp b/utils/qint64.hpp index 856899396dfccb445bc39115bbc85494813b9660..39246e3575789d1fd78c4cd355fac3a8fe570693 100644 --- a/utils/qint64.hpp +++ b/utils/qint64.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + 
SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/utils/reference_linalg.hpp b/utils/reference_linalg.hpp index d01d1bf9c10fe8b221a11e70188796a8b122891c..f17fad18a1c813479bb7185e0b12833884050207 100644 --- a/utils/reference_linalg.hpp +++ b/utils/reference_linalg.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once diff --git a/utils/rng.cpp b/utils/rng.cpp index 33887ee326941cf279626148157881ba33c971c1..23e03edaf64b785ca024bd3690cca74cbb547de1 100644 --- a/utils/rng.cpp +++ b/utils/rng.cpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #include diff --git a/utils/rng.hpp b/utils/rng.hpp index 0e21bde9a039eb260dfaa387c9ead5f647911f39..d366a8ed7840558b5529c3bae3eedc680da3b0a3 100644 --- a/utils/rng.hpp +++ b/utils/rng.hpp @@ -1,6 +1,8 @@ /* Arm RAN Acceleration Library - SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + SPDX-FileCopyrightText: Copyright 2020-2025 Arm Limited and/or its + affiliates + SPDX-License-Identifier: BSD-3-Clause */ #pragma once @@ -101,7 +103,8 @@ struct random_state { static random_state from_seeds(std::initializer_list seeds); }; -// An abstract base class from which other stateful RNG helper classes are defined +// An abstract base class from which other stateful RNG helper classes are +// defined template class base_random { public: